In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding, Flatten, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import RMSprop

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.callbacks import EarlyStopping

In [10]:
full_df = pd.read_csv('data/kokil dec 6 reprepare/conf_good_agg_withpc.csv')

In [11]:
full_df.head()

Unnamed: 0,HITId,Input.sentence_id,Input.convo_id,Input.train_test_val,Input.msg_id,Input.timestamp,Input.full_text,Input.speaker,Input.reply_to,Input.speaker_intention,...,Answer.3rapport.yes_pc_agree,Answer.4shareinformation.yes_pc_agree,Answer.1gamemove.yes_label,Answer.2reasoning.yes_label,Answer.3a_apologies.yes_label,Answer.3a_compliment.yes_label,Answer.3a_personalthoughts.yes_label,Answer.3a_reassurance.yes_label,Answer.3rapport.yes_label,Answer.4shareinformation.yes_label
0,301KG0KX9CLR06T6MC6UVPAHBC92HU,22056,Game7-turkey-austria,Train,Game7-turkey-austria-9,197,Im moving my fleet to Alb not for Greece but f...,austria-Game7,Game7-turkey-austria-8,Truth,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,301KG0KX9CLR06T6MC6UVPAHBCAH2A,6906,Game11-austria-italy,Validation,Game11-austria-italy-5,45,"And yes I would like peace on our front, I cou...",austria-Game11,Game11-austria-italy-4,Truth,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,301KG0KX9CLR06T6MC6UVPAHBCC2HX,3066,Game1-england-germany,Train,Game1-england-germany-271,1468,"okay...well, as the person who has ever seen a...",germany-Game1,Game1-england-germany-270,Truth,...,1.0,0.666667,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,301KG0KX9CLR06T6MC6UVPAHBCCH2C,24093,Game9-italy-germany,Train,Game9-italy-germany-70,1460,I think the best thing we can do to keep the a...,germany-Game9,Game9-italy-germany-69,Truth,...,0.75,0.75,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,301KG0KX9CLR06T6MC6UVPAHBCD2HY,1591,Game1-england-italy,Train,Game1-england-italy-273,1809,We'll see if I can keep it friendly.,england-Game1,Game1-england-italy-272,Truth,...,0.75,0.75,1.0,1.0,1.0,1.0,,,1.0,1.0


In [12]:
full_df_length = full_df.shape[0]
full_df = full_df.dropna() # dataset contains NaN values, dropping NaNs here

X = full_df['Input.full_text']

print("Dropped {} rows with NaN".format(full_df_length - X.shape[0]))

# full_df["Input.deception_quadrant"] = full_df["Input.deception_quadrant"].apply(lambda x : 1 if x == "Straightforward" else 0)
y = full_df['Answer.4shareinformation.yes_label']

le = LabelEncoder() # this can convert our categories into labels, make sure you don't have NaNs or Nulls in your data first
y = le.fit_transform(y)

# we reshape 
y = y.reshape(-1,1) # the -1 allows it to have whatever number went in there

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

Dropped 718 rows with NaN
(2636,)
(2636, 1)
(659,)
(659, 1)


In [13]:
max_words = 1000
max_len = 150

tok = Tokenizer(num_words=max_words, oov_token=True)
tok.fit_on_texts(X_train)

sequences = tok.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(sequences,maxlen=max_len)
# X_train = sequence.pad_sequences(sequences)

In [14]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

## LSTM Model

In [15]:
Inp = Input(name='inputs',shape=[max_len])
x = Embedding(max_words,100,input_length=max_len)(Inp)
x = LSTM(64,name='LSTM_01')(x)
x = Dropout(0.3,name='Dropout')(x)
x = Dense(128,activation='relu',name='Dense_01')(x)
x = Dense(64,activation='relu',name='Dense_02')(x)
# x = Dropout(0.5,name='Dropout')(x)
out = Dense(1,activation='sigmoid', name='output')(x)

model = Model(inputs=Inp,outputs=out)

model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['acc',f1_m,precision_m, recall_m])

model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 100)          100000    
_________________________________________________________________
LSTM_01 (LSTM)               (None, 64)                42240     
_________________________________________________________________
Dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
Dense_01 (Dense)             (None, 128)               8320      
_________________________________________________________________
Dense_02 (Dense)             (None, 64)                8256      
_________________________________________________________________
output (Dense)               (None, 1)                

In [16]:
early_stop = EarlyStopping(monitor='val_loss',min_delta=0.00001)

# model.fit(X_train,y_train,
#           batch_size=128,
#           epochs=15,
#           validation_split=0.2,
#           callbacks=[early_stop])

model.fit(X_train,y_train,
          batch_size=128,
          epochs=30,
          validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f403f4c4fd0>

In [17]:
test_sequences_LSTM = tok.texts_to_sequences(X_test)
X_test_LSTM = sequence.pad_sequences(test_sequences_LSTM,maxlen=max_len)
model.evaluate(X_test_LSTM,y_test)



[2.355189085006714,
 0.7572078704833984,
 0.8580648303031921,
 0.8386263251304626,
 0.8818392753601074]

## CNN Model

In [10]:
model_CNN = Sequential(name="CNN_with_embeddings")
model_CNN.add(Embedding(max_words, 100, input_length=max_len))
model_CNN.add(Conv1D(filters=128, kernel_size=10, activation='relu'))
model_CNN.add(MaxPooling1D(pool_size=2))
model_CNN.add(Flatten())
model_CNN.add(Dropout(0.5))
model_CNN.add(Dense(32, activation='relu'))
model_CNN.add(Dense(1, activation='sigmoid'))

model_CNN.compile(loss='binary_crossentropy', 
              optimizer= 'adam',
              metrics=['acc',f1_m,precision_m, recall_m])

model_CNN.summary()

Model: "CNN_with_embeddings"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 100)          100000    
_________________________________________________________________
conv1d (Conv1D)              (None, 141, 128)          128128    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 70, 128)           0         
_________________________________________________________________
flatten (Flatten)            (None, 8960)              0         
_________________________________________________________________
dropout (Dropout)            (None, 8960)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                286752    
_________________________________________________________________
dense_1 (Dense)              (None, 1)         

In [11]:
early_stop = EarlyStopping(monitor='val_loss',min_delta=0.000001)

# model_CNN.fit(X_train,y_train,
#           batch_size=256,
#           epochs=15,
#           validation_split=0.2,
#           callbacks=[early_stop])

model_CNN.fit(X_train,y_train,
          batch_size=128,
          epochs=15,
          validation_split=0.2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fa15c15ac88>

In [12]:
test_sequences_CNN = tok.texts_to_sequences(X_test)
X_test_CNN = sequence.pad_sequences(test_sequences_CNN,maxlen=max_len)

model_CNN.evaluate(X_test_CNN,y_test)



[1.1617460250854492,
 0.8209407925605774,
 0.8995769023895264,
 0.8714513182640076,
 0.9335665106773376]