In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Location of the annotated message dataset, relative to the notebook.
DATA_PATH = 'data/affcon_final.csv'

# Load every column as-is. NOTE(review): the preview shows an 'Unnamed: 0'
# column, which looks like a saved index -- confirm whether it is needed.
full_df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to check which columns were loaded.
full_df.head()

Unnamed: 0,Input.convo_id,Input.train_test_val,Input.msg_id,Input.timestamp,Input.full_text,affcon_gamemove,affcon_reasoning,affcon_rapport,affcon_shareinformation,Input.speaker,...,Input.relative_message_index,Input.year,Input.game_score_speaker,Input.game_score_receiver,Input.game_score_delta,Input.deception_quadrant,Input.num_words,Input.num_characters,Input.sno,Input.sno1
0,Game7-turkey-austria,Train,Game7-turkey-austria-9,197,Im moving my fleet to Alb not for Greece but f...,1,1,1,1,austria-Game7,...,9,1901,3,3,0,Straightforward,12,56,32,16.0
1,Game10-england-russia,Train,Game10-england-russia-36,722,If you take action in the spring to fight Aust...,1,1,1,1,england-Game10,...,36,1905,7,7,0,Straightforward,28,150,100,12.0
2,Game6-england-germany,Train,Game6-england-germany-86,469,This is to help prevent the ottoman advance.,0,1,0,1,england-Game6,...,86,1909,6,9,3,Straightforward,8,44,30,5.0
3,Game11-austria-italy,Validation,Game11-austria-italy-5,45,"And yes I would like peace on our front, I cou...",1,1,1,0,austria-Game11,...,5,1901,3,3,0,Straightforward,31,132,46,22.0
4,Game6-germany-russia,Train,Game6-germany-russia-3,37,"Just to be clear, England is not necessarily a...",1,0,0,1,russia-Game6,...,3,1901,4,5,1,Cassandra,27,148,63,21.0


In [4]:
# Model input is the raw message text; the target is the `affcon_rapport` column.
X = full_df['Input.full_text']
y = full_df['affcon_rapport']

# Convert the target categories into integer labels.
# Make sure the data has no NaNs or nulls before encoding.
le = LabelEncoder()
y = le.fit_transform(y)
print(y.shape)

# Reshape into a single column; -1 lets NumPy infer the number of rows.
y = y.reshape(-1, 1)
print(y.shape)

# Fix the seed so the stratified 80/20 split (and every metric computed from
# it) is identical on each Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=SPLIT_SEED,
)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(15738,)
(15738, 1)
(12590,)
(12590, 1)
(3148,)
(3148, 1)


In [10]:
# Longest training message, measured in characters (len of each string),
# not in tokens.
X_train.apply(len).max()

532

In [11]:
# Vocabulary size passed to the tokenizer/embedding, and the fixed number of
# tokens per padded sequence.
max_words, max_len = 600, 50

# Build the word index from the training texts only.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

# Turn each training text into a list of word ids, then pad/truncate every
# list to exactly `max_len` entries. X_train is replaced by the padded array.
sequences = tok.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(sequences, maxlen=max_len)

In [12]:
# Binary classifier graph: token ids -> embedding -> LSTM -> dense head.
Inp = Input(shape=[max_len], name='inputs')

# 50-dimensional embedding for each of the `max_words` vocabulary ids.
embedded = Embedding(max_words, 50, input_length=max_len)(Inp)

# The LSTM's 64-unit output feeds the dense layers below.
encoded = LSTM(64, name='LSTM_01')(embedded)

# Dense hidden layer with dropout for regularisation.
hidden = Dense(256, activation='relu', name='Dense_01')(encoded)
hidden = Dropout(0.2, name='Dropout')(hidden)

# Single sigmoid unit: a value in (0, 1) for the binary target.
out = Dense(1, activation='sigmoid', name='output')(hidden)

In [13]:
# Wrap the layer graph (from the `Inp` tensor to the `out` tensor) in a Keras Model.
model = Model(inputs=Inp,outputs=out)

In [14]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric. RMSprop is used with its default settings.
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 50)            30000     
_________________________________________________________________
LSTM_01 (LSTM)               (None, 64)                29440     
_________________________________________________________________
Dense_01 (Dense)             (None, 256)               16640     
_________________________________________________________________
Dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
output (Dense)               (None, 1)                 257       
Total params: 76,337
Trainable params: 76,337
Non-trainable params: 0
__________________________________________________

In [16]:
# Stop training when the validation loss stops improving by at least 1e-4.
# With the default patience of 0, a single non-improving epoch ends training
# and the model keeps that worse epoch's weights. Allowing two stale epochs
# and restoring the best weights means the returned model is the one with the
# lowest validation loss seen during training.
early_stop = EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=2,
    restore_best_weights=True,
)

In [17]:
# Training settings: mini-batches of 128, at most 10 epochs, 20% of the
# training data held out for validation, early stopping on validation loss.
fit_kwargs = dict(
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stop],
)

# Fit on the padded training sequences; the returned History object is the
# cell's output.
model.fit(X_train, y_train, **fit_kwargs)

Epoch 1/10
Epoch 2/10
Epoch 3/10


<tensorflow.python.keras.callbacks.History at 0x7f41eeedff98>

In [18]:
# Encode the held-out texts with the tokenizer fitted on the training set,
# then pad/truncate to the same `max_len` the model was built for.
# X_test is replaced by the padded array.
test_sequences = tok.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
)

In [15]:
# Score the trained model on the held-out test set; returns the loss followed
# by the compiled metric (accuracy).
model.evaluate(x=X_test, y=y_test)



[0.658470094203949, 0.5952985882759094]