# Model Training

Author Name: Saransh Srivastava 

Student ID: 1031073

We test our evidence using two models: a baseline logistic regression and a sequential artificial neural network.

In [None]:
import glob
import json
import pickle
import pandas as pd
import unicodedata


# Glob pattern for the wiki text files that are read into memory below.
wiki_path = 'wiki-pages-text/*.txt'
# JSON file holding the training entries (claim, label and evidence per entry).
train_path = 'train.json'

# File names under which the fitted objects are pickled via storeModel().
LOGISTIC_MODEL = "logRegModel.sav"
VECTORIZER = "Vectorizer.sav"
ENCODER = "Encoder.sav"
ANN_MODEL = "SequentialANN.sav"


#### Helper functions

In [None]:
def storeModel(model,filename):
    """Pickle ``model`` into the file at ``filename``.

    Args:
        model: Any picklable object (for example a fitted estimator or encoder).
        filename: Path of the output file; an existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened for writing.
        pickle.PicklingError: If ``model`` cannot be pickled.
    """
    # The context manager closes the handle deterministically, so the data is
    # flushed to disk by the time the function returns instead of whenever the
    # garbage collector gets around to the abandoned file object.
    with open(filename, 'wb') as out_file:
        pickle.dump(model, out_file)
    

#### Read wiki data into memory

In [None]:
# Build a lookup (page id, sentence id) -> sentence text from every wiki file.
files = glob.glob(wiki_path)
dict_doc = {}
for fname in files:
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding (assumes the dump is UTF-8 encoded -- TODO confirm).
    with open(fname, encoding='utf-8') as f:
        for text in f:
            line = text.split()
            # A blank or truncated line has no page id / sentence id to index by;
            # skip it instead of failing with an IndexError.
            if len(line) < 2:
                continue
            # NFD-normalise the page id; the training evidence ids are normalised
            # the same way before they are looked up in this dictionary.
            pid = unicodedata.normalize('NFD',line[0])
            sid = line[1]
            sent = " ".join(line[2:])
            dict_doc[(pid,sid)] = sent

print(len(dict_doc))
# Load the training entries as a dictionary.
with open(train_path, encoding='utf-8') as f:
    jdata = json.load(f)


## Building training data for training of all the models

This needs to run only once; it saves the model to a file, which can then be reused.

In [None]:
##############
### Building simple baseline logistic regression model
##############

###############
### Method : make duplicate claims and label each claim-evidence pair
##  Merge them into one output per claim with list of most dominant evidence
##  (the per-claim merge is not performed in this cell; only the pairs are built)
###############
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


# One row per (claim, evidence sentence) pair for SUPPORTS / REFUTES entries,
# and one claim-only row for every other label.
df_list = []
for values in jdata.values():
    claim = values['claim']
    label = values['label']
    if label == 'SUPPORTS' or label == 'REFUTES':
        for evi in values['evidence']:
            # Normalise the page id the same way the wiki lookup keys were built.
            pid = unicodedata.normalize('NFD',evi[0])
            sid = str(evi[1])
            if (pid,sid) in dict_doc:
                # Separate claim and evidence with a space; plain concatenation can
                # fuse the last claim word and the first evidence word into a single
                # token. NOTE(review): code that builds inputs for prediction should
                # join the two texts the same way -- confirm.
                df_list.append({'claim': claim + " " + dict_doc[(pid,sid)], 'label': label})
    else:
        df_list.append({'claim': claim, 'label': label})

df = pd.DataFrame(df_list)
print(df.label.unique())

sentences = df['claim'].tolist()
labels = df['label'].tolist()

# LabelEncoder numbers the classes in sorted order, so with all three labels present:
# NOT ENOUGH INFO = 0
# REFUTES = 1
# SUPPORTS = 2
encoder = LabelEncoder()
y = encoder.fit_transform(labels)

# Hold out 10% of the pairs; the fixed seed keeps the split reproducible.
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.1, random_state=1000)

# Fit the vocabulary on the training sentences only, then vectorize both splits with it.
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)
X_test



In [None]:
# Display the shape of the vectorized test split.
X_test.shape

## Train a logistic regression model

In [None]:

from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression fitted on the vectorized training split.
classifier = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial',max_iter=1000)
classifier.fit(X_train, y_train)

# Accuracy on the data the model was fitted on; this is an optimistic figure.
score = classifier.score(X_train, y_train)
print("Training accuracy:", score)

# Accuracy on the held-out split, which the model has not seen during fitting.
test_score = classifier.score(X_test, y_test)
print("Test accuracy:", test_score)

############
### Storing above logistic regression model
############

# The vectorizer and the label encoder are stored alongside the classifier.
storeModel(classifier,LOGISTIC_MODEL)
storeModel(vectorizer,VECTORIZER)
storeModel(encoder,ENCODER)


## Building Sequential ANN model
This needs to be trained only once; the model is then stored and reused.

In [None]:
###############
## Sequential ANN
###############

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM


# Width of the input layer: one unit per column of the vectorized training data.
input_dim = X_train.shape[1]

# Feed-forward stack: 512-unit hidden layer with ReLU, dropout at rate 0.5,
# then a 3-unit softmax output.
model = Sequential([
    Dense(512, input_shape=(input_dim,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(3),
    Activation('softmax'),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
print(model.metrics_names)

In [None]:
# Fit the model using the training data and validate using the test set.

In [None]:
# Training hyper-parameters.
batch_size = 12340
epochs = 7

# Fit on the training split and pass the held-out split as validation data.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

# Persist the vectorizer, the label encoder and the trained network, in that order.
for artifact, target in ((vectorizer, VECTORIZER), (encoder, ENCODER), (model, ANN_MODEL)):
    storeModel(artifact, target)

#### Loss vs Accuracy plot

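This plot shows the training loss and accuracy per epoch. Together with the validation metrics reported during fitting, it indicates that we have reached our minimum loss and maximum accuracy without overfitting our training data.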

In [None]:
import matplotlib.pyplot as plt

# Depending on the Keras release, the accuracy metric is recorded under 'acc' or
# 'accuracy'; use whichever key this run produced rather than hard-coding one.
curves = history.history
acc_key = 'acc' if 'acc' in curves else 'accuracy'

plt.plot(curves['loss'], label='loss')
plt.plot(curves[acc_key], label='accuracy')

# Also draw the validation curves when they were recorded: comparing them with
# the training curves is what makes overfitting visible.
for key in ('val_loss', 'val_' + acc_key):
    if key in curves:
        plt.plot(curves[key], linestyle='--', label=key)

plt.xlabel('epoch')
plt.legend()
plt.show()