In [7]:
from __future__ import print_function
from IPython.display import display, HTML
import os
import sys
import csv
import time
import statistics
import numpy as np
import string
import re
import pandas as pd
import text_features_extractor as tfExtractor
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, accuracy_score, recall_score
#Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, model_from_json
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import plot_model
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import make_pipeline
import tensorflow
from lime import lime_text

### NN model for Chinese data

In [8]:
#Read in the csv
chinDataDf = pd.read_csv("data_with_features/chin_cleaned_data_f.csv", encoding='UTF-8')
#Only take the text and sentiment columns; copy so the column rewrite below
#works on an independent frame rather than on a slice of the frame read above
chinDataDf = chinDataDf[['text', 'depressed']].copy()
#Cleaning: keep the Chinese part of every text and preprocess it.
#The cleaned values are collected first and written back in one assignment:
#DataFrame.set_value has been removed from pandas, and writing into a frame
#while iterating over it with iterrows() is not reliable.
cleanedTexts = []
for rawText in chinDataDf['text']:
    #Preprocessing
    chinText, engText = tfExtractor.splitChinEng(rawText)
    cleanedTexts.append(tfExtractor.chinPreprocessing(chinText))
#dtype=object keeps each preprocessed value as a single cell
chinDataDf['text'] = pd.Series(cleanedTexts, index=chinDataDf.index, dtype=object)
#Convert data to numpy array
X = np.array(chinDataDf['text'].tolist())
Y = np.array(chinDataDf['depressed'].tolist())
#Convert -1 label to 0 (boolean mask instead of a manual index counter)
Y[Y == -1] = 0

In [9]:
#Original number of words: 46708
#Vocabulary cap handed to the tokenizer (num_words)
topWords = 5000
#Join every entry of X with single spaces so the tokenizer splits on whitespace
xString = [' '.join(text) for text in X]
#Tokenizing the data
tokenizer = Tokenizer(num_words=topWords)
tokenizer.fit_on_texts(xString)
print("tokenizer fitting is complete")
#Encode every text as a sequence of integer word ids
xSeq = tokenizer.texts_to_sequences(xString)
#word_index holds every word seen during fitting, so its size can exceed topWords
wordIndex = tokenizer.word_index
print("Number of words: %d" % len(wordIndex))

tokenizer fitting is complete
Number of words: 31152


In [10]:
#Token count of every encoded review
lengths = [len(encodedReview) for encodedReview in xSeq]
print("review mean length: %s" % np.mean(lengths))
#Cap the sequence length at the 90th percentile of the review lengths
maxReviewLength = int(np.percentile(lengths, 90))
print("maximum review length: %s" % maxReviewLength)

#Bring every review to exactly maxReviewLength entries (Keras default padding/truncating)
xPadded = pad_sequences(xSeq, maxlen=maxReviewLength)
print("Done padding")

review mean length: 273.425373134
maximum review length: 583
Done padding


In [11]:
def creatNnModel():
    """Build and compile the simple multilayer perceptron classifier.

    Reads the notebook-level ``topWords`` (embedding vocabulary size) and
    ``maxReviewLength`` (input sequence length) at call time, so both must
    be defined before this function is called.

    Returns:
        A compiled Keras ``Sequential`` model made of Embedding -> Flatten ->
        Dense(250, relu) -> Dense(1, sigmoid), compiled with binary
        cross-entropy loss, the Adam optimizer and the accuracy metric.
    """
    mlp = Sequential([
        #Embedding layer
        Embedding(topWords, 128, input_length=maxReviewLength),
        #Flattening
        Flatten(),
        #Hidden layer
        Dense(250, activation='relu'),
        #Output layer
        Dense(1, activation='sigmoid'),
    ])
    mlp.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    #print(mlp.summary())
    return mlp

def createCnnModel():
    """Build and compile the 1D convolutional classifier.

    Reads the notebook-level ``topWords`` (embedding vocabulary size) and
    ``maxReviewLength`` (input sequence length) at call time, so both must
    be defined before this function is called.

    Returns:
        A compiled Keras ``Sequential`` model made of Embedding ->
        Conv1D(128 filters, kernel 3, same padding, relu) ->
        MaxPooling1D(2) -> Flatten -> Dense(250, relu) -> Dense(1, sigmoid),
        compiled with binary cross-entropy loss, the Adam optimizer and the
        accuracy metric.
    """
    #Layers in the order they are stacked
    layerStack = (
        Embedding(topWords, 128, input_length=maxReviewLength),
        Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(250, activation='relu'),
        Dense(1, activation='sigmoid'),
    )
    #CNN model
    network = Sequential()
    for layer in layerStack:
        network.add(layer)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    #print(network.summary())
    return network

In [12]:
#Per-session metrics: nn* lists belong to the MLP, cnn* lists to the CNN
nnAccuracy, nnPR, nnRecall = [], [], []
cnnAccuracy, cnnPR, cnnRecall = [], [], []
for session in range(10):
    print("Training session: " + str(session))
    #Split data into training and test set (80%/20%); the session number
    #seeds the split so every session uses a different stratified partition
    xTrain, xTest, yTrain, yTest = train_test_split(xPadded, Y, test_size=0.2, shuffle=True, random_state=session, stratify=Y)

    #Create model
    model = creatNnModel()
    cnnModel = createCnnModel()

    #Train and score the MLP first, then the CNN, with the same steps
    for network, accuracyLog, prLog, recallLog in (
            (model, nnAccuracy, nnPR, nnRecall),
            (cnnModel, cnnAccuracy, cnnPR, cnnRecall)):
        # Fit the model
        network.fit(xTrain, yTrain, validation_data=(xTest, yTest), epochs=25, batch_size=50, verbose=2)
        # Final evaluation of the model; scores[1] is the accuracy metric
        scores = network.evaluate(xTest, yTest, verbose=0)
        yPred = network.predict(xTest)
        #Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels for recall
        yPredNorm = [1 if p > 0.5 else 0 for p in yPred]
        average_precision = average_precision_score(yTest, yPred)
        recall = recall_score(yTest, yPredNorm, average='binary')
        print("Accuracy: %.2f%%" % (scores[1]*100))
        print('Average precision score: {0:0.2f}'.format(average_precision))
        print('Recall score: {0:0.2f}'.format(recall))
        accuracyLog.append(scores[1])
        prLog.append(average_precision)
        recallLog.append(recall)

Training session: 0
Train on 643 samples, validate on 161 samples
Epoch 1/25
12s - loss: 0.8270 - acc: 0.5770 - val_loss: 0.6496 - val_acc: 0.6646
Epoch 2/25
10s - loss: 0.4392 - acc: 0.7729 - val_loss: 0.6019 - val_acc: 0.6398
Epoch 3/25
10s - loss: 0.2847 - acc: 0.9269 - val_loss: 0.5451 - val_acc: 0.6832
Epoch 4/25
10s - loss: 0.1551 - acc: 0.9627 - val_loss: 0.5152 - val_acc: 0.7453
Epoch 5/25
10s - loss: 0.0855 - acc: 0.9844 - val_loss: 0.5201 - val_acc: 0.7391
Epoch 6/25
10s - loss: 0.0448 - acc: 0.9938 - val_loss: 0.5327 - val_acc: 0.7453
Epoch 7/25
10s - loss: 0.0254 - acc: 0.9984 - val_loss: 0.5269 - val_acc: 0.7329
Epoch 8/25
10s - loss: 0.0148 - acc: 0.9984 - val_loss: 0.5183 - val_acc: 0.7453
Epoch 9/25
10s - loss: 0.0097 - acc: 0.9984 - val_loss: 0.5323 - val_acc: 0.7453
Epoch 10/25
10s - loss: 0.0069 - acc: 1.0000 - val_loss: 0.5417 - val_acc: 0.7453
Epoch 11/25
10s - loss: 0.0051 - acc: 1.0000 - val_loss: 0.5307 - val_acc: 0.7764
Epoch 12/25
10s - loss: 0.0043 - acc: 1.0

7s - loss: 0.0039 - acc: 0.9953 - val_loss: 1.0114 - val_acc: 0.8323
Epoch 22/25
7s - loss: 0.0038 - acc: 0.9953 - val_loss: 1.0150 - val_acc: 0.8323
Epoch 23/25
7s - loss: 0.0037 - acc: 0.9953 - val_loss: 1.0189 - val_acc: 0.8323
Epoch 24/25
7s - loss: 0.0036 - acc: 0.9953 - val_loss: 1.0209 - val_acc: 0.8261
Epoch 25/25
10s - loss: 0.0035 - acc: 0.9953 - val_loss: 1.0231 - val_acc: 0.8261
Accuracy: 82.61%
Average precision score: 0.83
Recall score: 0.78
Training session: 2
Train on 643 samples, validate on 161 samples
Epoch 1/25
7s - loss: 1.3888 - acc: 0.5179 - val_loss: 0.6381 - val_acc: 0.6584
Epoch 2/25
7s - loss: 0.5915 - acc: 0.6796 - val_loss: 0.6308 - val_acc: 0.6584
Epoch 3/25
8s - loss: 0.5440 - acc: 0.6827 - val_loss: 0.6371 - val_acc: 0.6584
Epoch 4/25
7s - loss: 0.5013 - acc: 0.6936 - val_loss: 0.6314 - val_acc: 0.6584
Epoch 5/25
6s - loss: 0.4075 - acc: 0.8274 - val_loss: 0.5590 - val_acc: 0.7578
Epoch 6/25
6s - loss: 0.2839 - acc: 0.9176 - val_loss: 0.5253 - val_acc: 0

6s - loss: 0.0130 - acc: 0.9953 - val_loss: 1.4220 - val_acc: 0.7702
Epoch 18/25
6s - loss: 0.0124 - acc: 0.9953 - val_loss: 1.4277 - val_acc: 0.7640
Epoch 19/25
7s - loss: 0.0114 - acc: 0.9953 - val_loss: 1.4452 - val_acc: 0.7516
Epoch 20/25
6s - loss: 0.0104 - acc: 0.9953 - val_loss: 1.4641 - val_acc: 0.7640
Epoch 21/25
6s - loss: 0.0094 - acc: 0.9953 - val_loss: 1.4861 - val_acc: 0.7702
Epoch 22/25
6s - loss: 0.0084 - acc: 0.9953 - val_loss: 1.5101 - val_acc: 0.7702
Epoch 23/25
6s - loss: 0.0074 - acc: 0.9953 - val_loss: 1.5391 - val_acc: 0.7764
Epoch 24/25
7s - loss: 0.0070 - acc: 0.9953 - val_loss: 1.5664 - val_acc: 0.7764
Epoch 25/25
6s - loss: 0.0058 - acc: 0.9953 - val_loss: 1.5866 - val_acc: 0.7764
Accuracy: 77.64%
Average precision score: 0.76
Recall score: 0.71
Training session: 4
Train on 643 samples, validate on 161 samples
Epoch 1/25
7s - loss: 0.9045 - acc: 0.5614 - val_loss: 0.6394 - val_acc: 0.6522
Epoch 2/25
6s - loss: 0.5098 - acc: 0.7185 - val_loss: 0.6246 - val_acc

6s - loss: 0.0314 - acc: 0.9922 - val_loss: 1.7003 - val_acc: 0.7205
Epoch 14/25
7s - loss: 0.0229 - acc: 0.9907 - val_loss: 1.6548 - val_acc: 0.7205
Epoch 15/25
6s - loss: 0.0228 - acc: 0.9907 - val_loss: 1.6539 - val_acc: 0.7205
Epoch 16/25
6s - loss: 0.0189 - acc: 0.9922 - val_loss: 1.8854 - val_acc: 0.7267
Epoch 17/25
6s - loss: 0.0177 - acc: 0.9938 - val_loss: 1.8193 - val_acc: 0.7329
Epoch 18/25
6s - loss: 0.0140 - acc: 0.9953 - val_loss: 1.6778 - val_acc: 0.7453
Epoch 19/25
6s - loss: 0.0152 - acc: 0.9938 - val_loss: 2.0026 - val_acc: 0.7516
Epoch 20/25
6s - loss: 0.0097 - acc: 0.9953 - val_loss: 1.8485 - val_acc: 0.7391
Epoch 21/25
7s - loss: 0.0095 - acc: 0.9953 - val_loss: 1.8363 - val_acc: 0.7453
Epoch 22/25
7s - loss: 0.0086 - acc: 0.9938 - val_loss: 1.8549 - val_acc: 0.7453
Epoch 23/25
7s - loss: 0.0084 - acc: 0.9938 - val_loss: 1.8724 - val_acc: 0.7516
Epoch 24/25
7s - loss: 0.0070 - acc: 0.9953 - val_loss: 1.7744 - val_acc: 0.7640
Epoch 25/25
6s - loss: 0.0082 - acc: 0.9

7s - loss: 0.0108 - acc: 0.9984 - val_loss: 1.0974 - val_acc: 0.7888
Epoch 10/25
7s - loss: 0.0085 - acc: 0.9984 - val_loss: 1.1364 - val_acc: 0.7826
Epoch 11/25
6s - loss: 0.0078 - acc: 0.9969 - val_loss: 1.1636 - val_acc: 0.7826
Epoch 12/25
7s - loss: 0.0063 - acc: 0.9984 - val_loss: 1.1867 - val_acc: 0.7888
Epoch 13/25
7s - loss: 0.0076 - acc: 0.9969 - val_loss: 1.2059 - val_acc: 0.7950
Epoch 14/25
7s - loss: 0.0044 - acc: 0.9969 - val_loss: 1.2186 - val_acc: 0.7888
Epoch 15/25
7s - loss: 0.0067 - acc: 0.9984 - val_loss: 1.2384 - val_acc: 0.7888
Epoch 16/25
7s - loss: 0.0066 - acc: 0.9984 - val_loss: 1.3085 - val_acc: 0.7640
Epoch 17/25
7s - loss: 0.0082 - acc: 0.9984 - val_loss: 1.3067 - val_acc: 0.7888
Epoch 18/25
7s - loss: 0.0042 - acc: 0.9984 - val_loss: 1.2538 - val_acc: 0.7578
Epoch 19/25
7s - loss: 0.0061 - acc: 0.9984 - val_loss: 1.2373 - val_acc: 0.7888
Epoch 20/25
7s - loss: 0.0015 - acc: 1.0000 - val_loss: 1.3232 - val_acc: 0.7888
Epoch 21/25
6s - loss: 0.0047 - acc: 0.9

7s - loss: 0.0770 - acc: 0.9798 - val_loss: 0.5605 - val_acc: 0.8075
Epoch 6/25
7s - loss: 0.0476 - acc: 0.9844 - val_loss: 0.5027 - val_acc: 0.8323
Epoch 7/25
6s - loss: 0.0328 - acc: 0.9876 - val_loss: 0.5960 - val_acc: 0.8137
Epoch 8/25
6s - loss: 0.0271 - acc: 0.9922 - val_loss: 0.5366 - val_acc: 0.8696
Epoch 9/25
6s - loss: 0.0203 - acc: 0.9922 - val_loss: 0.5696 - val_acc: 0.8261
Epoch 10/25
7s - loss: 0.0183 - acc: 0.9922 - val_loss: 0.6379 - val_acc: 0.8199
Epoch 11/25
7s - loss: 0.0142 - acc: 0.9938 - val_loss: 0.5293 - val_acc: 0.8137
Epoch 12/25
7s - loss: 0.0150 - acc: 0.9922 - val_loss: 0.5260 - val_acc: 0.8447
Epoch 13/25
7s - loss: 0.0130 - acc: 0.9907 - val_loss: 0.5511 - val_acc: 0.8509
Epoch 14/25
7s - loss: 0.0123 - acc: 0.9938 - val_loss: 0.5610 - val_acc: 0.8696
Epoch 15/25
7s - loss: 0.0090 - acc: 0.9969 - val_loss: 0.5671 - val_acc: 0.8385
Epoch 16/25
7s - loss: 0.0073 - acc: 0.9984 - val_loss: 0.5476 - val_acc: 0.8571
Epoch 17/25
7s - loss: 0.0051 - acc: 0.9984 

In [13]:
#Arithmetic mean of every metric over all training sessions
nn_mean_ac, nn_mean_pr, nn_mean_recall, cnn_mean_ac, cnn_mean_pr, cnn_mean_recall = [
    float(sum(sessionValues)) / float(len(sessionValues))
    for sessionValues in (nnAccuracy, nnPR, nnRecall, cnnAccuracy, cnnPR, cnnRecall)
]
#Report the NN block first, then the CNN block, with identical row labels
for heading, meanAccuracy, meanPrecision, meanRecall in (
        ("Overal NN result", nn_mean_ac, nn_mean_pr, nn_mean_recall),
        ("Overal CNN result", cnn_mean_ac, cnn_mean_pr, cnn_mean_recall)):
    print(heading)
    print("accuracy: " + str(meanAccuracy))
    print("precision score: " + str(meanPrecision))
    print("recall: " + str(meanRecall))

Overal NN result
accuracy: 0.7875776397515527
precision score: 0.752761580489891
recall: 0.5781818181818182
Overal CNN result
accuracy: 0.7962732919254659
precision score: 0.7520838007319237
recall: 0.7181818181818181


In [None]:
#Render each architecture diagram to a PNG file (CNN first, then the MLP)
for network, imagePath in ((cnnModel, 'cnnModel.png'), (model, 'nnModel.png')):
    plot_model(network, to_file=imagePath)

In [None]:
#Save the model
# Make sure the output directory exists first; otherwise open() and
# save_weights() below fail when saved_model/ has not been created yet.
if not os.path.isdir("saved_model"):
    os.makedirs("saved_model")
# serialize model to JSON (architecture only, no weights)
model_json = model.to_json()
with open("saved_model/chinModel.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("saved_model/chinModel.h5")
print("Saved model to disk")
# same two steps for the CNN
cnnModel_json = cnnModel.to_json()
with open("saved_model/chinCnnModel.json", "w") as cnnJson_file:
    cnnJson_file.write(cnnModel_json)
# serialize weights to HDF5
cnnModel.save_weights("saved_model/chinCnnModel.h5")
print("Saved model to disk")

In [None]:
# load json and create model
# The context manager closes the file even if read() raises.
with open('saved_model/chinCnnModel.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("saved_model/chinCnnModel.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
#loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#score = loaded_model.evaluate(X, Y, verbose=0)
#print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

### NN model for English data

In [None]:
#Read in the csv
engDataDf = pd.read_csv("data_with_features/eng_cleaned_data_f.csv", encoding='UTF-8')
#Only take the text and sentiment columns; copy so the column rewrite below
#works on an independent frame rather than on a slice of the frame read above
engDataDf = engDataDf[['text', 'depressed']].copy()
#Cleaning: keep the English part of every text and preprocess it.
#The cleaned values are collected first and written back in one assignment:
#DataFrame.set_value has been removed from pandas, and writing into a frame
#while iterating over it with iterrows() is not reliable.
cleanedTexts = []
for rawText in engDataDf['text']:
    #Preprocessing
    chinText, engText = tfExtractor.splitChinEng(rawText)
    cleanedTexts.append(tfExtractor.engPreprocessing(engText))
#dtype=object keeps each preprocessed value as a single cell
engDataDf['text'] = pd.Series(cleanedTexts, index=engDataDf.index, dtype=object)
#Convert data to numpy array
X = np.array(engDataDf['text'].tolist())
Y = np.array(engDataDf['depressed'].tolist())
#Convert -1 label to 0 (boolean mask instead of a manual index counter)
Y[Y == -1] = 0

In [None]:
#Original number of words: 46708
#NOTE(review): same figure as in the Chinese section - confirm it applies to the English corpus
#Vocabulary cap handed to the tokenizer (num_words)
topWords = 5000
#Tokenizing the data: fit on the preprocessed English texts, then encode them
tokenizer = Tokenizer(num_words=topWords)
tokenizer.fit_on_texts(X)
print("tokenizer fitting is complete")
xSeq = tokenizer.texts_to_sequences(X)
#word_index holds every word seen during fitting, so its size can exceed topWords
wordIndex = tokenizer.word_index
print("Number of words: %d" % len(wordIndex))

In [None]:
#Token count of every encoded review
lengths = [len(encodedReview) for encodedReview in xSeq]
print("review mean length: %s" % np.mean(lengths))
#Cap the sequence length at the 90th percentile of the review lengths
maxReviewLength = int(np.percentile(lengths, 90))
print("maximum review length: %s" % maxReviewLength)

#Bring every review to exactly maxReviewLength entries (Keras default padding/truncating)
xPadded = pad_sequences(xSeq, maxlen=maxReviewLength)
print("Done padding")

In [None]:
#Per-session metrics: nn* lists belong to the MLP, cnn* lists to the CNN
nnAccuracy = []
nnPR = []
nnRecall = []
cnnAccuracy = []
cnnPR = []
cnnRecall = []
for i in range(10):
    print("Training session: " + str(i))
    #Split data into training and test set (80%/20%); the session number
    #seeds the split so every session uses a different stratified partition
    xTrain, xTest, yTrain, yTest = train_test_split(xPadded, Y, test_size=0.2, shuffle=True, random_state=i, stratify=Y)

    #Create model
    engModel = creatNnModel()
    engCnnModel = createCnnModel()

    #Train and score the MLP first, then the CNN, with the same steps so both
    #models report the same three metrics under the same labels (the NN recall
    #used to be computed but never printed)
    for network, accuracyLog, prLog, recallLog in (
            (engModel, nnAccuracy, nnPR, nnRecall),
            (engCnnModel, cnnAccuracy, cnnPR, cnnRecall)):
        # Fit the model
        network.fit(xTrain, yTrain, validation_data=(xTest, yTest), epochs=20, batch_size=50, verbose=2)
        # Final evaluation of the model; scores[1] is the accuracy metric
        scores = network.evaluate(xTest, yTest, verbose=0)
        yPred = network.predict(xTest)
        #Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels for recall
        yPredNorm = [1 if p > 0.5 else 0 for p in yPred]
        average_precision = average_precision_score(yTest, yPred)
        recall = recall_score(yTest, yPredNorm, average='binary')
        print("Accuracy: %.2f%%" % (scores[1]*100))
        print('Average precision score: {0:0.2f}'.format(average_precision))
        print('Recall score: {0:0.2f}'.format(recall))
        accuracyLog.append(scores[1])
        prLog.append(average_precision)
        recallLog.append(recall)

In [None]:
#Arithmetic mean of every metric over all training sessions
nn_mean_ac, nn_mean_pr, nn_mean_recall, cnn_mean_ac, cnn_mean_pr, cnn_mean_recall = [
    float(sum(sessionValues)) / float(len(sessionValues))
    for sessionValues in (nnAccuracy, nnPR, nnRecall, cnnAccuracy, cnnPR, cnnRecall)
]
#Report the NN block first, then the CNN block, with identical row labels
for heading, meanAccuracy, meanPrecision, meanRecall in (
        ("Overal NN result", nn_mean_ac, nn_mean_pr, nn_mean_recall),
        ("Overal CNN result", cnn_mean_ac, cnn_mean_pr, cnn_mean_recall)):
    print(heading)
    print("accuracy: " + str(meanAccuracy))
    print("precision score: " + str(meanPrecision))
    print("recall: " + str(meanRecall))

In [None]:
#Average precision of the English CNN on the most recent test split.
#When the notebook runs top to bottom, xTest/yTest hold the last English
#train/test split, so the model scored here has to be the one fitted on that
#data (engCnnModel); cnnModel was fitted on the Chinese sequences.
yPred = engCnnModel.predict(xTest)
average_precision = average_precision_score(yTest, yPred)

print('Average precision-recall score: {0:0.2f}'.format(average_precision))