In [8]:
from __future__ import print_function
from IPython.display import display, HTML
import os
import sys
import csv
import time
import statistics
import numpy as np
import string
import re
import pandas as pd
import text_features_extractor as tfExtractor
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, accuracy_score, recall_score
#Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, model_from_json
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import plot_model
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import make_pipeline
import tensorflow
from lime import lime_text
import pickle

### NN model for Chinese data

In [9]:
#Read in the csv
chinDataDf = pd.read_csv("data_with_features/chin_cleaned_data_f.csv", encoding='UTF-8')
#Only take the text and sentiment columns. The explicit copy makes the frame
#independent of the one returned by read_csv, so assigning to 'text' below
#cannot trigger SettingWithCopyWarning.
chinDataDf = chinDataDf[['text', 'depressed']].copy()
#Cleaning: preprocess the first value returned by splitChinEng (the part this
#notebook calls chinText) for every row.
#DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0, and it
#was being called while iterating over the same frame; the cleaned column is
#now built first and assigned once.
cleanedTexts = []
for rawText in chinDataDf['text']:
    chinText, engText = tfExtractor.splitChinEng(rawText)
    cleanedTexts.append(tfExtractor.chinPreprocessing(chinText))
#Wrapping in a Series keeps each cleaned value as one cell, whatever its type
chinDataDf['text'] = pd.Series(cleanedTexts, index=chinDataDf.index)
#Convert data to numpy array
X = np.array(chinDataDf['text'].tolist())
Y = np.array(chinDataDf['depressed'].tolist())
#Convert -1 label to 0 so the labels match the sigmoid / binary cross-entropy
#output of the models (boolean mask instead of a manual counter loop)
Y[Y == -1] = 0

In [10]:
#Original number of words: 46708
#Vocabulary cap handed to the Keras tokenizer
topWords = 5000
tokenizer = Tokenizer(num_words=topWords)
#Join the pieces of every preprocessed text with single spaces before
#handing them to the tokenizer
xString = [' '.join(pieces) for pieces in X]
tokenizer.fit_on_texts(xString)
print("tokenizer fitting is complete")
#Turn every text into its sequence of integer word ids
xSeq = tokenizer.texts_to_sequences(xString)
#Full word -> id mapping learned during fitting
wordIndex = tokenizer.word_index
print("Number of words: ", len(wordIndex), sep="")

tokenizer fitting is complete
Number of words: 31152


In [11]:
#Save tokenizer
#The with-block closes the file handle (flushing the pickle to disk) even if
#dumping fails; previously the handle returned by open() was never closed.
with open('saved_model/chinTokenizer.p', 'wb') as tokenizerFile:
    pickle.dump(tokenizer, tokenizerFile)

In [12]:
#Number of tokens in every tokenized text
lengths = list(map(len, xSeq))
print("review mean length: ", np.mean(lengths), sep="")
#The 90th percentile of the lengths becomes the fixed sequence length
maxReviewLength = int(np.percentile(lengths, 90))
print("maximum review length: ", maxReviewLength, sep="")

#Pad/truncate every sequence to exactly maxReviewLength entries
xPadded = pad_sequences(xSeq, maxlen=maxReviewLength)
print("Done padding")

review mean length: 273.425373134
maximum review length: 583
Done padding


In [13]:
def creatNnModel():
    """Build and compile the simple multilayer perceptron classifier.

    The vocabulary size and the input length are not parameters: they are
    read from the notebook-level variables ``topWords`` and
    ``maxReviewLength`` at call time, so both must be set before calling.

    Returns:
        A compiled ``Sequential`` model: embedding -> flatten ->
        250-unit ReLU layer -> single sigmoid output, trained with binary
        cross-entropy and Adam, reporting accuracy.
    """
    layerStack = [
        # 128-dimensional embedding for each of the topWords token ids
        Embedding(topWords, 128, input_length=maxReviewLength),
        # Collapse the (sequence, embedding) output into one vector
        Flatten(),
        # Hidden layer
        Dense(250, activation='relu'),
        # Output layer
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential(layerStack)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

def createCnnModel():
    """Build and compile the 1-D convolutional classifier.

    Like ``creatNnModel``, the vocabulary size and the input length come from
    the notebook-level variables ``topWords`` and ``maxReviewLength`` at call
    time.

    Returns:
        A compiled ``Sequential`` model: embedding -> Conv1D (128 filters,
        kernel size 3, 'same' padding, ReLU) -> max pooling (size 2) ->
        flatten -> 250-unit ReLU layer -> single sigmoid output, trained with
        binary cross-entropy and Adam, reporting accuracy.
    """
    layerStack = [
        Embedding(topWords, 128, input_length=maxReviewLength),
        Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(250, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential(layerStack)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

In [14]:
#Per-session metrics; each list receives one entry per training session
nnAccuracy = []
nnPR = []
nnRecall = []
cnnAccuracy = []
cnnPR = []
cnnRecall = []
for i in range(1):
    print("Training session: " + str(i))
    #Split data into training and test set (80%/20%), stratified on the label;
    #the session index doubles as the split seed
    xTrain, xTest, yTrain, yTest = train_test_split(xPadded, Y, test_size=0.2, shuffle=True, random_state=i, stratify=Y)

    #Create model
    model = creatNnModel()
    cnnModel = createCnnModel()

    #Train and score both networks with one shared routine (NN first, then
    #CNN) instead of two copy-pasted blocks.
    for net, accuracyLog, prLog, recallLog in (
            (model, nnAccuracy, nnPR, nnRecall),
            (cnnModel, cnnAccuracy, cnnPR, cnnRecall)):
        # Fit the model. NOTE: the held-out split is also passed as
        # validation_data, so the per-epoch val_* figures are computed on the
        # same samples as the final scores below.
        net.fit(xTrain, yTrain, validation_data=(xTest, yTest), epochs=25, batch_size=50, verbose=2)
        # Final evaluation of the model; scores[1] is the accuracy metric
        scores = net.evaluate(xTest, yTest, verbose=0)
        yPred = net.predict(xTest)
        # Hard 0/1 labels: predictions strictly above 0.5 count as positive
        yPredNorm = (yPred > 0.5).astype(int).ravel()
        # Average precision uses the raw scores, recall the hard labels
        average_precision = average_precision_score(yTest, yPred)
        recall = recall_score(yTest, yPredNorm, average='binary')
        print("Accuracy: %.2f%%" % (scores[1]*100))
        print('Average precision score: {0:0.2f}'.format(average_precision))
        print('Recall score: {0:0.2f}'.format(recall))
        accuracyLog.append(scores[1])
        prLog.append(average_precision)
        recallLog.append(recall)

Training session: 0
Train on 643 samples, validate on 161 samples
Epoch 1/25
4s - loss: 0.9119 - acc: 0.5630 - val_loss: 0.6170 - val_acc: 0.6708
Epoch 2/25
3s - loss: 0.4814 - acc: 0.7823 - val_loss: 0.6071 - val_acc: 0.6584
Epoch 3/25
3s - loss: 0.3436 - acc: 0.8725 - val_loss: 0.5566 - val_acc: 0.7143
Epoch 4/25
3s - loss: 0.2028 - acc: 0.9502 - val_loss: 0.5180 - val_acc: 0.7453
Epoch 5/25
3s - loss: 0.0963 - acc: 0.9860 - val_loss: 0.4946 - val_acc: 0.7453
Epoch 6/25
3s - loss: 0.0484 - acc: 0.9938 - val_loss: 0.5041 - val_acc: 0.7453
Epoch 7/25
3s - loss: 0.0250 - acc: 0.9984 - val_loss: 0.4938 - val_acc: 0.7702
Epoch 8/25
3s - loss: 0.0191 - acc: 0.9984 - val_loss: 0.5199 - val_acc: 0.7702
Epoch 9/25
3s - loss: 0.0124 - acc: 1.0000 - val_loss: 0.5090 - val_acc: 0.7826
Epoch 10/25
3s - loss: 0.0073 - acc: 0.9984 - val_loss: 0.5275 - val_acc: 0.7578
Epoch 11/25
3s - loss: 0.0047 - acc: 1.0000 - val_loss: 0.5255 - val_acc: 0.7702
Epoch 12/25
3s - loss: 0.0037 - acc: 1.0000 - val_lo

In [16]:
#Average every metric list over all training sessions (sum / count)
nn_mean_ac, nn_mean_pr, nn_mean_recall = (
    float(sum(values)) / float(len(values))
    for values in (nnAccuracy, nnPR, nnRecall)
)
cnn_mean_ac, cnn_mean_pr, cnn_mean_recall = (
    float(sum(values)) / float(len(values))
    for values in (cnnAccuracy, cnnPR, cnnRecall)
)
#Report the fully connected network first, then the CNN
print("Overal NN result")
print("accuracy: ", nn_mean_ac, sep="")
print("precision score: ", nn_mean_pr, sep="")
print("recall: ", nn_mean_recall, sep="")
print("Overal CNN result")
print("accuracy: ", cnn_mean_ac, sep="")
print("precision score: ", cnn_mean_pr, sep="")
print("recall: ", cnn_mean_recall, sep="")

Overal NN result
accuracy: 0.7639751552795031
precision score: 0.7188205686066103
recall: 0.5818181818181818
Overal CNN result
accuracy: 0.7701863354037267
precision score: 0.6706246305688812
recall: 0.6363636363636364


In [None]:
#Write a diagram of each network's architecture to a PNG file
for net, pngPath in ((cnnModel, 'cnnModel.png'), (model, 'nnModel.png')):
    plot_model(net, to_file=pngPath)

In [17]:
#Save the model
#Each network is stored as two files: its architecture serialized to JSON and
#its weights serialized to HDF5
for net, jsonPath, weightsPath in (
        (model, "saved_model/chinModel.json", "saved_model/chinModel.h5"),
        (cnnModel, "saved_model/chinCnnModel.json", "saved_model/chinCnnModel.h5")):
    architecture = net.to_json()
    with open(jsonPath, "w") as jsonFile:
        jsonFile.write(architecture)
    net.save_weights(weightsPath)
    print("Saved model to disk")

Saved model to disk
Saved model to disk


In [None]:
# load json and create model
# The with-block closes the file even if read() raises; the manual
# open()/close() pair leaked the handle in that case.
with open('saved_model/chinCnnModel.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("saved_model/chinCnnModel.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
# (left disabled; the reloaded model is compiled first, then evaluated)
#loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#score = loaded_model.evaluate(X, Y, verbose=0)
#print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

### NN model for English data

In [22]:
#Read in the csv
engDataDf = pd.read_csv("data_with_features/eng_cleaned_data_f.csv", encoding='UTF-8')
#Only take the text and sentiment columns. The explicit copy makes the frame
#independent of the one returned by read_csv, so assigning to 'text' below
#cannot trigger SettingWithCopyWarning.
engDataDf = engDataDf[['text', 'depressed']].copy()
#Cleaning: preprocess the second value returned by splitChinEng (the part this
#notebook calls engText) for every row.
#DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0, and it
#was being called while iterating over the same frame; the cleaned column is
#now built first and assigned once.
cleanedTexts = []
for rawText in engDataDf['text']:
    chinText, engText = tfExtractor.splitChinEng(rawText)
    cleanedTexts.append(tfExtractor.engPreprocessing(engText))
#Wrapping in a Series keeps each cleaned value as one cell, whatever its type
engDataDf['text'] = pd.Series(cleanedTexts, index=engDataDf.index)
#Convert data to numpy array
X = np.array(engDataDf['text'].tolist())
Y = np.array(engDataDf['depressed'].tolist())
#Convert -1 label to 0 so the labels match the sigmoid / binary cross-entropy
#output of the models (boolean mask instead of a manual counter loop)
Y[Y == -1] = 0

In [23]:
#Original number of words: 46708
#Vocabulary cap handed to the Keras tokenizer
topWords = 5000
tokenizer = Tokenizer(num_words=topWords)
#The preprocessed English texts are fed to the tokenizer as they are
tokenizer.fit_on_texts(X)
print("tokenizer fitting is complete")
#Turn every text into its sequence of integer word ids
xSeq = tokenizer.texts_to_sequences(X)
#Full word -> id mapping learned during fitting
wordIndex = tokenizer.word_index
print("Number of words: ", len(wordIndex), sep="")

tokenizer fitting is complete
Number of words: 7413


In [24]:
#Save tokenizer
#The with-block closes the file handle (flushing the pickle to disk) even if
#dumping fails; previously the handle returned by open() was never closed.
with open('saved_model/engTokenizer.p', 'wb') as tokenizerFile:
    pickle.dump(tokenizer, tokenizerFile)

In [25]:
#Number of tokens in every tokenized text
lengths = list(map(len, xSeq))
print("review mean length: ", np.mean(lengths), sep="")
#The 90th percentile of the lengths becomes the fixed sequence length
maxReviewLength = int(np.percentile(lengths, 90))
print("maximum review length: ", maxReviewLength, sep="")

#Pad/truncate every sequence to exactly maxReviewLength entries
xPadded = pad_sequences(xSeq, maxlen=maxReviewLength)
print("Done padding")

review mean length: 11.1305322129
maximum review length: 17
Done padding


In [26]:
#Per-session metrics; each list receives one entry per training session
nnAccuracy = []
nnPR = []
nnRecall = []
cnnAccuracy = []
cnnPR = []
cnnRecall = []
for i in range(1):
    print("Training session: " + str(i))
    #Split data into training and test set (80%/20%), stratified on the label;
    #the session index doubles as the split seed
    xTrain, xTest, yTrain, yTest = train_test_split(xPadded, Y, test_size=0.2, shuffle=True, random_state=i, stratify=Y)

    #Create model
    engModel = creatNnModel()
    engCnnModel = createCnnModel()

    #Train and score both networks with one shared routine (NN first, then
    #CNN). The two copy-pasted blocks had drifted apart: the NN recall was
    #computed but never printed and its average precision carried a different
    #label than the CNN's. Both models now report the same three lines.
    for net, accuracyLog, prLog, recallLog in (
            (engModel, nnAccuracy, nnPR, nnRecall),
            (engCnnModel, cnnAccuracy, cnnPR, cnnRecall)):
        # Fit the model. NOTE: the held-out split is also passed as
        # validation_data, so the per-epoch val_* figures are computed on the
        # same samples as the final scores below.
        net.fit(xTrain, yTrain, validation_data=(xTest, yTest), epochs=20, batch_size=50, verbose=2)
        # Final evaluation of the model; scores[1] is the accuracy metric
        scores = net.evaluate(xTest, yTest, verbose=0)
        yPred = net.predict(xTest)
        # Hard 0/1 labels: predictions strictly above 0.5 count as positive
        yPredNorm = (yPred > 0.5).astype(int).ravel()
        # Average precision uses the raw scores, recall the hard labels
        average_precision = average_precision_score(yTest, yPred)
        recall = recall_score(yTest, yPredNorm, average='binary')
        print("Accuracy: %.2f%%" % (scores[1]*100))
        print('Average precision score: {0:0.2f}'.format(average_precision))
        print('Recall score: {0:0.2f}'.format(recall))
        accuracyLog.append(scores[1])
        prLog.append(average_precision)
        recallLog.append(recall)

Training session: 0
Train on 2856 samples, validate on 714 samples
Epoch 1/20
1s - loss: 0.4745 - acc: 0.7637 - val_loss: 0.3156 - val_acc: 0.8417
Epoch 2/20
1s - loss: 0.1475 - acc: 0.9454 - val_loss: 0.2810 - val_acc: 0.8683
Epoch 3/20
1s - loss: 0.0288 - acc: 0.9923 - val_loss: 0.2939 - val_acc: 0.8697
Epoch 4/20
1s - loss: 0.0126 - acc: 0.9972 - val_loss: 0.3209 - val_acc: 0.8669
Epoch 5/20
1s - loss: 0.0081 - acc: 0.9993 - val_loss: 0.3392 - val_acc: 0.8683
Epoch 6/20
1s - loss: 0.0060 - acc: 0.9989 - val_loss: 0.3566 - val_acc: 0.8697
Epoch 7/20
1s - loss: 0.0057 - acc: 0.9989 - val_loss: 0.3718 - val_acc: 0.8697
Epoch 8/20
1s - loss: 0.0044 - acc: 0.9982 - val_loss: 0.4051 - val_acc: 0.8655
Epoch 9/20
1s - loss: 0.0050 - acc: 0.9989 - val_loss: 0.4041 - val_acc: 0.8655
Epoch 10/20
1s - loss: 0.0041 - acc: 0.9993 - val_loss: 0.4093 - val_acc: 0.8641
Epoch 11/20
1s - loss: 0.0034 - acc: 0.9989 - val_loss: 0.4676 - val_acc: 0.8543
Epoch 12/20
1s - loss: 0.0060 - acc: 0.9989 - val_l

In [None]:
#Average every metric list over all training sessions (sum / count)
nn_mean_ac, nn_mean_pr, nn_mean_recall = (
    float(sum(values)) / float(len(values))
    for values in (nnAccuracy, nnPR, nnRecall)
)
cnn_mean_ac, cnn_mean_pr, cnn_mean_recall = (
    float(sum(values)) / float(len(values))
    for values in (cnnAccuracy, cnnPR, cnnRecall)
)
#Report the fully connected network first, then the CNN
print("Overal NN result")
print("accuracy: ", nn_mean_ac, sep="")
print("precision score: ", nn_mean_pr, sep="")
print("recall: ", nn_mean_recall, sep="")
print("Overal CNN result")
print("accuracy: ", cnn_mean_ac, sep="")
print("precision score: ", cnn_mean_pr, sep="")
print("recall: ", cnn_mean_recall, sep="")

In [None]:
#Save the model
#Each network is stored as two files: its architecture serialized to JSON and
#its weights serialized to HDF5
for net, jsonPath, weightsPath in (
        (engModel, "saved_model/engModel.json", "saved_model/engModel.h5"),
        (engCnnModel, "saved_model/engCnnModel.json", "saved_model/engCnnModel.h5")):
    architecture = net.to_json()
    with open(jsonPath, "w") as jsonFile:
        jsonFile.write(architecture)
    net.save_weights(weightsPath)
    print("Saved model to disk")

In [None]:
#Re-score the English CNN on the English held-out split.
#xTest/yTest were last assigned by the English training cell, so they have to
#be scored with engCnnModel. cnnModel is the Chinese CNN, built for the
#Chinese sequence length, and cannot consume these shorter sequences.
yPred = engCnnModel.predict(xTest)
average_precision = average_precision_score(yTest, yPred)

print('Average precision-recall score: {0:0.2f}'.format(average_precision))