In [None]:
from __future__ import print_function
from IPython.display import display, HTML
import os
import sys
import csv
import time
import statistics
import numpy as np
import string
import re
import pandas as pd
import text_features_extractor as tfExtractor
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, accuracy_score, recall_score
#Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, model_from_json
from keras.layers import Dense
from keras.layers import LSTM, Bidirectional
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import plot_model
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import make_pipeline
import tensorflow
from lime import lime_text
from lime.lime_text import LimeTextExplainer
import pickle

In [None]:
def createRnnModel(vocabSize=5000, embeddingDim=256, inputLength=None):
    """Build and compile the binary text classifier used in this notebook.

    Layer stack: Embedding -> Conv1D (128 filters, kernel 3, relu) ->
    MaxPooling1D (pool 2) -> Bidirectional LSTM (100 units) -> Dense sigmoid.
    Compiled with binary cross-entropy, the adam optimizer and the accuracy metric.

    Args:
        vocabSize: Input dimension of the Embedding layer. Defaults to 5000,
            the value that was previously hard-coded.
            NOTE(review): presumably must match the tokenizer's num_words - confirm.
        embeddingDim: Output dimension of the Embedding layer (default 256).
        inputLength: Length of the padded input sequences. When None, the
            notebook-level `maxReviewLength` is read at call time, which is
            what the original implementation always did.

    Returns:
        The compiled Sequential model.
    """
    if inputLength is None:
        #Fall back to the notebook-level value so existing no-argument calls keep working
        inputLength = maxReviewLength
    #Convolution + bidirectional LSTM model
    rnnModel = Sequential()
    rnnModel.add(Embedding(vocabSize, embeddingDim, input_length=inputLength))
    rnnModel.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
    rnnModel.add(MaxPooling1D(pool_size=2))
    rnnModel.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
    rnnModel.add(Dense(1, activation='sigmoid'))
    rnnModel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    #summary() prints the table itself and returns None; wrapping it in print()
    #emitted an extra "None" line
    rnnModel.summary()
    return rnnModel

### NN model for Chinese data

In [None]:
#Read in the csv
chinDataDf = pd.read_csv("data_with_features/chin_cleaned_data_f.csv", encoding='UTF-8')
#Only take the text and sentiment columns; copy so the cleaning below writes to an
#independent frame rather than to a slice of the full table
chinDataDf = chinDataDf[['text', 'depressed']].copy()


def _cleanChinText(rawText):
    """Split rawText into its Chinese and English parts and preprocess the Chinese part.

    The English part returned by splitChinEng is discarded, as before.
    """
    chinText, _engText = tfExtractor.splitChinEng(rawText)
    return tfExtractor.chinPreprocessing(chinText)


#Cleaning: replace every raw text with its preprocessed Chinese content.
#DataFrame.set_value no longer exists in current pandas, and writing into the frame
#while iterating over iterrows() is fragile, so the column is rebuilt in one step.
chinDataDf['text'] = chinDataDf['text'].apply(_cleanChinText)
#Convert data to numpy array
X = np.array(chinDataDf['text'].tolist())
Y = np.array(chinDataDf['depressed'].tolist())
#Convert -1 label to 0
Y[Y == -1] = 0

In [None]:
#Original number of words: 46708
#Vocabulary size handed to the tokenizer
topWords = 5000
#Join the items of every entry of X with single spaces so the tokenizer
#receives one plain string per text
xString = [' '.join(entry) for entry in X]
#Fit the tokenizer on the joined texts
tokenizer = Tokenizer(num_words=topWords)
tokenizer.fit_on_texts(xString)
print("tokenizer fitting is complete")
#Encode every text as a sequence of integer word indices
xSeq = tokenizer.texts_to_sequences(xString)
#Word -> index mapping learned during fitting
wordIndex = tokenizer.word_index
print("Number of words: " + str(len(wordIndex)))

In [None]:
#Number of tokens in every encoded review
lengths = list(map(len, xSeq))
print("review mean length: " + str(np.mean(lengths)))
#Use the 90th percentile of the review lengths as the common sequence length
maxReviewLength = int(np.percentile(lengths, 90))
print("maximum review length: " + str(maxReviewLength))

#Bring every encoded review to exactly maxReviewLength entries
xPadded = pad_sequences(xSeq, maxlen=maxReviewLength)
print("Done padding")

In [None]:
#Per-session metrics; one value is appended to each list per train/test split
rnnAccuracy, rnnPR, rnnRecall = [], [], []
#NOTE(review): every session builds a new model and none is released here -
#consider clearing the Keras session between runs; confirm for the Keras version in use
for session in range(10):
    print("Training session: " + str(session))
    #Stratified 80%/20% train/test split; the session number is used as the random seed
    xTrain, xTest, yTrain, yTest = train_test_split(
        xPadded, Y,
        test_size=0.2, shuffle=True, random_state=session, stratify=Y)

    #Fresh, untrained model for this session
    rnnModel = createRnnModel()

    #Train; the held-out split is also passed as validation data
    rnnModel.fit(xTrain, yTrain, validation_data=(xTest, yTest), epochs=50, batch_size=1, verbose=2)

    #Score the held-out split (index 1 of the result is the accuracy metric)
    rnnscores = rnnModel.evaluate(xTest, yTest, verbose=0)
    yPred = rnnModel.predict(xTest)
    #Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels
    yPredNorm = [1 if p > 0.5 else 0 for p in yPred]
    rnn_average_precision = average_precision_score(yTest, yPred)
    rnn_recall_score = recall_score(yTest, yPredNorm, average='binary')

    #Record this session's metrics
    rnnAccuracy.append(rnnscores[1])
    rnnPR.append(rnn_average_precision)
    rnnRecall.append(rnn_recall_score)

    #Report this session's metrics
    print("Accuracy: %.2f%%" % (rnnscores[1]*100))
    print('Average precision score: {0:0.2f}'.format(rnn_average_precision))
    print('Recall score: {0:0.2f}'.format(rnn_recall_score))

In [None]:
def _meanOf(values):
    """Return the arithmetic mean of a non-empty sequence of numbers as a float."""
    return float(sum(values))/float(len(values))


#Average every metric over all training sessions
rnn_mean_ac = _meanOf(rnnAccuracy)
rnn_mean_pr = _meanOf(rnnPR)
rnn_mean_recall = _meanOf(rnnRecall)
print("Overall RNN result")
print("accuracy: " + str(rnn_mean_ac))
#rnnPR holds average_precision_score values, so the label says so
print("average precision score: " + str(rnn_mean_pr))
print("recall: " + str(rnn_mean_recall))