In [1]:
from __future__ import print_function
from IPython.display import display, HTML
import os
import sys
import csv
import time
import statistics
import numpy as np
import string
import re
import pandas as pd
import text_features_extractor as tfExtractor
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, accuracy_score, recall_score
from sklearn.model_selection import GridSearchCV
#Keras
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, model_from_json, Model
from keras.layers import Dense, Input, LSTM, Bidirectional, Flatten, Dropout
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.layers.merge import concatenate
from keras.preprocessing import sequence
from keras.utils.vis_utils import plot_model
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import make_pipeline
import tensorflow
from lime import lime_text
from lime.lime_text import LimeTextExplainer
import pickle
import jieba.posseg as pseg

import modelCreator as mCreator
import lvExtractor as le

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Seed NumPy's global random generator so that anything drawing from it
# repeats from run to run. `seed` is also reused by later cells as the
# random_state of their train/test splits.
seed = 10
np.random.seed(seed)

## Data preprocessing

In [3]:
# Load the corpus and keep only the text and label columns.
# .copy() detaches the two-column frame from the one read_csv returned, so the
# column assignment further down writes to a real frame rather than a slice.
chinDataDf = pd.read_csv("data_with_features/chin_cleaned_data_f.csv", encoding='UTF-8')
chinDataDf = chinDataDf[['text', 'depressed']].copy()


def _preprocessChinText(rawText):
    """Keep the Chinese part of one post and run the Chinese preprocessing on it.

    The English part returned by splitChinEng is discarded. The return value is
    whatever tfExtractor.chinPreprocessing returns (presumably a list of words,
    since later cells join it with spaces -- confirm against that module).
    """
    chinText, _engText = tfExtractor.splitChinEng(rawText)
    return tfExtractor.chinPreprocessing(chinText)


# Replace every raw post by its preprocessed form in one pass.
# (DataFrame.set_value, used here before, is deprecated and was removed in pandas 1.0.)
chinDataDf['text'] = chinDataDf['text'].map(_preprocessChinText)

# Build X as a 1-D object array, one entry per post. Filling it element by
# element keeps the shape (n_posts,) regardless of the word-list lengths;
# np.array() on a list of lists would yield a 2-D array when all lists happen
# to have the same length and is rejected for ragged input by recent NumPy.
processedTexts = chinDataDf['text'].tolist()
X = np.empty(len(processedTexts), dtype=object)
for position, words in enumerate(processedTexts):
    X[position] = words

# Positions of posts that are empty after preprocessing; they are dropped below.
deleteIndex = [position for position, words in enumerate(X) if len(words) == 0]

# Labels: map -1 to 0 so the classes are {0, 1}.
Y = np.array(chinDataDf['depressed'].tolist())
Y[Y == -1] = 0
# Backward compatibility: the original counting loop left `i` equal to the
# number of labels in the notebook namespace, and other cells may read it.
i = len(Y)

print("Before empty list deletion")
print(X.shape)
print(Y.shape)
X = np.delete(X, deleteIndex)
Y = np.delete(Y, deleteIndex)
print("After empty list deletion")
print(X.shape)
print(Y.shape)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\richi\AppData\Local\Temp\jieba.cache
Loading model cost 0.945 seconds.
Prefix dict has been built succesfully.
  # Remove the CWD from sys.path while we load stuff.


Before empty list deletion
(804,)
(804,)
After empty list deletion
(801,)
(801,)


In [None]:
# Build one instance of each model.
# createCnnMcModel takes (length, vocab_size). The vocabulary size is written
# as the literal 5000 because `topWords` is only assigned in a later cell, so
# referencing it here raises NameError on a fresh kernel. 584 and 5000 are the
# values the later cells compute/assign for maxReviewLength and topWords.
cnnModel = mCreator.createCnnMcModel(584, 5000)
lexCnnModel = mCreator.createLexCnnMcModel(5000, 10)

### CNN_MC model for Chinese data

In [19]:
# Vocabulary cap handed to the tokenizer (original number of words: 46708)
topWords = 5000

# X holds one word sequence per document; the tokenizer is fed one
# space-separated string per document instead.
xString = [' '.join(words) for words in X]

# Fit the tokenizer on the corpus, then encode every document as a sequence of
# integer word indices.
tokenizer = Tokenizer(num_words=topWords)
tokenizer.fit_on_texts(xString)
print("tokenizer fitting is complete")
xSeq = tokenizer.texts_to_sequences(xString)

# word_index is the word -> index mapping learned during fitting
wordIndex = tokenizer.word_index
print("Number of words: " + str(len(wordIndex)))

tokenizer fitting is complete
Number of words: 31152


In [20]:
# Number of tokens in every encoded review
lengths = list(map(len, xSeq))
print("review mean length: " + str(np.mean(lengths)))

# Use the 90th percentile of the review lengths as the common sequence length
maxReviewLength = int(np.percentile(lengths, 90))
print("maximum review length: " + str(maxReviewLength))

# Same two values under the keyword names the model builder is called with
length, vocab_size = maxReviewLength, topWords

# Bring every sequence to exactly maxReviewLength entries
xPadded = pad_sequences(xSeq, maxlen=maxReviewLength)
print("Done padding")

review mean length: 274.44943820224717
maximum review length: 584
Done padding


## Tuning batch size and epoch size

In [None]:
# Hold out 20% of the padded sequences as a stratified test set.
# The split is seeded with the notebook-wide `seed`; the previous code passed
# `i`, a loop counter left over from whichever cell happened to run last, so
# the split depended on execution order.
xTrain, xTest, yTrain, yTest = train_test_split(xPadded, Y, test_size=0.2, shuffle=True, random_state=seed, stratify=Y)
print(xTrain.shape)
print(yTrain.shape)
# Wrap the Keras model builder so scikit-learn can clone and fit it; `length`
# and `vocab_size` are forwarded to mCreator.createCnnMcModel.
model = KerasClassifier(build_fn=mCreator.createCnnMcModel,length=maxReviewLength, vocab_size=topWords, verbose=2)
# Candidate batch sizes and epoch counts; every combination is evaluated.
batch_size = [1,5,10, 20, 40, 60, 80, 100]
epochs = [10,20,30,40, 50,60,70,80,90, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
# The model takes four inputs, hence the same training data four times.
# NOTE(review): GridSearchCV checks that X and y have the same number of
# samples; a 4-element list will probably be seen as 4 samples and rejected.
# Confirm before relying on this cell.
grid_result = grid.fit([xTrain,xTrain,xTrain,xTrain], yTrain)
# Summarize: best combination first, then mean/std test score for every combination.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Training

### Word Embedding CNN

In [None]:
# One entry per training session: accuracy, average precision, recall
cnnAccuracy, cnnPR, cnnRecall = [], [], []
for i in range(1):
    print("Training session: " + str(i))
    # Stratified 80/20 train/test split, seeded for repeatability
    xTrain, xTest, yTrain, yTest = train_test_split(
        xPadded, Y, test_size=0.2, shuffle=True, random_state=seed, stratify=Y)
    cnnModel = mCreator.createCnnMcModel(maxReviewLength, topWords)

    # The model has four inputs; each one is fed the same padded sequences
    trainInputs = [xTrain] * 4
    testInputs = [xTest] * 4

    cnnModel.fit(trainInputs, yTrain, epochs=30, batch_size=3, verbose=2)

    # Evaluate on the held-out set and collect the raw predictions
    cnnscores = cnnModel.evaluate(testInputs, yTest, verbose=0)
    yPred = cnnModel.predict(testInputs)
    # Hard labels for recall: a prediction above 0.5 counts as class 1
    yPredNorm = [1 if p > 0.5 else 0 for p in yPred]

    # Average precision uses the raw predictions, recall the hard labels
    cnn_average_precision = average_precision_score(yTest, yPred)
    cnn_recall_score = recall_score(yTest, yPredNorm, average='binary')

    print("Accuracy: %.2f%%" % (cnnscores[1]*100))
    print('Average precision score: {0:0.2f}'.format(cnn_average_precision))
    print('Recall score: {0:0.2f}'.format(cnn_recall_score))

    cnnAccuracy.append(cnnscores[1])
    cnnPR.append(cnn_average_precision)
    cnnRecall.append(cnn_recall_score)

Training session: 0
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           (None, 584)          0                                            
__________________________________________________________________________________________________
input_34 (InputLayer)           (None, 584)          0                                            
__________________________________________________________________________________________________
input_35 (InputLayer)           (None, 584)          0                                            
__________________________________________________________________________________________________
input_36 (InputLayer)           (None, 584)          0                                            
_________________________________________________________________________________________

In [None]:
def _meanOf(values):
    """Return the arithmetic mean of a non-empty list as a Python float."""
    return float(sum(values)) / float(len(values))


# Average each metric over all training sessions
cnn_mean_ac = _meanOf(cnnAccuracy)
cnn_mean_pr = _meanOf(cnnPR)
cnn_mean_recall = _meanOf(cnnRecall)

print("Overal CNN MC result")
print("accuracy: " + str(cnn_mean_ac))
print("precision score: " + str(cnn_mean_pr))
print("recall: " + str(cnn_mean_recall))

### Lexicon CNN 

In [3]:
# One lexicon vector per document. Each document is cut to at most
# maxReviewLength words (slicing leaves shorter documents untouched), joined
# without separators and handed to le.getLexiconVector.
lexEmbeddings = [
    le.getLexiconVector(''.join(words[:maxReviewLength]))
    for words in X
]
# Number of documents processed
lexProcessCount = len(lexEmbeddings)

In [4]:
# Pad every lexicon embedding with zero rows so that all of them have as many
# rows (first axis) as the longest one.
maxDim = max(emb.shape[0] for emb in lexEmbeddings)
print("MaxDim: " + str(maxDim))
for i in range(len(lexEmbeddings)):
    diffDim = maxDim - lexEmbeddings[i].shape[0]
    if diffDim > 0:
        # Append all missing rows in one step. Each padding row has shape
        # (10, 1), as before; stacking them one vstack at a time re-copied the
        # whole embedding for every single row.
        padBlock = np.zeros((diffDim, 10, 1))
        lexEmbeddings[i] = np.vstack((lexEmbeddings[i], padBlock))

MaxDim: 7283


In [5]:
# Sanity check after padding: every embedding must have exactly maxDim rows.
# (The comparison used to be against topWords, the tokenizer vocabulary cap,
# which is unrelated to the length the embeddings are padded to.)
for emb in lexEmbeddings:
    if emb.shape[0] != maxDim:
        print("Something is wrong!")

In [10]:
# One entry per training session: accuracy, average precision, recall
lexCnnAccuracy, lexCnnPR, lexCnnRecall = [], [], []
for i in range(1):
    print("Training session: " + str(i))
    # Stack the padded embeddings into one array and drop size-1 axes, then
    # make a stratified 80/20 train/test split, seeded for repeatability
    lexFeatures = np.squeeze(np.stack(lexEmbeddings))
    xTrain, xTest, yTrain, yTest = train_test_split(
        lexFeatures, Y, test_size=0.2, shuffle=True, random_state=seed, stratify=Y)
    lexCnnModel = mCreator.createLexCnnMcModel(maxDim, 10)

    # The model has four inputs; each one is fed the same lexicon features
    trainInputs = [xTrain] * 4
    testInputs = [xTest] * 4

    lexCnnModel.fit(trainInputs, yTrain, epochs=30, batch_size=3, verbose=2)

    # Evaluate on the held-out set and collect the raw predictions
    lexCnnscores = lexCnnModel.evaluate(testInputs, yTest, verbose=0)
    yPred = lexCnnModel.predict(testInputs)
    # Hard labels for recall: a prediction above 0.5 counts as class 1
    yPredNorm = [1 if p > 0.5 else 0 for p in yPred]

    # Average precision uses the raw predictions, recall the hard labels
    lexCnn_average_precision = average_precision_score(yTest, yPred)
    lexCnn_recall_score = recall_score(yTest, yPredNorm, average='binary')

    print("Accuracy: %.2f%%" % (lexCnnscores[1]*100))
    print('Average precision score: {0:0.2f}'.format(lexCnn_average_precision))
    print('Recall score: {0:0.2f}'.format(lexCnn_recall_score))

    lexCnnAccuracy.append(lexCnnscores[1])
    lexCnnPR.append(lexCnn_average_precision)
    lexCnnRecall.append(lexCnn_recall_score)

Training session: 0
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 7283, 10)     0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 7283, 10)     0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            (None, 7283, 10)     0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 7283, 10)     0                                            
_________________________________________________________________________________________

Epoch 1/30


KeyboardInterrupt: 