In [2]:
import re
import os
import math
import nltk
import pickle
import random
import numpy as np
import tensorflow as tf
from nltk.corpus import wordnet
from tensorflow.contrib import rnn
from html.parser import HTMLParser
import xml.etree.ElementTree as ET
from nltk.stem import WordNetLemmatizer
from keras.layers.recurrent import LSTM
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.models import Sequential,load_model
from keras.preprocessing.text import Tokenizer
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Flatten, Dense, Dropout, BatchNormalization

  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)
Using TensorFlow backend.


In [3]:
#using pretrained glove embeddings to embed words
def get_embeddings_index(glove_dir='./glove.6B', filename='glove.6B.300d.txt'):
    """Load pretrained GloVe vectors into a ``{word: vector}`` dictionary.

    Each non-empty line is split on whitespace: the first field is the word,
    the remaining fields are parsed into a float32 numpy array.

    Args:
        glove_dir: Directory that contains the GloVe text file.
        filename: Name of the GloVe file inside ``glove_dir``.

    Returns:
        dict mapping each word to its float32 coefficient vector.

    Raises:
        OSError: if the file cannot be opened.
    """
    embeddings_index = {}
    # Decode explicitly as UTF-8 instead of the platform's locale default, and
    # use a context manager so the handle is released even if parsing fails.
    with open(os.path.join(glove_dir, filename), encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index
# Module-level GloVe lookup; get_embedding_layer reads this global when building its weight matrix.
embeddings_index = get_embeddings_index()

In [4]:
# Shared lemmatizer instance used by preprocess_words.
lemmatizer = WordNetLemmatizer()


def preprocess_words(word_list):
    """Tokenise a text on whitespace and lemmatise the tokens WordNet knows.

    Args:
        word_list: Text to process. Despite the name it is a string, since it
            is split with ``str.split()``.

    Returns:
        List of lemmatised tokens, in their original order. Tokens for which
        ``wordnet.synsets`` returns nothing are dropped.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in word_list.split()
        if wordnet.synsets(token)
    ]

In [5]:
class CustomHTMLParser(HTMLParser):
    """HTML parser that collects text content and strips digits and punctuation."""

    # Accumulated text. This class-level default is shadowed by an instance
    # attribute the first time handle_data assigns to self.a.
    a = ''

    def handle_data(self, data):
        """Append a chunk of text found between tags to the buffer."""
        self.a += str(data)

    def get_raw_text(self):
        """Return the collected text with digit/underscore runs and punctuation blanked.

        Each run of digits or underscores becomes a single space, then every
        character that is neither a word character nor whitespace becomes a
        space. The cleaned text is also stored back on ``self.a``.
        """
        without_digits = re.sub(r'[0-9_]+', ' ', self.a)
        self.a = re.sub(r'[^\w\s]', ' ', without_digits)
        return self.a

In [6]:
#Preprocessing xml files to x train and y train data
def preprocess_data(path = './en/'):
    """Parse the author XML files under ``path`` into token lists and gender labels.

    For every file, the root element's ``gender`` attribute gives the label
    (male -> 1, female -> 0). Each ``conversations/conversation`` element
    becomes one sample: its text is stripped of HTML via CustomHTMLParser and
    tokenised with preprocess_words.

    Args:
        path: Directory containing the XML files.

    Returns:
        (x_data, y_data): parallel lists of token lists and integer labels.
    """
    #TODO add for other classifications, ie. age_group and multi-class
    gender_to_label = {'male': 1, 'female': 0}
    x_data = []
    y_data = []
    # Sort so the sample order does not depend on the arbitrary os.listdir order.
    for filename in sorted(os.listdir(path)):
        # os.path.join works whether or not `path` ends with a separator.
        root = ET.parse(os.path.join(path, filename)).getroot()
        gender = root.attrib.get('gender')
        y = gender_to_label.get(gender)
        if y is None:
            # Without this guard `y` would be unbound, or would silently keep
            # the previous file's label.
            print('Skipping %s: unrecognised gender attribute %r' % (filename, gender))
            continue

        for text in root.findall('conversations/conversation'):
            parser = CustomHTMLParser()
            # An empty element has text None; str(None) would inject the word "None".
            parser.feed(text.text or '')
            # Flush any text the parser is still buffering.
            parser.close()
            removed_tags = parser.get_raw_text()
            x_data.append(preprocess_words(removed_tags))
            y_data.append(y)
    return x_data, y_data

In [None]:
# Parse every file under the default './en/' directory into token lists and labels.
x_data, y_data = preprocess_data()

In [None]:
#store pre-processed input
def save_preprocessed_data(x_data, y_data):
    """Pickle the preprocessed samples and labels into the working directory.

    Writes ``x_data`` to "x_data_all_cnn.txt" and ``y_data`` to
    "y_data_all_cnn.txt". Both are binary pickle files despite the extension.
    """
    targets = (
        ("x_data_all_cnn.txt", x_data),
        ("y_data_all_cnn.txt", y_data),
    )
    for file_name, payload in targets:
        with open(file_name, "wb") as handle:
            pickle.dump(payload, handle)
# Persist the parsed corpus; a later cell reloads these two pickle files.
save_preprocessed_data(x_data, y_data)            

In [6]:
# Placeholder values; both names are rebound by the pickle loads below.
x_data = ""
y_data = ""
# NOTE(review): pickle.load can execute arbitrary code — only load files written by this notebook.
with open("x_data_all_cnn.txt", "rb") as f:   
    x_data = pickle.load(f)
with open("y_data_all_cnn.txt", "rb") as f:   
    y_data = pickle.load(f)

In [7]:
def prepare_word_index(x_data):
    """Fit a Keras Tokenizer on the corpus and return its word-to-id mapping.

    Args:
        x_data: Corpus passed straight to ``Tokenizer.fit_on_texts``.

    Returns:
        The tokenizer's ``word_index`` dictionary. Its size is also printed.
    """
    # The tokenizer is created with num_words=50000 (max features).
    # NOTE(review): the recorded run reports 77188 tokens, so the returned
    # index does not appear to be truncated to num_words.
    tok = Tokenizer(num_words=50000)
    tok.fit_on_texts(x_data)
    vocabulary = tok.word_index
    print('Found %s unique tokens.' % len(vocabulary))
    return vocabulary


word_index = prepare_word_index(x_data)

def save_word_index(word_index):
    """Pickle ``word_index`` to "word_index.txt" in the working directory."""
    with open("word_index.txt", "wb") as out_file:
        pickle.dump(word_index, out_file)
# Persist the vocabulary; load_word_index reads this file back.
save_word_index(word_index)   

Found 77188 unique tokens.


In [7]:
# Sequence length used by pad_sequences and by the Embedding layer's input_length.
max_text_length = 500
# Width of the embedding vectors; the GloVe file loaded above is the 300d one.
embedding_dim = 300 #change to other dim as well
# NOTE(review): not referenced by any code in this notebook; presumably the total sample count — confirm.
total = 163371
def load_word_index():
    """Read the pickled vocabulary back from "word_index.txt".

    Returns:
        The object stored in the file (the word-to-id mapping written by
        save_word_index).
    """
    with open("word_index.txt", "rb") as source:
        return pickle.load(source)
# Rebind the module-level vocabulary from the file on disk.
word_index = load_word_index()    

In [8]:
def get_network_input(x_data, word_index, y_data, max_features=None):
    """Convert tokenised texts and labels into padded id sequences and one-hot targets.

    Args:
        x_data: Iterable of token lists, one per text.
        word_index: Mapping from word to integer id.
        y_data: Class labels, one per text; encoded with ``num_classes=2``.
        max_features: Exclusive upper bound on the word ids that are kept.
            ``None`` (the default) keeps every word found in ``word_index``.
            The default is resolved per call from the ``word_index`` argument
            rather than frozen at definition time against a module-level
            variable.

    Returns:
        (x, y): ``x`` is the id matrix padded to ``max_text_length`` columns,
        ``y`` the one-hot encoded labels.
    """
    x = []
    for text in x_data:
        # Unknown words map to None and are dropped.
        ids = (word_index.get(word) for word in text)
        x.append([
            word_id for word_id in ids
            if word_id is not None
            and (max_features is None or word_id < max_features)
        ])

    # Pad at the end to max_text_length. Truncation keeps pad_sequences'
    # default ('pre'), so longer texts lose their leading tokens.
    x = sequence.pad_sequences(x, maxlen=max_text_length, padding='post')
    y = tf.keras.utils.to_categorical(y_data, num_classes=2)
    return x, y

In [9]:
#load entire batch as training data
def load_all_data(fx="x_data_all_cnn.txt", fy="y_data_all_cnn.txt", shuffle = False, seed=1000,
                  max_features=10000):
    """Load the pickled corpus and turn it into network-ready arrays.

    Args:
        fx: Path of the pickled token lists.
        fy: Path of the pickled labels.
        shuffle: When true, apply one seeded permutation to both arrays.
        seed: Seed for the permutation. It is applied with ``np.random.seed``,
            so the global numpy RNG state is reset as a side effect.
        max_features: Exclusive upper bound on the word ids kept by
            get_network_input (previously hard-coded to 10000).

    Returns:
        (x, y): padded id matrix and one-hot labels, optionally shuffled.
    """
    # NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
    with open(fx, "rb") as f:
        x_data = pickle.load(f)
    with open(fy, "rb") as f:
        y_data = pickle.load(f)
    x, y = get_network_input(x_data, load_word_index(), y_data, max_features)
    if shuffle:
        np.random.seed(seed)
        order = np.arange(len(x))
        np.random.shuffle(order)
        # Index both arrays with the same permutation so samples stay aligned with labels.
        x = np.asarray(x)[order]
        y = np.asarray(y)[order]
    return x, y

In [10]:
#building embedding matrix
def get_embedding_layer(embedding_dim):
    """Build a frozen Keras Embedding layer initialised from the GloVe vectors.

    Args:
        embedding_dim: Width of each embedding row.

    Returns:
        An ``Embedding`` layer with ``len(word_index) + 1`` rows, input length
        ``max_text_length`` and ``trainable=False``.
    """
    vocab = load_word_index()
    vocab_rows = len(vocab) + 1

    # Rows start as zeros; a word missing from embeddings_index keeps its zero row.
    weights = np.zeros((vocab_rows, embedding_dim))
    for token, row in vocab.items():
        vector = embeddings_index.get(token)
        if vector is None:
            continue
        weights[row] = vector

    return Embedding(vocab_rows, embedding_dim, weights=[weights],
                     input_length=max_text_length, trainable=False)

In [11]:
# Load the padded inputs and one-hot labels, shuffled with load_all_data's default seed.
x, y = load_all_data(fx="x_data_all_cnn.txt", fy="y_data_all_cnn.txt", shuffle = True)
# Train on the first 150000 samples and validate on samples 150000 to 160000;
# rows from 160000 onward are not used by this cell.
train_num = 150000
validation_num = 160000
x_train = x[:train_num]
y_train = y[:train_num]
x_val = x[train_num:validation_num]
y_val = y[train_num:validation_num]

In [12]:
def create_cnn_model(dropout_rate=0.2):
    """Build and compile the pure-CNN classifier.

    The model is the frozen GloVe embedding, three Conv1D(128, kernel 5) /
    MaxPooling1D(5) / Dropout stages, then Flatten, BatchNormalization,
    Dropout, Dense(128, relu), BatchNormalization and a 2-unit softmax.

    Args:
        dropout_rate: Rate used by every Dropout layer (default 0.2).

    Returns:
        The compiled ``Sequential`` model. Its summary is printed.
    """
    model = Sequential()
    model.add(get_embedding_layer(embedding_dim))
    # Three identical convolution stages.
    for _ in range(3):
        model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=5))
        model.add(Dropout(dropout_rate))
    model.add(Flatten())
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])#try rmsprop
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()
    return model
# Build the baseline CNN. The initial 25-epoch training and the v1 save below are commented out.
model12 = create_cnn_model()
#model12.fit(x_train, y_train, epochs=25, batch_size=512, validation_data=(x_val, y_val))
#model12.save('model12_v1.h5')    

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 300)          23156700  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 128)          192128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 100, 128)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 100, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxP

In [None]:
# Train up to epoch 50, with epoch numbering resuming after 25 (initial_epoch), then save as v2.
# NOTE(review): the first 25-epoch fit for model12 is commented out where the model is built,
# so on a fresh run this starts from untrained weights — confirm that is intended.
model12.fit(x_train, y_train, epochs=50, initial_epoch=25, batch_size=512, validation_data=(x_val, y_val))
model12.save('./models/model12_v2.h5')

In [None]:
# Reload the v2 checkpoint and report [loss, accuracy] on the validation split.
model12 = load_model('./models/model12_v2.h5')
model12.evaluate(x_val, y_val, verbose=1)

In [None]:
# NOTE(review): this rebinds model12 to the v1 checkpoint, so when cells run in order the
# next fit (initial_epoch=50) continues from v1 rather than v2. The save that would create
# 'model12_v1.h5' is commented out in this notebook — confirm the file exists.
model12 = load_model('model12_v1.h5')
model12.evaluate(x_val, y_val, verbose=1)

In [None]:
# Train up to epoch 75, with epoch numbering resuming after 50, then save as v3.
# NOTE(review): this continues from whichever checkpoint model12 was last bound to.
model12.fit(x_train, y_train, epochs=75, initial_epoch=50, batch_size=512, validation_data=(x_val, y_val))
model12.save('./models/model12_v3.h5')

In [None]:
# Reload the v3 checkpoint and report [loss, accuracy] on the validation split.
model12 = load_model('./models/model12_v3.h5')
model12.evaluate(x_val, y_val, verbose=1)

In [None]:
def create_cnn_model_1(dropout_rate=0.3):
    """Build and compile the pure-CNN classifier with a default dropout rate of 0.3.

    The model is the frozen GloVe embedding, three Conv1D(128, kernel 5) /
    MaxPooling1D(5) / Dropout stages, then Flatten, BatchNormalization,
    Dropout, Dense(128, relu), BatchNormalization and a 2-unit softmax.

    Args:
        dropout_rate: Rate used by every Dropout layer (default 0.3).

    Returns:
        The compiled ``Sequential`` model. Its summary is printed.
    """
    model = Sequential()
    model.add(get_embedding_layer(embedding_dim))
    # Three identical convolution stages.
    for _ in range(3):
        model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=5))
        model.add(Dropout(dropout_rate))
    model.add(Flatten())
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])#try rmsprop
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()
    return model
# Build the create_cnn_model_1 variant, train it for 25 epochs and save v1 in the working directory.
model13 = create_cnn_model_1()
model13.fit(x_train, y_train, epochs=25, batch_size=512, validation_data=(x_val, y_val))
model13.save('model13_v1.h5')    

In [None]:
# Reload the v1 checkpoint and report [loss, accuracy] on the validation split.
model13 = load_model('model13_v1.h5')
model13.evaluate(x_val, y_val, verbose=1)

In [None]:
# Continue training up to epoch 50 (numbering resumes after 25) and save as v2 under ./models/.
model13.fit(x_train, y_train, epochs=50, initial_epoch=25, batch_size=512, validation_data=(x_val, y_val))
model13.save('./models/model13_v2.h5')

In [None]:
# Reload the v2 checkpoint and report [loss, accuracy] on the validation split.
model13 = load_model('./models/model13_v2.h5')
model13.evaluate(x_val, y_val, verbose=1)

In [None]:
def create_cnn_model_2(dropout_rate=0.5):
    """Build and compile the pure-CNN classifier with a default dropout rate of 0.5.

    The model is the frozen GloVe embedding, three Conv1D(128, kernel 5) /
    MaxPooling1D(5) / Dropout stages, then Flatten, BatchNormalization,
    Dropout, Dense(128, relu), BatchNormalization and a 2-unit softmax.

    Args:
        dropout_rate: Rate used by every Dropout layer (default 0.5).

    Returns:
        The compiled ``Sequential`` model. Its summary is printed.
    """
    model = Sequential()
    model.add(get_embedding_layer(embedding_dim))
    # Three identical convolution stages.
    for _ in range(3):
        model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=5))
        model.add(Dropout(dropout_rate))
    model.add(Flatten())
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])#try rmsprop
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()
    return model
# Build the dropout-0.5 variant defined in this cell (create_cnn_model_2), not the
# 0.2 baseline create_cnn_model, then train for 25 epochs and save v1.
model14 = create_cnn_model_2()
model14.fit(x_train, y_train, epochs=25, batch_size=512, validation_data=(x_val, y_val))
model14.save('model14_v1.h5')

In [None]:
# Reload the v1 checkpoint and report [loss, accuracy] on the validation split.
model14 = load_model('model14_v1.h5')
model14.evaluate(x_val, y_val, verbose=1)

In [None]:
# Continue training up to epoch 50 (numbering resumes after 25) and save as v2 under ./models/.
model14.fit(x_train, y_train, epochs=50, initial_epoch=25, batch_size=512, validation_data=(x_val, y_val))
model14.save('./models/model14_v2.h5')

In [None]:
# Reload the v2 checkpoint and report [loss, accuracy] on the validation split.
model14 = load_model('./models/model14_v2.h5')
model14.evaluate(x_val, y_val, verbose=1)

In [13]:
def create_cnn_model_3(dropout_rate=0.4):
    """Build and compile the pure-CNN classifier with a default dropout rate of 0.4.

    The model is the frozen GloVe embedding, three Conv1D(128, kernel 5) /
    MaxPooling1D(5) / Dropout stages, then Flatten, BatchNormalization,
    Dropout, Dense(128, relu), BatchNormalization and a 2-unit softmax.

    Args:
        dropout_rate: Rate used by every Dropout layer (default 0.4).

    Returns:
        The compiled ``Sequential`` model. Its summary is printed.
    """
    model = Sequential()
    model.add(get_embedding_layer(embedding_dim))
    # Three identical convolution stages.
    for _ in range(3):
        model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=5))
        model.add(Dropout(dropout_rate))
    model.add(Flatten())
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])#try rmsprop
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()
    return model
# Build the create_cnn_model_3 variant, train it for 25 epochs and save v1 in the working directory.
model15 = create_cnn_model_3()
model15.fit(x_train, y_train, epochs=25, batch_size=512, validation_data=(x_val, y_val))
model15.save('model15_v1.h5')    

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 300)          23156700  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 128)          192128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 100, 128)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 100, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxP

In [14]:
# Reload the v1 checkpoint and report [loss, accuracy] on the validation split.
# The recorded run shows a validation accuracy of 0.5384 after the first 25 epochs.
model15 = load_model('model15_v1.h5')
model15.evaluate(x_val, y_val, verbose=1)



[0.8814251465797425, 0.5384]

In [15]:
# Continue training up to epoch 50 (the log shows epochs 26 through 50) and save as v2 under ./models/.
model15.fit(x_train, y_train, epochs=50, initial_epoch=25, batch_size=512, validation_data=(x_val, y_val))
model15.save('./models/model15_v2.h5')

Train on 150000 samples, validate on 10000 samples
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [16]:
# Reload the v2 checkpoint and report [loss, accuracy] on the validation split.
# The recorded run shows a validation accuracy of 0.5652 after 50 epochs.
model15 = load_model('./models/model15_v2.h5')
model15.evaluate(x_val, y_val, verbose=1)



[0.8767600864410401, 0.5652]