In [1]:
# keras
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding

# sklearn
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV

# Others
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import re


from sklearn.manifold import TSNE



In [2]:
# Load the training data; the path is relative to the notebook's working directory.
train = pd.read_csv('./train.csv')

In [3]:
# Preview the first rows to check the column layout (qid, question_text, target).
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
# (rows, columns) of the raw training frame.
train.shape

(1306122, 3)

In [5]:
# Class balance of the binary target (1 = insincere, 0 = sincere).
target_col = train['target']
no_insincere = train.loc[target_col == 1, 'target'].count()
no_sincere = train.loc[target_col == 0, 'target'].count()

# Mean of a 0/1 column is the share of positives. It is printed as a fraction
# (not multiplied by 100) even though the label says "%".
insincere_share = target_col.mean()

print('No. of insincere questions:', no_insincere)
print('No. of sincere questions:', no_sincere)
print('% of insincere questions:', insincere_share)
# Null score: accuracy obtained by always predicting the majority (sincere) class.
print('Null score:', 1 - insincere_share)

No. of insincere questions: 80810
No. of sincere questions: 1225312
% of insincere questions: 0.06187017751787352
Null score: 0.9381298224821265


In [6]:
# Replace every run of characters that are not ASCII letters or apostrophes with a
# single space, then lower-case the result.
# Materialised as a list rather than a one-shot generator: the stop-word cell that
# consumes this takes minutes, and if it is interrupted and re-run a generator would
# resume half-drained (or exhausted) and silently yield too few rows.
clean_questions = [re.sub("[^A-Za-z']+", ' ', q).lower() for q in train['question_text']]

In [7]:
# English stop words held in a set: the filtering step tests membership once per
# token, and a set lookup is O(1) where a list would be scanned linearly each time.
# NOTE: this rebinds the name `stopwords`, shadowing the `nltk.corpus.stopwords`
# object imported at the top of the notebook; the corpus is therefore reached via
# `nltk.corpus` here so the cell can be re-run safely.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [8]:
%%time
# remove stop words and lower all characters
clean_questions = [' '.join(w for w in nltk.word_tokenize(q.lower()) if w not in stopwords) for q in clean_questions]

Wall time: 3min 11s


In [9]:
# Attach the cleaned text to the frame as a new column (this mutates `train` in place).
# list() materialises clean_questions in case it is still a lazy iterable.
train['clean_question'] = list(clean_questions)

In [10]:
# Check that the new clean_question column sits alongside the raw question_text.
train.head()

Unnamed: 0,qid,question_text,target,clean_question
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,quebec nationalists see province nation
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,adopted dog would encourage people adopt shop
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,velocity affect time velocity affect space geo...
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,otto von guericke used magdeburg hemispheres
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,convert montra helicon mountain bike changing ...


In [11]:
# Features are the cleaned question strings; labels are the 0/1 target column.
X, y = train['clean_question'], train['target']

In [12]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio the
# same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=495,
)

In [13]:
%%time
tokenizer = Tokenizer(num_words= 100000)
tokenizer.fit_on_texts(X_train) # assign unique feature number to each token.

Train_sequences = tokenizer.texts_to_sequences(X_train) # convert tokens in each question to feature numbers.
X_train_data = pad_sequences(Train_sequences, maxlen=50) # Make all questions have the same number of features

Test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_data = pad_sequences(Test_sequences, maxlen=50)


Wall time: 41.2 s


In [14]:
# Number of distinct words the tokenizer counted while fitting; this is the full
# vocabulary and is not limited by the num_words cap passed to Tokenizer.
len(tokenizer.word_counts)

163084

In [None]:
## Network architecture
# Sequential = a plain stack of layers; each layer feeds only the next one.
model = Sequential([
    # Dense, trainable word vectors: 100000 indices (matching the tokenizer's
    # num_words cap) -> 100-dimensional vectors, 50 indices per question.
    Embedding(100000, 100, input_length=50),
    # Dropout randomly zeroes units during training to limit overfitting and stop
    # individual units from over-specialising. `dropout` is applied to the
    # transformation of the inputs, `recurrent_dropout` to the transformation of
    # the recurrent state.
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    # Single sigmoid unit: the output is a score in (0, 1) for the positive class.
    Dense(1, activation='sigmoid'),
])

# binary_crossentropy: log loss for 0/1 targets, penalising predictions more the
# further they are from the true label.
# adam: keeps a per-parameter learning rate (helps with sparse NLP features) and
# adapts it from recent gradient history (helps with noisy gradients).
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

## Fit the model
# validation_split=0.4 makes Keras hold out the last 40% of the supplied rows
# (taken before shuffling) for validation; only the remaining 60% are trained on.
model.fit(X_train_data, y_train, epochs=3, validation_split=0.4)

Train on 626938 samples, validate on 417959 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
%%time
y_pred = model.predict(X_test_data, workers=6)

In [None]:
# Turn the sigmoid scores into hard 0/1 labels with a 0.5 threshold. Flattening
# first handles the (n, 1) prediction array; the result is a plain list of ints.
y_predRound = (np.asarray(y_pred).reshape(-1) >= 0.5).astype(int).tolist()

In [None]:
# Test-set accuracy plus per-class precision / recall / F1. The confusion matrix is
# left as the cell's last expression so the notebook displays it.
lstm_accuracy = accuracy_score(y_test, y_predRound)
print('accuracy', lstm_accuracy)
print(classification_report(y_test, y_predRound))
confusion_matrix(y_test, y_predRound)

### 1D convolutional layer 

In [None]:
%%time
def create_conv_model():
    model_conv = Sequential()
    model_conv.add(Embedding(100000, 100, input_length=50))
    model_conv.add(Dropout(0.2))
    model_conv.add(Conv1D(128, 10, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=4)) # Return max value from 4 adjacent values, reduce computational requirement and reduce overfitting.
    model_conv.add(LSTM(100))
    model_conv.add(Dense(1, activation='sigmoid'))
    model_conv.compile(loss='binary_crossentropy', optimizer='adam',    metrics=['accuracy'])
    return model_conv
model_conv = create_conv_model()
model_conv.fit(X_train_data, y_train, validation_split=0.4, epochs = 3)

# Reduced computation time to 1/4.

In [None]:
%%time
y_pred = model_conv.predict(X_test_data, workers=6)

In [None]:
# Threshold the sigmoid scores at 0.5 to get hard 0/1 labels. The (n, 1) prediction
# array is flattened first; the result is a plain list of ints.
y_predRound = (np.asarray(y_pred).reshape(-1) >= 0.5).astype(int).tolist()

In [None]:
# Same evaluation as for the LSTM model: accuracy, per-class report, and the
# confusion matrix as the displayed cell result.
conv_accuracy = accuracy_score(y_test, y_predRound)
print('accuracy', conv_accuracy)
print(classification_report(y_test, y_predRound))
confusion_matrix(y_test, y_predRound)

In [None]:

# Conv1D + LSTM classifier whose embedding layer is initialised from a fixed matrix.
# NOTE(review): `vocabulary_size` and `embedding_matrix` are not defined in any cell
# visible above, so this cell raises NameError on a fresh Restart & Run All. They must
# be created first — presumably from pre-trained GloVe vectors aligned with
# tokenizer.word_index, judging by the model name; confirm the source and that the
# matrix has shape (vocabulary_size, 100).
model_glove = Sequential()
# weights=[embedding_matrix] seeds the layer; trainable=False keeps it frozen during fit.
model_glove.add(Embedding(vocabulary_size, 100, input_length=50, weights=[embedding_matrix], trainable=False))
model_glove.add(Dropout(0.2))
# Smaller convolution than create_conv_model uses: 64 filters of width 5.
model_glove.add(Conv1D(64, 5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(100))
# Single sigmoid unit for the binary target.
model_glove.add(Dense(1, activation='sigmoid'))
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])