In [None]:
# Mount Google Drive into the Colab filesystem; the data files read later in
# this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import time
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

import tensorflow as tf
import spacy

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Read the preprocessed training set from Drive and preview the first rows.
train_clean = pd.read_csv(
    '/content/drive/MyDrive/ML Project - Quora/Data/train_clean.csv'
)
train_clean.head(5)

Unnamed: 0.1,Unnamed: 0,question_text,target,clean_text
0,0,How did Quebec nationalists see their province...,0,quebec nationalist see province nation
1,1,"Do you have an adopted dog, how would you enco...",0,adopted dog would encourage people adopt shop
2,2,Why does velocity affect time? Does velocity a...,0,velocity affect time velocity affect space geo...
3,3,How did Otto von Guericke used the Magdeburg h...,0,otto von guericke used magdeburg hemisphere
4,4,Can I convert montra helicon D to a mountain b...,0,convert montra helicon mountain bike changing ...


In [None]:

# Remove the raw question column; the `clean_text` column stays as the model input.
train_clean = train_clean.drop(columns=['question_text'])
train_clean.head(5)

Unnamed: 0.1,Unnamed: 0,target,clean_text
0,0,0,quebec nationalist see province nation
1,1,0,adopted dog would encourage people adopt shop
2,2,0,velocity affect time velocity affect space geo...
3,3,0,otto von guericke used magdeburg hemisphere
4,4,0,convert montra helicon mountain bike changing ...


In [None]:
# Resample the frame to twice its size. Sampling is done with replacement, so
# the result contains repeated rows that share the same index label.
train_clean = train_clean.sample(frac=2, replace=True, random_state=1)

# Drop the leftover CSV index columns by name. Dropping "whatever column is
# first" removed a different column on every run of this cell (and would
# eventually remove `target`); dropping by name with errors='ignore' always
# leaves exactly `target` and `clean_text`, however often the cell is re-run.
train_clean = train_clean.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
train_clean.head()

Unnamed: 0,target,clean_text
128037,0,trade policy place manage trade
491755,0,year day invented year day
470924,0,observe universe completely rest point space a...
491263,0,would possible bombard radioactive material ne...
836489,0,laptop suitable ece student


In [None]:
# Sanity check: (rows, columns) of the resampled frame.
print(train_clean.shape)

(2612244, 2)


In [None]:
# train_clean was resampled with replacement, so the same question can occur
# several times under the same index label. Splitting the rows directly would
# put copies of one question on both sides of the split and inflate the
# validation score. Split the *unique* row labels instead, then assign every
# copy of a row to the side its label landed on.
row_labels = train_clean.index.unique().to_numpy()
train_labels, val_labels = train_test_split(row_labels, test_size=0.07, random_state=1000)

in_val = train_clean.index.isin(val_labels)
train = train_clean[~in_val]
val = train_clean[in_val]
# Keep a single copy of each validation row so metrics are not weighted by
# how often a row happened to be drawn during resampling.
val = val[~val.index.duplicated(keep='first')]

# Missing texts become a single space so the tokenizer always receives a string.
train_X = train["clean_text"].fillna(' ')
val_X = val["clean_text"].fillna(' ')

# Vocabulary cap handed to the tokenizer (`num_words`).
max_features = 90000

# Fit the vocabulary on the training texts only, then map both splits to
# integer sequences with that vocabulary.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)

In [None]:
# Bring every token sequence to exactly `maxlen` entries; short sequences are
# padded at the end ('post').
maxlen = 55

train_X, val_X = (
    pad_sequences(sequences, maxlen=maxlen, padding='post')
    for sequences in (train_X, val_X)
)

In [None]:
# Label arrays for the two splits, taken from the `target` column.
train_y, val_y = (split['target'].values for split in (train, val))

In [None]:
# Display the padded training matrix (one row per question).
train_X

array([[  29, 1185, 5736, ...,    0,    0,    0],
       [ 178, 1943,  629, ...,    0,    0,    0],
       [ 518,  191,  168, ...,    0,    0,    0],
       ...,
       [4182, 3321, 2037, ...,    0,    0,    0],
       [  97,  145,   15, ...,    0,    0,    0],
       [8810,  353,    4, ...,    0,    0,    0]], dtype=int32)

In [None]:
# Load pre-trained GloVe vectors into a {word: vector} dictionary.
# https://github.com/stanfordnlp/GloVe
# https://nlp.stanford.edu/pubs/glove.pdf

EMBEDDING_FILE = '/content/drive/MyDrive/ML Project - Quora/Data/glove.42B.300d/glove.42B.300d.txt' # define embedding file path

# Dimension of the vectors in the file above (glove.42B.300d).
embed_size = 300
embedding_index = dict()

# The file is opened exactly once, inside `with`, so the handle is closed
# automatically; no separate open()/close() calls are needed.
with open(EMBEDDING_FILE, 'r', encoding="utf-8") as f:
  for line in f:
    # Each line is "<word> <v1> <v2> ...", separated by whitespace.
    values = line.split()
    word = values[0]
    coef = np.asarray(values[1:],dtype='float32')
    embedding_index[word] = coef


In [None]:
import gc

# Free unreferenced objects before materialising every vector as one array.
gc.collect()

# Stack all loaded vectors and take the mean / standard deviation over every
# coefficient.
all_embs = np.stack([vector for vector in embedding_index.values()])
emb_mean = all_embs.mean()
emb_std = all_embs.std()
print(emb_mean, ' ', emb_std)

0.005720101   0.2951066


In [None]:
# One row per entry in the tokenizer vocabulary, one column per embedding
# dimension. Rows start as draws from a normal distribution with the mean and
# standard deviation measured over the loaded vectors.
nb_words = len(tokenizer.word_index)
embedding_matrix_glove = np.random.normal(
    loc=emb_mean,
    scale=emb_std,
    size=(nb_words, all_embs.shape[1]),
)

In [None]:
# Copy the pre-trained vector of every vocabulary word into its row of the
# embedding matrix; words without a pre-trained vector keep their random row.
#
# The loop variable must be `word`: the previous spelling (`work`) left the
# lookup below reading a stale `word` from an earlier cell, so every row
# received the same vector.
row_limit = min(max_features, embedding_matrix_glove.shape[0])
for word, i in tokenizer.word_index.items():
  # Skip indices beyond the vocabulary cap or beyond the matrix itself.
  if i >= row_limit: continue
  embedding_vector = embedding_index.get(word)
  if embedding_vector is not None:
    embedding_matrix_glove[i] = embedding_vector

In [None]:
embedding_matrix_glove.shape

(153710, 300)

In [None]:
batch_size = 256

def batch_gen(train_df):
    """Endlessly yield shuffled (texts, targets) mini-batches from a frame.

    Args:
        train_df: DataFrame with a ``clean_text`` column (the model input)
            and a ``target`` column (the label).

    Yields:
        Tuples ``(text_arr, targets)``: ``text_arr`` is a NumPy array built
        from ``text_to_array(text)`` for each text in the batch, ``targets``
        is a NumPy array with the matching labels. The frame is reshuffled
        at the start of every pass.

    An empty frame yields nothing (instead of looping forever).
    """
    n_batches = math.ceil(len(train_df) / batch_size)
    if n_batches == 0:
        # Nothing to iterate over; without this guard the loop below would
        # spin forever without ever yielding.
        return
    while True:
        shuffled = train_df.sample(frac=1.)  # Shuffle the data.
        for i in range(n_batches):
            batch = shuffled.iloc[i * batch_size:(i + 1) * batch_size]
            # Select columns by name: a positional column index silently
            # picks the wrong column when the frame has extra columns.
            text_arr = np.array([text_to_array(text) for text in batch["clean_text"]])
            yield text_arr, np.array(batch["target"])

In [None]:
# Input: one padded sequence of `maxlen` token ids per example.
inp = Input(shape=(maxlen, ))

# Frozen embedding layer initialised from the GloVe matrix. The layer size is
# read from the matrix itself so the two cannot drift apart.
x = Embedding(
    embedding_matrix_glove.shape[0],
    embedding_matrix_glove.shape[1],
    weights=[embedding_matrix_glove],
    trainable=False,
)(inp)
x = SpatialDropout1D(0.3)(x)

# Two stacked bidirectional recurrent layers; both return the full sequence
# so each can be max-pooled over time.
x1 = Bidirectional(CuDNNLSTM(256, return_sequences=True))(x)
x2 = Bidirectional(CuDNNGRU(128, return_sequences=True))(x1)
max_pool1 = GlobalMaxPooling1D()(x1)
max_pool2 = GlobalMaxPooling1D()(x2)

# Classification head. The previous single-unit ReLU layer between the hidden
# layer and the output squeezed all features into one non-negative scalar;
# when that unit is inactive it outputs 0 and passes no gradient, leaving a
# constant prediction. The sigmoid output now reads the 256 features directly.
conc = Concatenate()([max_pool1, max_pool2])
conc = Dense(256, activation="relu")(conc)
predictions = Dense(1, activation='sigmoid')(conc)

In [None]:
from keras.optimizers import SGD
from tensorflow.keras.optimizers.legacy import Adam

# `learning_rate` replaces the deprecated `lr` keyword (which emitted a
# deprecation warning).
# NOTE(review): `opt` and the `Adam` import are not used by compile() below,
# which selects the optimizer by the string 'adam'; pass `optimizer=opt` if
# SGD was the intended optimizer.
opt = SGD(learning_rate=0.01)

# Binary classifier: sigmoid output trained with binary cross-entropy.
model = Model(inputs=inp, outputs=predictions)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 55)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 55, 300)      46113000    ['input_1[0][0]']                
                                                                                                  
 spatial_dropout1d (SpatialDrop  (None, 55, 300)     0           ['embedding[0][0]']              
 out1D)                                                                                           
                                                                                                  
 bidirectional (Bidirectional)  (None, 55, 512)      1142784     ['spatial_dropout1d[0][0]']  

  super().__init__(name, **kwargs)


In [None]:
%%time
model.fit(train_X, train_y, batch_size = 256, epochs=20, steps_per_epoch=1000, validation_data=(val_X,val_y), verbose=2, shuffle=True)

Epoch 1/20
1000/1000 - 78s - loss: 0.5269 - accuracy: 0.9374 - val_loss: 0.4048 - val_accuracy: 0.9375 - 78s/epoch - 78ms/step
Epoch 2/20
1000/1000 - 67s - loss: 0.3385 - accuracy: 0.9390 - val_loss: 0.2936 - val_accuracy: 0.9375 - 67s/epoch - 67ms/step
Epoch 3/20
1000/1000 - 69s - loss: 0.2692 - accuracy: 0.9378 - val_loss: 0.2525 - val_accuracy: 0.9375 - 69s/epoch - 69ms/step
Epoch 4/20
1000/1000 - 69s - loss: 0.2423 - accuracy: 0.9383 - val_loss: 0.2382 - val_accuracy: 0.9375 - 69s/epoch - 69ms/step
Epoch 5/20
1000/1000 - 69s - loss: 0.2353 - accuracy: 0.9378 - val_loss: 0.2344 - val_accuracy: 0.9375 - 69s/epoch - 69ms/step
Epoch 6/20
1000/1000 - 69s - loss: 0.2308 - accuracy: 0.9387 - val_loss: 0.2338 - val_accuracy: 0.9375 - 69s/epoch - 69ms/step
Epoch 7/20
1000/1000 - 69s - loss: 0.2314 - accuracy: 0.9384 - val_loss: 0.2338 - val_accuracy: 0.9375 - 69s/epoch - 69ms/step
Epoch 8/20
1000/1000 - 69s - loss: 0.2311 - accuracy: 0.9385 - val_loss: 0.2338 - val_accuracy: 0.9375 - 69s/ep

<keras.callbacks.History at 0x7ff6cd92b760>