In [36]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import io
import zipfile
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import resample

import nltk
nltk.download('popular', quiet=True)
from nltk.corpus import stopwords
stop=set(stopwords.words('english'))
from nltk.tokenize import word_tokenize

from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM, Embedding, Dense, Dropout, Bidirectional
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant

In [3]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle competitions download -c quora-insincere-questions-classification

Downloading quora-insincere-questions-classification.zip to /content
100% 6.02G/6.03G [01:35<00:00, 86.4MB/s]
100% 6.03G/6.03G [01:35<00:00, 67.5MB/s]


In [5]:
!unzip /content/quora-insincere-questions-classification.zip

Archive:  /content/quora-insincere-questions-classification.zip
  inflating: embeddings.zip          
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [2]:
train_data = pd.read_csv('train.csv')

In [3]:
train_data.shape

(1306122, 3)

There are 1306122 records with 3 columns for each record

In [4]:
train_data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [5]:
train_data['target'].value_counts()

0    1225312
1      80810
Name: target, dtype: int64

The data is highly imbalanced with 1225312 questions being sincere and only about 80810 being insincere. Let's try downsampling the sincere questions.

In [6]:
sincere_qs = train_data[train_data['target'] == 0]
insincere_qs = train_data[train_data['target'] == 1]

In [7]:
print("sincere questions shape:", sincere_qs.shape, "\ninsincere questions shape:", insincere_qs.shape)

sincere questions shape: (1225312, 3) 
insincere questions shape: (80810, 3)


In [8]:
n_sincere = sincere_qs.shape[0]
n_insincere = insincere_qs.shape[0]

In [9]:
downsampled_sincere_qs = resample(sincere_qs, 
                                 replace=False,
                                 n_samples=int(0.5*n_sincere),     
                                 random_state=42) 

Now we combine our downsampled sincere questions with the insincere questions.

In [10]:
train_data = pd.concat([downsampled_sincere_qs, insincere_qs])

In [11]:
print('shape of train_data after downsampling:', train_data.shape)

shape of train_data after downsampling: (693466, 3)


In [12]:
len(train_data['target'].value_counts())

2

In [13]:
train_data.target.value_counts().sum()

693466

In [14]:
no_of_classes = len(train_data['target'].value_counts())
value_count_array = train_data.target.value_counts().to_numpy()
sample_size = train_data.target.value_counts().sum()
w_class = sample_size/(no_of_classes*value_count_array)
class_weight_dict = dict(zip(range(len(train_data['target'].value_counts())), w_class))

In [15]:
class_weight_dict

{0: 0.5659505497375362, 1: 4.290718970424453}

In [16]:
test_data = pd.read_csv('test.csv')

In [17]:
new_df = pd.concat([train_data,test_data])

In [18]:
print('new dataframe shape:',new_df.shape)

new dataframe shape: (1069272, 3)


In [19]:
def remove_URL(question):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'url',question)

def remove_punct(question):
    table = str.maketrans('','',string.punctuation)
    return question.translate(table)

def lower(question):
    words = question.split(" ")
    lower = " ".join([w.lower() for w in words])    
    return lower

In [20]:
new_df['question_text'] = new_df['question_text'].apply(remove_URL)
new_df['question_text'] = new_df['question_text'].apply(remove_punct)

In [21]:
new_df.head(5)

Unnamed: 0,qid,question_text,target
1022714,c86ab618f85e9b7fc374,Is sadism a coping mechanism for people who ar...,0.0
641364,7d9ea6d66b8866e69240,Is it possible for me as a soul to go outside ...,0.0
1225111,f01982d0cd06aba308ed,Do Pet Animal Rescue workers minimize shows of...,0.0
1130433,dd8a6b5452a407cea2ac,How do you identify a sonnet and what can we i...,0.0
1220402,ef30e73bf0a81a06ccf6,Is there a special place in hell for the likes...,0.0


In [22]:
def question_list(df):
    lst =[]
    for question in df['question_text']:
        qs=[text.lower() for text in word_tokenize(question)]
        qs=' '.join(qs)
        lst.append(qs)
    return lst   

In [23]:
questions=question_list(new_df)

In [24]:
questions[:5]

['is sadism a coping mechanism for people who are extremely affected by the suffering of others',
 'is it possible for me as a soul to go outside of this body and observe the universe',
 'do pet animal rescue workers minimize shows of affection so that the animal in question doesnt become attached to them before getting adopted',
 'how do you identify a sonnet and what can we infer from it and suggest',
 'is there a special place in hell for the likes of genghiz khan']

In [25]:
#form a dictionary of word with it's vector representation
embedding_kv={}
with zipfile.ZipFile("/content/embeddings.zip") as zfil:
    with io.TextIOWrapper(zfil.open("glove.840B.300d/glove.840B.300d.txt"), encoding="utf-8") as embd:
        for x in embd:
            res=x.split(' ') 
            k,vec=res[0],res[1:]
            v=np.asarray(vec,'float32')
            embedding_kv[k]=v

In [26]:
len(embedding_kv)

2196016

In [27]:
tokenizer = Tokenizer()

In [28]:
tokenizer.fit_on_texts([questions])
seq=tokenizer.texts_to_sequences(questions)
question_pad=pad_sequences(seq,maxlen=55,truncating='post',padding='post')

In [29]:
question_index=tokenizer.word_index
print('unique words:',len(question_index))

unique words: 1069117


In [30]:
num_words=len(question_index)+1
mat_embed=np.zeros((num_words,300))

In [31]:
mat_embed.shape

(1069118, 300)

In [32]:
oov = np.random.uniform(-1,1,size=300).astype('float32')
oov = oov.reshape(1,300)

In [33]:
for word, index in tqdm(question_index.items()):    
    if index > num_words:
        continue
    emb_vec=embedding_kv.get(word)
    if emb_vec is not None:
        mat_embed[index]=emb_vec
    else:
        mat_embed[index]=oov

100%|██████████| 1069117/1069117 [00:02<00:00, 512067.09it/s]


In [34]:
question_pad.shape

(1069272, 55)

In [35]:
np.save('embedding_v11.npy', mat_embed)
np.save('text_pad_v11.npy', question_pad)

In [37]:
train=question_pad[:train_data.shape[0]]
test=question_pad[train_data.shape[0]:]

X_train,X_test,y_train,y_test=train_test_split(train,train_data['target'].values,test_size=0.2,stratify=train_data['target'].values,random_state=40)

print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

Shape of train (554772, 55)
Shape of Validation  (138694, 55)


In [38]:
import keras.backend as K
from sklearn.metrics import f1_score

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [41]:
no_of_classes = len(train_data['target'].value_counts())
value_count_array = train_data.target.value_counts().to_numpy()
sample_size = train_data.target.value_counts().sum()
w_class = sample_size/(no_of_classes*value_count_array)
class_weight_dict = dict(zip(range(len(train_data['target'].value_counts())), w_class))

In [42]:
class_weight_dict

{0: 0.5659505497375362, 1: 4.290718970424453}

In [39]:
from keras.layers import Flatten, Input, concatenate, GlobalMaxPooling1D
from keras.models import Model

vec_input = Input(shape=(55,))
embedding = Embedding(num_words,300,embeddings_initializer=Constant(mat_embed),
                   input_length=55,trainable=False)(vec_input)
bidir1 = (LSTM(128, dropout=0.2, recurrent_dropout=0.15, return_sequences=True))(embedding)
bidir2 = (LSTM(128, dropout=0.2, recurrent_dropout=0.15, return_sequences=True))(bidir1)

x = concatenate([bidir1, bidir2])
x = GlobalMaxPooling1D()(x)
output = Dense(1, activation='sigmoid')(x)
model = Model(inputs=vec_input, outputs=output)

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy',f1_m])
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 55)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 55, 300)      320735400   ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 55, 128)      219648      ['embedding[0][0]']              
                                                                                                  
 lstm_1 (LSTM)                  (None, 55, 128)      131584      ['lstm[0][0]']                   
                                                                                              

In [None]:
history=model.fit(X_train[:100000],y_train[:100000],
                  batch_size=128,
                  epochs=5,
                  validation_data=(X_test[:100000],y_test[:100000]),
                  class_weight=class_weight_dict)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
