# **Quora Insincere Questions Classification**

Using deep learning and NLP.

In [None]:
import numpy as np
import pandas as pd
import re
import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
import keras
from keras.models import Sequential
from keras.layers import Embedding,Bidirectional,LSTM,Dropout,Conv1D,MaxPooling1D,Dense
from keras import callbacks 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import plotly.offline as py
import plotly.graph_objs as go

In [None]:
# Load the competition train/test CSVs into DataFrames.
# NOTE(review): the two paths use different roots ('../input' vs '/kaggle/input');
# they only point at the same dataset directory when the working directory sits
# directly under '/kaggle' — confirm, and consider using a single root.
train_data = pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
test_data = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/test.csv')

In [None]:
test_data.head()

### **Check for missing values:**

In [None]:
train_data.isnull().sum()

### **Check for empty strings:**

In [None]:
# Collect the index of every row whose question text is empty or whitespace-only.
# str.isspace() is False for the empty string, so strip() is used to catch both
# '' and all-whitespace values; non-string cells (e.g. NaN) are skipped.
blanks = [
    row.Index
    for row in train_data.itertuples()
    if isinstance(row.question_text, str) and not row.question_text.strip()
]

print(len(blanks), 'blanks: ', blanks)

In [None]:
train_data.head()

In [None]:
print("Train set shape : ",train_data.shape)
print("Test set shape : ",test_data.shape)

In [None]:
# Count how many rows fall under each distinct target value
count_targets = train_data['target'].value_counts()

# Bar chart: one bar per target value, bar height = row count, bars coloured by count
trace = go.Bar(x = count_targets.index, y = count_targets, marker = dict(color = count_targets.values))
# Title and font size for the bar chart
layout = go.Layout(title = 'Target counts', font = dict(size=12))

data = [trace] 
fig = go.Figure(data = data, layout = layout) # combine the trace and the layout into one figure
py.iplot(fig, filename = "TargetCount") # draw the bar chart


# Pie chart of the same counts, expressed as percentages

labels = (np.array(count_targets.index)) # the distinct target values
# share of each target value as a percentage of all rows
proportions = (np.array((count_targets/count_targets.sum())*100)) 

# Pie trace: one slice per target value, sized by its percentage
trace = go.Pie(labels = labels, values = proportions)
layout = go.Layout(                       
    title = "Target proportion pie",     # 600x600 figure, font size 12
    font = dict(size = 12),
    width = 600,
    height = 600)

data = [trace]
fig = go.Figure(data = data, layout = layout) 
py.iplot(fig, filename = "usertype")  # draw the pie chart

### Clean Text

In [None]:
# Load spaCy's English pipeline with the parser, tagger and NER components disabled.
# NOTE(review): the 'en' shortcut name is only accepted by spaCy 2.x; newer releases
# need the full package name (e.g. 'en_core_web_sm') — confirm the pinned version.
nlp = spacy.load('en',disable=['parser', 'tagger','ner'])

# Raise the maximum text length the pipeline accepts.
# NOTE(review): the origin of this specific value is not shown here — confirm.
nlp.max_length = 16981599

In [None]:
# Punctuation/symbol tokens that are filtered out of the tokenized questions.
# Each symbol is listed exactly once.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';',
          "'", '&', '/', '[', ']', '>', '<', '%', '=', '#', '+',
          '\\', '§', '″', '′', '¿', '═', '$', '^', '*', '@', '_',
          '`', '{', '}', '~']

In [None]:
def replace_numbers(sentence):
    """Mask runs of ASCII digits in *sentence* with '#' characters.

    A run of two, three or four digits becomes the same number of '#';
    a run of five or more digits collapses to exactly '#####'.
    Isolated single digits are left unchanged.
    """
    def _mask(match):
        # Cap the mask at five characters regardless of how long the run is.
        return '#' * min(len(match.group(0)), 5)

    return re.sub('[0-9]{2,}', _mask, sentence)

## Stemming

In [None]:
s_stemmer = SnowballStemmer(language='english')

## Stop words

In [None]:
# Stop-word collection shipped with the loaded spaCy language defaults
stop_words = nlp.Defaults.stop_words

# Show the words themselves, then how many there are
print(stop_words)
print(len(stop_words))

### Splitting data

In [None]:
# Mask multi-digit numbers in the raw question text and keep the labels aside.
train_input = train_data['question_text'].apply(replace_numbers)
train_label = train_data['target']

test_input  = test_data['question_text'].apply(replace_numbers)

# Set copy of the punctuation list so each membership test is a hash lookup.
_punct_lookup = set(puncts)


def _clean_tokens(question):
    """Tokenize *question*, drop punctuation and stop words, and stem the rest.

    Tokens are lower-cased before the stop-word lookup, so capitalised stop
    words (e.g. a sentence-initial "What") are removed as well.

    Returns:
        A list of lower-cased, stemmed token strings.
    """
    cleaned = []
    for token in nlp(question):
        text = token.text
        if text in _punct_lookup:
            continue
        lowered = text.lower()
        if lowered in stop_words:
            continue
        cleaned.append(s_stemmer.stem(lowered))
    return cleaned


train_input_rsw_punc_stem = [_clean_tokens(question) for question in train_input]

test_input_rsw_punc_stem  = [_clean_tokens(question) for question in test_input]

In [None]:
print(train_label.value_counts())

In [None]:
train_input[0]

In [None]:
train_input_rsw_punc_stem[0]

In [None]:
test_input_rsw_punc_stem[0]

# KerasX

### Keras Tokenization

In [None]:
# Integer-encode the token lists.
# The vocabulary is fitted exactly once, on the training questions, *before* any
# text is converted: every call to fit_on_texts() rebuilds word_index from the
# accumulated counts, so fitting again after the training set has been encoded
# would give the test set a different word -> integer mapping than the one the
# model is trained on. Test tokens missing from the training vocabulary are dropped.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_input_rsw_punc_stem)

sequences = tokenizer.texts_to_sequences(train_input_rsw_punc_stem)
sequences_test = tokenizer.texts_to_sequences(test_input_rsw_punc_stem)

## Padding

In [None]:
max_length = 55

In [None]:
# Bring every encoded question to max_length entries; padding='post' places the
# padding after the real tokens.
train_input_padded = pad_sequences(sequences, maxlen=max_length, padding='post')

test_input_padded = pad_sequences(sequences_test, maxlen=max_length, padding='post')

# Sanity-check the resulting shapes
print(train_input_padded.shape)
print(test_input_padded.shape)

In [None]:
tokenizer.index_word

In [None]:
tokenizer.word_counts

In [None]:
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

### Convert to Numpy Matrix

In [None]:
# Convert the padded inputs to NumPy arrays.
# NOTE(review): pad_sequences is documented to return a NumPy array already, so
# these conversions look redundant — confirm before removing.
train_input_padded = np.array(train_input_padded)

test_input_padded = np.array(test_input_padded)

In [None]:
train_input_padded

In [None]:
test_input_padded

In [None]:
# Hold out 10% of the padded training questions as a validation set (fixed seed).
# NOTE(review): the split is not stratified on the label — consider
# stratify=train_label if the classes are imbalanced.
x_train,x_val,y_train,y_val = train_test_split(train_input_padded, train_label, test_size=0.1, random_state=42)

# Creating an LSTM based model

In [None]:
def create_model(vocabulary_size, max_length):
    """Build, compile and summarise the question classifier.

    Layer stack: 300-d trainable embedding -> bidirectional LSTM (256 units,
    full sequence output) -> dropout 0.2 -> Conv1D (100 filters, width 5, ReLU)
    -> max-pooling (4) -> LSTM (128) -> dropout 0.4 -> single sigmoid unit.

    Args:
        vocabulary_size: First argument passed to the Embedding layer.
        max_length: Passed to the Embedding layer as ``input_length``.

    Returns:
        The compiled Sequential model; its summary is printed as a side effect.
    """
    layers = [
        Embedding(vocabulary_size, 300, input_length=max_length, trainable=True),
        Bidirectional(LSTM(256, return_sequences=True)),
        Dropout(0.2),
        Conv1D(100, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(128),
        Dropout(0.4),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)

    # Binary cross-entropy with Adam; accuracy is tracked during training.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
model = create_model(vocabulary_size + 1, max_length)

## Fit (Train) the Model

In [None]:
# Early stopping with a patience of 2 epochs, plus a checkpoint that writes only
# the weights to './w.h5' and only when the monitored value improves.
es = callbacks.EarlyStopping(patience=2)
mc = callbacks.ModelCheckpoint('./w.h5', save_best_only=True, save_weights_only=True)

# Train for up to 128 epochs, validating on the held-out split after each one.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=128,
    batch_size=1024,
    verbose=1,
    callbacks=[es, mc],
)

In [None]:
def plot_loss_acc(history_dict):
    """Plot training vs. validation loss, then training vs. validation accuracy.

    Args:
        history_dict: Mapping holding per-epoch sequences under the keys
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    train_acc = history_dict['accuracy']
    valid_acc = history_dict['val_accuracy']
    train_loss = history_dict['loss']
    valid_loss = history_dict['val_loss']

    # Epochs are numbered from 1 on the x-axis.
    epoch_axis = range(1, len(train_acc) + 1)

    def _draw(train_values, valid_values, train_name, valid_name, title, ylabel,
              legend_kwargs, y_limits=None):
        # One figure: training curve in red, validation curve in blue.
        plt.plot(epoch_axis, train_values, 'r', label=train_name)
        plt.plot(epoch_axis, valid_values, 'b', label=valid_name)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(ylabel)
        plt.legend(**legend_kwargs)
        if y_limits is not None:
            plt.ylim(y_limits)
        plt.show()

    _draw(train_loss, valid_loss, 'Training loss', 'Validation loss',
          'Training and validation loss', 'Loss', {})

    # The accuracy figure pins the legend bottom-right and clips the y-axis to 0.5-1.
    _draw(train_acc, valid_acc, 'Training acc', 'Validation acc',
          'Training and validation accuracy', 'Accuracy',
          {'loc': 'lower right'}, (0.5, 1))
    
plot_loss_acc(history.history)

# Evaluating Model Performance

In [None]:
model.evaluate(x_val,y_val)

In [None]:
# Reload the weights saved to './w.h5' and evaluate on the validation split again,
# so this result can be compared with the evaluation of the final-epoch weights.
model.load_weights('./w.h5')
model.evaluate(x_val,y_val)

In [None]:
# Sequential.predict_classes() is not available in recent Keras/TensorFlow releases;
# for a single sigmoid output, thresholding predict() at 0.5 gives the same 0/1 labels.
predictions = (model.predict(x_val) > 0.5).astype("int32")

In [None]:
predictions

In [None]:
confusion_matrix(y_val,predictions)

In [None]:
print(classification_report(y_val,predictions))

In [None]:
# Sequential.predict_classes() is not available in recent Keras/TensorFlow releases;
# for a single sigmoid output, thresholding predict() at 0.5 gives the same 0/1 labels.
predictions_test = (model.predict(test_input_padded) > 0.5).astype("int32")

In [None]:
predictions_test.shape

In [None]:
# Convert the test predictions to integer 0/1 flags.
# NOTE(review): predictions_test already holds 0/1 class labels, so comparing with
# 0.9 leaves them unchanged — confirm whether a 0.9 cut-off on predicted
# probabilities was the actual intent.
test_y = (predictions_test>0.9).astype(int)

In [None]:
test_y.shape

In [None]:
# Build the submission table: one row per test question id with its predicted label.
submission = pd.DataFrame({"qid":test_data["qid"].values})
submission["prediction"] = test_y

In [None]:
submission["prediction"].value_counts()

In [None]:
submission.to_csv("submission.csv", index=False)