Most of the work in this notebook has been picked up from Sudalai Rajkumar's kernels on [embeddings](https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings) and [exploration](https://www.kaggle.com/sudalairajkumar/simple-exploration-notebook-qiqc). I have just tried to make them more beginner friendly. I'll be putting up more detailed kernels around preprocessing for the same problem, picking up ideas from some other kernels on this problem and adding some of my own.

Happy Learning!!

#### This notebook is a work in progress!

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
import matplotlib
import cufflinks as cf
import plotly
import plotly.offline as py
import plotly.graph_objs as go
from tqdm import tqdm
tqdm.pandas()

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


# Data Wrangling and Exploration

In [None]:
!unzip ../input/quora-insincere-questions-classification/embeddings.zip

In [None]:
# Load the competition's training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/quora-insincere-questions-classification/train.csv',
        '/kaggle/input/quora-insincere-questions-classification/test.csv',
    )
)

In [None]:
# Report the (rows, columns) shape of each DataFrame.
for label, frame in (("Train set shape : ", train), ("Test set shape : ", test)):
    print(label, frame.shape)

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# How many questions carry each target value.
count_targets = train['target'].value_counts()

# Bar chart: one bar per target value; bar height and bar colour are both driven by the count.
trace = go.Bar(
    x=count_targets.index,
    y=count_targets,
    marker={'color': count_targets.values},
)
layout = go.Layout(title='Target counts', font={'size': 12})

# Assemble the figure from the trace list and the layout, then render it inline.
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="TargetCount")


# Pie chart of the same class counts, expressed as a percentage of all questions.

# Slice labels are the distinct target values; slice sizes are each count's share in percent.
labels = np.array(count_targets.index)
proportions = np.array(count_targets / count_targets.sum() * 100)

trace = go.Pie(labels=labels, values=proportions)
layout = go.Layout(
    title="Target proportion pie",
    font={'size': 12},
    width=600,
    height=600,
)

# Assemble the figure and render it inline.
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="usertype")

Our dataset is highly imbalanced.

In [None]:
from nltk.corpus import stopwords
# Collect NLTK's English stop-word list into a set.
stop_words = set()
stop_words.update(stopwords.words('english'))

In [None]:
def count_words(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


# Per-question word count, stored as a new column on both frames.
train['num_words'] = train['question_text'].apply(count_words)
test['num_words'] = test['question_text'].apply(count_words)

In [None]:
# Largest and average per-question word count in the training data.
maximum = train['num_words'].max()
mean = train['num_words'].mean()

In [None]:
# Show the word-count statistics; the average is formatted to two decimals.
average_line = "Average no. of words : {:.2f}".format(mean)
print("Maximum no. of words : ", maximum)
print(average_line)

In [None]:
# Quartiles and 99th percentile of the per-question word count.
train['num_words'].quantile(q=[0.25, 0.5, 0.75, 0.99])

In [None]:
def count_unique_words(text):
    """Return the number of distinct whitespace-separated tokens in ``str(text)``."""
    return len(set(str(text).split()))


# Per-question count of distinct words, stored as a new column on both frames.
train['num_unique_words'] = train['question_text'].apply(count_unique_words)
test['num_unique_words'] = test['question_text'].apply(count_unique_words)

In [None]:
# Largest and average per-question distinct-word count in the training data.
maximum_unique = train['num_unique_words'].max()
mean_unique = train['num_unique_words'].mean()

In [None]:
# These are statistics of the *unique*-word counts; label them as such so the output
# cannot be confused with the plain word-count statistics printed earlier.
print("Maximum no. of unique words : ", maximum_unique)
print("Average no. of unique words : {:.2f}".format(mean_unique))

In [None]:
# Quartiles and 99th percentile of the per-question distinct-word count.
train['num_unique_words'].quantile(q=[0.25, 0.5, 0.75, 0.99])

In [None]:
# Cap both counts at 50 before plotting, since larger values are very rare in the dataset.
# clip() builds the capped column and assigns it back to the frame in one step. The previous
# chained form (train[col].loc[mask] = 50) assigns into an intermediate Series, which raises
# SettingWithCopyWarning and leaves `train` unchanged when pandas Copy-on-Write is active.
train['num_words'] = train['num_words'].clip(upper=50)
train['num_unique_words'] = train['num_unique_words'].clip(upper=50)

In [None]:
# Two stacked panels: word count (top) and unique-word count (bottom), split by target.
f, axes = plt.subplots(2, 1, figsize=(10, 10))
ax_words, ax_unique = axes

sns.boxplot(data=train, x='target', y='num_words', ax=ax_words)
ax_words.set_xlabel('Target', fontsize=10)
ax_words.set_title('Number of words in each class', fontsize=12)

sns.boxplot(data=train, x='target', y='num_unique_words', ax=ax_unique)
ax_unique.set_xlabel('Target', fontsize=10)
ax_unique.set_title('Number of unique words in each class', fontsize=12)


Ideas around visualization using plotly have been taken from this amazing [kernel](https://www.kaggle.com/sudalairajkumar/simple-exploration-notebook-qiqc) by Sudalai Rajkumar (@srk).

# Model Building

In [None]:
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Dense, Activation, GRU, Conv1D, Activation, Dropout, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Hold out 15% of the training rows for validation; the fixed random_state keeps the
# split the same from run to run.
train_df, val_df = train_test_split(
    train,
    test_size=0.15,
    random_state=1000,
)

In [None]:
# Dimensionality of each word vector.
embed_size = 300

# Vocabulary cap handed to the tokenizer (its num_words argument).
max_features = 50_000

# Fixed length every question is padded to; this is also the network's input length.
maxlen = 100

In [None]:
# Pull the question text out of each frame as an array, replacing any missing
# value with the placeholder "_na_".
train_X, val_X, test_X = (
    frame['question_text'].fillna("_na_").values
    for frame in (train_df, val_df, test)
)

In [None]:
# Build the word -> integer-id vocabulary from the training questions only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_X.tolist())

In [None]:
# Replace every question with its sequence of integer token ids, using the fitted tokenizer.
train_X, val_X, test_X = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_X, val_X, test_X)
)

In [None]:
# Bring every sequence to the same length.
# Remember: the network input is exactly `maxlen` (100) ids long, so each sequence is
# passed through pad_sequences with that maxlen.
train_X, val_X, test_X = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (train_X, val_X, test_X)
)

In [None]:
# Label arrays for the training and validation splits.
train_y, val_y = (frame['target'].values for frame in (train_df, val_df))

### GloVe 

In [None]:
# Path of the GloVe text file, relative to the working directory.
embeddings = 'glove.840B.300d/glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Pair a word with its vector.

    Parameters
    ----------
    word : the token itself (first field of an embeddings line).
    *arr : the remaining fields, converted together to one float32 array.

    Returns
    -------
    tuple of (word, numpy float32 array built from ``arr``).
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

In [None]:
# Parse the embeddings text file into {word: vector}. Each line is split on single spaces:
# the first field is the word, the remaining fields are the vector components.
# The context manager closes the file once parsing is done (a bare open() left it open),
# and the encoding is stated explicitly instead of relying on the platform default.
with open(embeddings, encoding='utf-8') as embeddings_file:
    embeddings_index = dict(get_coefs(*line.split(" ")) for line in embeddings_file)

In [None]:
# Stack every pretrained vector into one 2-D array to measure the overall mean and std.
# np.stack requires a real sequence (list/tuple); a dict view is not one, so materialise it.
embs = np.stack(list(embeddings_index.values()))
embs_mean, embs_std = embs.mean(), embs.std()
# Length of a single pretrained vector (second axis of the stacked array).
embs_shape = embs.shape[1]

In [None]:
# Word -> integer id mapping built by the tokenizer.
word_index = tokenizer.word_index
# Keras token ids start at 1 (0 is the padding value), so ids 0..len(word_index) each need
# a row: that is len(word_index) + 1 rows, capped at max_features.
nb_words = min(max_features, len(word_index) + 1)
# Start every row as random noise drawn with the pretrained vectors' overall mean and std;
# shape is (nb_words, embed_size).
embedding_matrix = np.random.normal(embs_mean, embs_std, (nb_words, embed_size))

In [None]:
# Copy the pretrained vector of each vocabulary word into that word's row of the matrix.
# Words with no pretrained vector keep the random row they were initialised with.
# (The lookup and assignment used to be indented under `continue`, making them unreachable,
# so no pretrained vector was ever copied in.)
for word, i in word_index.items():
    # Ids without a row in the matrix are outside the kept vocabulary; skip them.
    if i >= embedding_matrix.shape[0]:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Building the Network

In [None]:
# Bidirectional-GRU binary classifier whose embedding layer starts from embedding_matrix.
inp = Input(shape=(maxlen,))  # one row of `maxlen` token ids per question
# Take the number of rows from the matrix itself so the layer's size always agrees with
# the initial weights it is given.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(GRU(64, return_sequences=True))(x)  # keep per-timestep outputs for pooling
x = GlobalMaxPool1D()(x)  # max over the time axis, per feature
x = Dense(16, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(1, activation='sigmoid')(x)  # single sigmoid output per question
model = Model(inputs=inp, outputs=x)
# `metrics` is documented as a list; a bare string is rejected by some Keras versions.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print() only
# adds a stray "None" line to the output.
model.summary()

Details on binary cross-entropy and on choosing a decision threshold can be found in this [paper](https://arxiv.org/abs/1402.1892).

In [None]:
# Train for 5 epochs in mini-batches of 1024, passing the held-out split as validation data.
model.fit(
    train_X,
    train_y,
    batch_size=1024,
    epochs=5,
    validation_data=(val_X, val_y),
)

In [None]:
# Score the validation questions once, then sweep the decision threshold from 0.10 to 0.50
# in steps of 0.01 and print the F1 score obtained at each threshold.
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_labels = (pred_glove_val_y > thresh).astype(int)
    score = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, score))

In [None]:
# Model outputs for the test questions, computed in batches of 1024.
pred_glove_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)