In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found there
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# All competition inputs live in one dataset folder; build each path from that shared prefix
competition_dir = "/kaggle/input/quora-insincere-questions-classification"
sample_submission_filepath = f"{competition_dir}/sample_submission.csv"
embeddings_zippath = f"{competition_dir}/embeddings.zip"
train_csv_path = f"{competition_dir}/train.csv"
test_csv_path = f"{competition_dir}/test.csv"

In [None]:
# Read the competition train and test CSVs into DataFrames
train_df, test_df = map(pd.read_csv, (train_csv_path, test_csv_path))

We have four different types of embeddings.

* GoogleNews-vectors-negative300 - https://code.google.com/archive/p/word2vec/
* glove.840B.300d - https://nlp.stanford.edu/projects/glove/
* paragram_300_sl999 - https://cogcomp.org/page/resource_view/106
* wiki-news-300d-1M - https://fasttext.cc/docs/en/english-vectors.html

A very good explanation of the different types of embeddings is given in this kernel. Please refer to it for more details.

# Glove Embeddings:

In this section, let us use the GloVe embeddings to build a bidirectional LSTM model.

In [None]:
# unzip file.zip -d destination_folder
!unzip /kaggle/input/quora-insincere-questions-classification/embeddings.zip -d /kaggle/working/embeddings

In [None]:
# List the working directory to check what is there after the unzip step
!ls /kaggle/working/

In [None]:
# embeddings_list_available = [c.strip() for c in """glove.840B.300d/glove.840B.300d.txt  
# GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin  
# wiki-news-300d-1M/wiki-news-300d-1M.vec  
# paragram_300_sl999/README.txt  
# paragram_300_sl999/paragram_300_sl999.txt """.split('\n')]

In [None]:
# Paths of the extracted embedding files, one entry per file.
# The surrounding whitespace is kept as-is; it is stripped where the list is consumed.
embeddings_unzip_path = [
    "  /kaggle/working/embeddings/glove.840B.300d/glove.840B.300d.txt  ",
    "  /kaggle/working/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin  ",
    "  /kaggle/working/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec  ",
    "  /kaggle/working/embeddings/paragram_300_sl999/README.txt  ",
    "  /kaggle/working/embeddings/paragram_300_sl999/paragram_300_sl999.txt ",
]

In [None]:
# Drop the leading/trailing whitespace from every path entry
embeddings_list_available = list(map(str.strip, embeddings_unzip_path))

In [None]:
# Display the cleaned list of embedding file paths
embeddings_list_available

In [None]:
# Number the embedding files (0, 1, 2, ...) so one can be picked by index, and print the menu
embeddings_available_dict = dict(enumerate(embeddings_list_available))
for idx, c in embeddings_available_dict.items():
    print(f"Embedings: {idx}. {c}")

In [None]:
# Display the index -> embedding file path mapping
embeddings_available_dict

In [None]:
# Select the embedding file to load: index 0 is the first entry of the path list
# (the glove.840B.300d text file).
EMBEDDING_FILE = embeddings_available_dict[0]
print(EMBEDDING_FILE)

In [None]:
%%time
def get_coefs(word, *arr):
    """Pair a word with its embedding vector.

    Args:
        word: the token (first field of an embedding-file line).
        *arr: the remaining fields, each convertible to a float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        holding the values of ``arr`` in order.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

# Build the word -> vector lookup from the embedding text file.
# Each line is split on single spaces: the first field is the word, the rest are the vector values.
# The file is opened in a `with` block so the handle is closed as soon as parsing finishes,
# and the encoding is stated explicitly rather than relying on the platform default.
with open(EMBEDDING_FILE, encoding="utf-8") as embedding_file:
    embeddings_index = dict(get_coefs(*line.split(" ")) for line in embedding_file)

In [None]:
# Show how many word vectors were loaded. Displaying the dict itself would render
# every word and its full vector into the cell output.
len(embeddings_index)

In [None]:
# Stack every word vector into one 2-D array (one row per word).
# np.stack needs a real sequence; a dict view must be materialised with list() first.
all_embs = np.stack(list(embeddings_index.values()))
# Global mean / std over all embedding values, used later to initialise rows for unknown words
emb_mean, emb_std = all_embs.mean(), all_embs.std()
# Vector dimensionality = number of columns
embed_size = all_embs.shape[1]

In [None]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
## Carve a 10% validation split out of the training data (seeded so the split is repeatable).
## NOTE: `train_df` is rebound here, so re-running this cell splits the already-reduced frame again.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=2018,
)

In [None]:
# Display the training portion produced by the split
train_df

In [None]:
## Hyper-parameters shared by the tokenizer, the padding step and the embedding layer
embed_size = 300      # dimensionality of each word vector
max_features = 50000  # vocabulary cap (number of rows in the embedding matrix)
maxlen = 100          # number of tokens kept per question

## Raw question strings, with missing entries replaced by the "_na_" placeholder
train_X, val_X, test_X = (
    frame["question_text"].fillna("_na_").values
    for frame in (train_df, val_df, test_df)
)

## Fit the vocabulary on the training questions only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

## Convert each split to integer sequences, then pad/truncate them to `maxlen`
train_X, val_X, test_X = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (train_X, val_X, test_X)
)

## Labels for the two splits that have them
train_y, val_y = train_df['target'].values, val_df['target'].values

In [None]:
# First training question, by POSITION. After the shuffled split, train_df keeps the
# original row labels, so a label lookup (`[0]`) can raise KeyError or return a different
# row; `.iloc[0]` is the row that lines up with `train_X[0]`.
train_df["question_text"].iloc[0]

In [None]:
# First row of the padded training matrix (integer token ids, length `maxlen`)
train_X[0]

In [None]:
# Word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index

In [None]:
# Number of rows in the embedding matrix. Tokenizer indices start at 1 (0 is the padding id),
# so a vocabulary of N words needs N + 1 rows; without the +1 the highest-indexed word would
# fall outside the matrix whenever the vocabulary is smaller than `max_features`.
nb_words = min(max_features, len(word_index) + 1)
# Start every row from the same distribution as the pre-trained vectors, so words without
# a pre-trained vector still get a plausible random embedding.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

In [None]:
# Mean of all pre-trained embedding values (centre of the random initialisation)
emb_mean

In [None]:
# Display the randomly initialised embedding matrix
embedding_matrix

In [None]:
# Overwrite the random rows with the pre-trained vector wherever one exists.
# Words whose index falls beyond the vocabulary cap are ignored.
for token, row in word_index.items():
    if row < max_features:
        pretrained = embeddings_index.get(token)
        if pretrained is not None:
            embedding_matrix[row] = pretrained
        

In [None]:
# Bidirectional LSTM classifier on top of the pre-trained embedding matrix.
inp = Input(shape=(maxlen,))
# input_dim is taken from the matrix itself so the layer always matches the weights it is given
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
# Collapse the per-token outputs to one vector per question (max over the sequence axis)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
# Single sigmoid unit: probability that the question is insincere (target == 1)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be wrapped in print()
model.summary()

In [None]:
# Train for two epochs; the held-out split is evaluated at the end of each epoch
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

In [None]:
# Predicted probabilities for the validation split
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
# Report the validation F1 for every cut-off from 0.10 to 0.50 in steps of 0.01
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_pred_labels = (pred_glove_val_y > thresh).astype(int)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, val_pred_labels)))

In [None]:
# Shape of the padded test matrix (rows, maxlen)
test_X.shape

In [None]:
# Shape of the raw test frame; its row count should equal that of test_X
test_df.shape

In [None]:
# Load the sample submission file to use as the template for the output
df_sub = pd.read_csv(sample_submission_filepath)

In [None]:
# Predicted probabilities for the test questions
pred_glove_test_y = model.predict([test_X], batch_size=1024, verbose=1)

In [None]:
# Binarise the test probabilities and store them as the submission's prediction column.
# NOTE(review): the cut-off is fixed at 0.5 although the validation sweep explores 0.1-0.5;
#   confirm this is the threshold that is actually wanted.
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv - TODO confirm.
df_sub['prediction'] = (pred_glove_test_y>0.5).astype('int')

In [None]:
# Write the submission file to the current directory, without the DataFrame index column
df_sub.to_csv("submission.csv", index=False)