In [None]:
import zipfile
import pandas as pd
import numpy as np
import operator 
import re
import gc
from keras import layers
import os
import keras
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Model
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Embedding, Bidirectional, concatenate, Input, Dropout
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense,Flatten,Conv2D,Conv1D,GlobalMaxPooling1D,Concatenate, TimeDistributed
from keras.optimizers import Adam
from keras import optimizers, callbacks 
from sklearn.metrics import log_loss,f1_score

# Print the full path of every file found under /kaggle, so the input
# locations used by the cells below can be checked by eye.
kaggle_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle')
    for name in names
)
for path in kaggle_paths:
    print(path)


In [None]:
# Load the training set; later cells read its 'question_text' and 'target' columns.
train_df = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/train.csv")


In [None]:
train_df.head()

In [None]:
# Only the GloVe vectors are loaded further down, so extract just that archive
# member instead of unpacking every embedding set in the zip. The member name
# matches the path the loading cell expects after extraction into ".".
# Skipping the extraction when the file already exists keeps the cell cheap to
# re-run.
GLOVE_MEMBER = "glove.840B.300d/glove.840B.300d.txt"
if not os.path.exists(GLOVE_MEMBER):
    with zipfile.ZipFile("../input/quora-insincere-questions-classification/embeddings.zip", "r") as z:
        z.extract(GLOVE_MEMBER, ".")


Function for extracting embeddings files

In [None]:
def load_embed(file, encoding='latin'):
    """Load a text embedding file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to be ``word v1 v2 ... vn`` separated by
    single spaces. Lines are split on the literal space character only, so
    other whitespace inside a token is preserved.

    Parameters
    ----------
    file : str
        Path of the embedding text file.
    encoding : str, default 'latin'
        Text encoding used to decode the file. The default keeps the previous
        behaviour; pass ``'utf-8'`` to decode non-ASCII tokens faithfully.

    Returns
    -------
    dict
        Maps each word to a 1-D ``float32`` NumPy array. If a word appears on
        several lines, the last one wins.
    """
    embeddings_index = {}
    # The context manager guarantees the handle is closed even if a line fails
    # to parse (the previous version never closed the file).
    with open(file, encoding=encoding) as handle:
        for line in handle:
            if not line.strip():
                continue  # ignore blank lines instead of creating a '\n' entry
            word, *coefs = line.rstrip("\n").split(" ")
            embeddings_index[word] = np.asarray(coefs, dtype='float32')
    return embeddings_index

In [None]:
# Path of the GloVe text file produced by the extraction cell above.
glove = './glove.840B.300d/glove.840B.300d.txt'
# Word -> float32 vector lookup used by the coverage checks and the embedding matrix.
embed_glove = load_embed(glove)

**Building the vocabulary of our dataset**

In [None]:
def build_vocab(texts):
    """Count how often each whitespace-separated token occurs in ``texts``.

    ``texts`` is a pandas Series of strings; every entry is split with
    ``str.split()``. The returned dict maps token -> occurrence count, with
    keys in order of first appearance.
    """
    token_lists = texts.apply(lambda question: question.split()).values
    counts = {}
    for tokens in token_lists:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts


**Checking the coverage of embeddings to our vocabulary**

In [None]:
def check_coverage(vocab, embeddings_index):
    """Report how much of ``vocab`` is covered by ``embeddings_index``.

    Prints two ratios: the share of distinct words that have an embedding, and
    the share of all word occurrences that have one. Both are reported as 0
    when ``vocab`` is empty.

    Parameters
    ----------
    vocab : dict
        Maps word -> occurrence count (as produced by ``build_vocab``).
    embeddings_index : mapping
        Container supporting ``word in embeddings_index``.

    Returns
    -------
    list of (word, count)
        The words without an embedding, most frequent first.
    """
    unknown_words = {}
    nb_known_types = 0
    nb_known_words = 0
    nb_unknown_words = 0
    for word, count in vocab.items():
        # Explicit membership test instead of a bare ``except:``, which would
        # also have swallowed unrelated errors such as KeyboardInterrupt.
        if word in embeddings_index:
            nb_known_types += 1
            nb_known_words += count
        else:
            unknown_words[word] = count
            nb_unknown_words += count

    nb_total_words = nb_known_words + nb_unknown_words
    vocab_ratio = nb_known_types / len(vocab) if vocab else 0.0
    text_ratio = nb_known_words / nb_total_words if nb_total_words else 0.0
    print('Found embeddings for {:.3%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.3%} of all text'.format(text_ratio))

    # Stable ascending sort followed by a reversal: equal counts come out in
    # reverse insertion order, exactly as before.
    return sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]


1 - Checking the coverage without any modification to the data

In [None]:
# Token counts for the question text as it currently stands in train_df.
vocabulary = build_vocab(train_df['question_text'])
# Prints the coverage ratios; `uncovered` holds (word, count) pairs without a
# GloVe vector, most frequent first.
uncovered = check_coverage(vocabulary,embed_glove)

In [None]:
uncovered[:10]

1-1 Words with punctuation

In [None]:
# Single-character punctuation that preprocessing1 isolates with spaces.
# The straight apostrophe is deliberately NOT in this list: splitting it here
# turns "don't" into "don ' t", so the contraction lookup in preprocessing2
# (which matches whole space-separated tokens against contraction_map) could
# never fire. Remaining apostrophes are still spaced out later by
# preprocessing3, whose `punct` string contains "'".
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';',
          '&', '/', '[', ']', '>', '<', '%', '=', '#', '+',
          '\\',  '§', '″', '′','¿','═']


def preprocessing1(text):
    """Lower-case ``text`` and surround every character in ``puncts`` with spaces.

    Every entry of ``puncts`` is a single character and no replacement
    introduces another entry, so one ``str.translate`` pass gives the same
    result as replacing the characters one after another.

    Returns the transformed string; an empty string stays empty.
    """
    spaced = str.maketrans({mark: f' {mark} ' for mark in puncts})
    return text.lower().translate(spaced)
    

In [None]:
train_df.question_text = train_df.question_text.apply(preprocessing1)

In [None]:
vocabulary = build_vocab(train_df['question_text'])
uncovered = check_coverage(vocabulary,embed_glove)

In [None]:
uncovered[:10]

1-2 Contraction words

In [None]:
contraction_map = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
                    "could've": "could have", "couldn't": "could not", "didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will",
                   "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                   "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", 
                    "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am",
                   "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", 
                   "ma'am": "madam", "mayn't": "may not", "might've": "might have",
                   "mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                   "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                    "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not",
                   "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                   "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
                   "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've":
                   "should have", "shouldn't": "should not", "shouldn't've": "should not have", 
                   "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would",                        
                   "that'd've": "that would have", "that's": "that is", "there'd": "there would", 
                    "there'd've": "there would have", "there's": "there is", "here's": "here is",
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have",
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not",                       
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                    "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
                   "will've": "will have", "won't": "will not", "won't've": "will not have",
                   "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have",
                   "y'all're": "you all are","y'all've": "you all have","you'd": "you would",
                   "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                   "you're": "you are", "you've": "you have" }


In [None]:
def preprocessing2(text, mapping):
    """Normalise apostrophe look-alikes, then expand tokens found in ``mapping``.

    The characters ’ ‘ ´ ` are first turned into a straight apostrophe. The
    text is then split on single spaces, and every token that is a key of
    ``mapping`` is replaced by its value. Tokens are re-joined with single
    spaces, so empty tokens from repeated spaces are preserved.
    """
    apostrophes = str.maketrans({"’": "'", "‘": "'", "´": "'", "`": "'"})
    normalised = text.translate(apostrophes)
    return ' '.join(mapping.get(token, token) for token in normalised.split(" "))

In [None]:
train_df.question_text = train_df.question_text.apply(lambda x: preprocessing2(x, contraction_map))

In [None]:
vocabulary = build_vocab(train_df['question_text'])
uncovered = check_coverage(vocabulary,embed_glove)

In [None]:
uncovered[0:20]

1-3 Special characters & spacing around punctuation

In [None]:
# Character-level substitutions applied by preprocessing3 before punctuation
# is spaced out. (A duplicated '“' entry was removed; a repeated key in a dict
# literal only keeps its last value, so the mapping itself is unchanged.)
special_map = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x",
               "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"',
               "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a',
               '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi','\u200b': ' ', '…': ' ... ', '\ufeff': '',
               'करना': '', 'है': '' }
# Characters that preprocessing3 surrounds with spaces. The backslash before
# '×' is now written as an explicit '\\': the previous '\×' was an invalid
# escape sequence (a SyntaxWarning on recent Python versions) that evaluated to
# the same two characters, so the string value is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

In [None]:
def preprocessing3(text, punct, mapping):
    """Apply the ``mapping`` substitutions, then pad each ``punct`` character.

    Substitutions run one after another in ``mapping`` order. Afterwards each
    character of ``punct`` is replaced by itself surrounded by spaces, also one
    after another, so a character listed twice in ``punct`` is padded twice.
    """
    cleaned = text
    for source, target in mapping.items():
        cleaned = cleaned.replace(source, target)
    for symbol in punct:
        cleaned = cleaned.replace(symbol, ' ' + symbol + ' ')
    return cleaned

In [None]:
train_df.question_text = train_df.question_text.apply(lambda x: preprocessing3(x, punct, special_map))

In [None]:
vocabulary = build_vocab(train_df['question_text'])
uncovered = check_coverage(vocabulary,embed_glove)

In [None]:
uncovered[:100]

1-4 Misspelled words

In [None]:
mispell_dict = {'pubg':'video game','fortnite':'video game','redmi':'phone mark','brexit':'britain exit',
                'cryptocurrencies':'crypto currencies','pokémon':'video game','laravel':'framework',
                'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 
                'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
                'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do',
                'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 
                'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 
                'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating',
                'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', 
                '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 
                'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
                'demonitization': 'demonetization', 'demonetisation': 'demonetization'}


In [None]:
def preprocessing4(x, dic):
    """Replace every occurrence of each key of ``dic`` in ``x`` by its value.

    Replacement is by substring (not whole word) and sequential in ``dic``
    order, so a later key can match text produced by an earlier replacement.
    """
    for wrong, right in dic.items():
        x = x.replace(wrong, right)
    return x


In [None]:
train_df.question_text = train_df.question_text.apply(lambda x: preprocessing4(x, mispell_dict))

In [None]:
vocabulary = build_vocab(train_df['question_text'])
uncovered = check_coverage(vocabulary,embed_glove)

In [None]:
# Passed to Tokenizer(num_words=...) inside make_data.
len_voc = 95000
# Passed to pad_sequences(maxlen=...) for both the train and test sequences.
max_len = 60

2 Tokenizing + padding

In [None]:
def make_data(X):
    """Fit a Tokenizer on ``X`` and return padded integer sequences.

    Reads the notebook-level ``len_voc`` (tokenizer ``num_words``) and
    ``max_len`` (padding length).

    Returns a tuple ``(padded_sequences, word_index, tokenizer)``.
    """
    tokenizer = Tokenizer(num_words=len_voc)
    tokenizer.fit_on_texts(X)
    padded = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=max_len)
    return padded, tokenizer.word_index, tokenizer

In [None]:
X, word_index, t = make_data(train_df['question_text'])
Y = train_df['target'].values
# Number of embedding rows actually needed. Keras Tokenizer indices start at 1
# and texts_to_sequences only emits indices below num_words, so the model needs
# max_index + 1 rows: the cap itself for a large vocabulary, or
# len(word_index) + 1 when fewer distinct words exist. The previous
# `len(word_index)` overwrote the cap with the full vocabulary size (an
# oversized embedding matrix) and was one row short for a small vocabulary.
len_voc = min(len_voc, len(word_index) + 1)
# Free the raw frame; only X / Y are used from here on.
del train_df
len_voc

In [None]:
len_voc

In [None]:
# Hold out 10% of the rows; the fit cell uses (X_test, y_test) as validation
# data and the threshold search below is run on the same split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 420, test_size = 0.1)

4 Embeddings

In [None]:
def make_embed_matrix(embeddings_index, word_index, len_voc):
    """Build the ``(len_voc, embed_size)`` matrix used to initialise the Embedding layer.

    Every row is first drawn from a normal distribution with the mean and
    standard deviation of all pretrained coefficients. Rows whose word has a
    pretrained vector are then overwritten with that vector.

    Parameters
    ----------
    embeddings_index : dict
        Maps word -> 1-D vector; all vectors must have the same length.
    word_index : dict
        Maps word -> integer row index.
    len_voc : int
        Number of rows; entries of ``word_index`` with an index >= ``len_voc``
        are ignored.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len_voc, embed_size)``.
    """
    # np.stack needs a real sequence; a dict_values view is rejected by
    # current NumPy releases, hence the explicit list().
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
    del all_embs  # only the summary statistics are needed from here on

    embedding_matrix = np.random.normal(emb_mean, emb_std, (len_voc, embed_size))

    for word, i in word_index.items():
        if i >= len_voc:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [None]:
# Pretrained-vector matrix with len_voc rows, later passed to make_model.
embedding = make_embed_matrix(embed_glove, word_index,len_voc)
# word_index is not referenced again below; drop it and trigger a collection
# to release memory.
del word_index
gc.collect()

In [None]:
embedding[0].shape

In [None]:
embedding.shape

In [None]:
from keras import backend as K


In [None]:
def f1(y_true, y_pred):
    """F1 metric built from Keras backend ops.

    True positives are counted as the rounded, [0, 1]-clipped product
    ``y_true * y_pred``; actual and predicted positives are the rounded,
    clipped ``y_true`` and ``y_pred`` sums. ``K.epsilon()`` is added to each
    denominator to avoid division by zero.
    """
    eps = K.epsilon()
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    precision_value = hits / (predicted_positives + eps)
    recall_value = hits / (actual_positives + eps)
    return 2*((precision_value*recall_value)/(precision_value+recall_value+eps))


In [None]:
from tensorflow.compat.v1.keras.layers import CuDNNGRU
from keras.layers import GlobalAveragePooling1D, GlobalMaxPool1D
from keras.layers import Embedding,Bidirectional,LSTM,Dropout,Conv1D,MaxPooling1D,Dense



In [None]:
def make_model(embedding_matrix, embed_size=300,loss='binary_crossentropy') :
    """Build and compile the LSTM classifier.

    Architecture: frozen Embedding initialised from ``embedding_matrix`` ->
    LSTM(128) -> Dense(128, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Parameters
    ----------
    embedding_matrix : array-like of shape (vocab_size, embed_size)
        Initial (non-trainable) embedding weights. The vocabulary size is
        taken from its first dimension rather than from a notebook global.
    embed_size : int, default 300
        Embedding dimension; must match ``embedding_matrix``'s second dimension.
    loss : str or callable, default 'binary_crossentropy'
        Loss passed to ``model.compile`` (previously accepted but ignored).

    Returns
    -------
    The compiled model, tracking accuracy and the custom ``f1`` metric.
    """
    vocab_size = np.shape(embedding_matrix)[0]
    model = Sequential()
    model.add(Embedding(vocab_size, embed_size, weights=[embedding_matrix], trainable=False))
    model.add(LSTM(128))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy', f1])
    return model


In [None]:
modelTest= make_model(embedding)

In [None]:
modelTest.summary()

Model creation

In [None]:
# File the checkpoint callback writes to.
WEIGHTS_PATH = './w0.h5'
# Saves to WEIGHTS_PATH only when val_loss reaches a new minimum. Note that
# save_weights_only is not set here, despite the name WEIGHTS_PATH.
mc = callbacks.ModelCheckpoint( filepath=WEIGHTS_PATH, monitor='val_loss', mode='min', save_best_only=True )
# Stops training after 5 epochs without improvement. No `monitor` is given, so
# the Keras default applies (presumably val_loss - confirm for the installed version).
es = callbacks.EarlyStopping( patience=5 )


In [None]:
history = modelTest.fit(X_train, y_train, epochs = 10, validation_data = (X_test, y_test) , callbacks=[es , mc] , batch_size=2048 )
# The checkpoint callback stored the parameters of the epoch with the lowest
# val_loss, but the in-memory model holds the last epoch's. Reload the best
# ones so the predictions below use the checkpoint that was selected.
modelTest.load_weights(WEIGHTS_PATH)
plt.plot(history.history['val_loss'])
plt.plot(history.history['loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend( ['test', 'train'] );

In [None]:
test_pred = modelTest.predict(X_test, batch_size=512, verbose=1)

Testing our model before submitting

In [None]:
from sklearn.metrics import f1_score


In [None]:
def tweak_threshold(pred, truth):
    """Search for the decision threshold that maximises F1.

    Candidate thresholds are ``np.arange(0.1, 0.501, 0.01)`` rounded to two
    decimals; a prediction counts as positive when ``pred > threshold``.

    Returns ``(best_f1, best_threshold)``; on ties the lowest threshold wins.
    """
    thresholds = [np.round(raw, 2) for raw in np.arange(0.1, 0.501, 0.01)]
    scores = [f1_score(truth, (pred>cut).astype(int)) for cut in thresholds]
    best_position = np.argmax(scores)
    return np.max(scores), thresholds[best_position]

In [None]:
# Best F1 and matching threshold on the held-out split.
score_val, threshold_val = tweak_threshold(test_pred, y_test)

# The texts went through preprocessing1-4 before tokenisation, so the message
# now says "preprocessed" instead of the misleading "untreated".
print(f"Scored {round(score_val, 4)} for threshold {threshold_val} with preprocessed texts on validation data")


Submission

In [None]:
test_df  = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/test.csv")

In [None]:
test_df

In [None]:
vocabulary = build_vocab(test_df['question_text'])
uncovered = check_coverage(vocabulary,embed_glove)    

In [None]:
# Apply the same four cleaning steps, in the same order, that were applied to
# train_df above, so the fitted tokenizer sees text in the same form.
# Each line overwrites the column in place, so re-running this cell applies the
# transformations a second time.
test_df.question_text = test_df.question_text.apply(preprocessing1)
test_df.question_text = test_df.question_text.apply(lambda x: preprocessing2(x, contraction_map))
test_df.question_text = test_df.question_text.apply(lambda x: preprocessing3(x, punct, special_map))
test_df.question_text = test_df.question_text.apply(lambda x: preprocessing4(x, mispell_dict))

In [None]:
vocabulary = build_vocab(test_df['question_text'])
uncovered = check_coverage(vocabulary,embed_glove)    

In [None]:
# NOTE(review): the preprocessing cell above already calls string methods on
# every row, so a missing value would have failed there; this fillna looks like
# it belongs before that step - confirm.
test_X = test_df["question_text"].fillna("_na_").values
# Reuse the tokenizer fitted on the training text (`t` from make_data) and pad
# to the same max_len as the training sequences.
test_X = t.texts_to_sequences(test_X)
test_X = pad_sequences(test_X, maxlen=max_len)


In [None]:
test_X

In [None]:
test_X.shape

In [None]:
pred_val = modelTest.predict(test_X, batch_size=512, verbose=1)

In [None]:
# Binarise with the threshold selected by tweak_threshold on the held-out split
# instead of a hard-coded constant, so the cut-off follows the trained model.
test_y = (pred_val>threshold_val).astype(int)

In [None]:
test_y.shape

In [None]:
test_y

In [None]:
# Two-column table: predicted label, number of test rows with that label.
unique, counts = np.unique(test_y, return_counts=True)
frequencies = np.column_stack((unique, counts))
frequencies

In [None]:
submission = pd.DataFrame({"qid":test_df["qid"].values})
submission['prediction'] = test_y


In [None]:
submission

In [None]:
submission.to_csv("submission.csv", index=False)