In [None]:
# !pip install tensorflow -q --upgrade
# !pip install numpy -q --upgrade
# !pip install pandas -q --upgrade
# !pip install scikit-learn -q --upgrade

In [None]:
import tensorflow as tf
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.manifold import TSNE
import re

In [None]:
# Competition inputs: labelled training questions, unlabelled test questions,
# and the sample file showing the expected submission layout.
train = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/train.csv")
test = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/test.csv")
sample_submission = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/sample_submission.csv")

**Preprocessing**

In [None]:
# Ordered regex rewrites used by clean_text. They are compiled once here rather
# than on every call; the order matters (e.g. "what's" must be expanded before
# the generic "'s" rule strips the suffix).
_CLEAN_TEXT_SUBSTITUTIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Blank out every character outside the allowed set.
        # NOTE(review): the trailing "+-=" is parsed as a range ('+' through
        # '='), so ':', ';' and '<' also survive this step -- confirm intent.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        # Expand common English contractions.
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # Drop or isolate the remaining punctuation.
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        # "<digits>k" -> "<digits>000".
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        # Re-join abbreviations that the punctuation rules above split apart.
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): in `re` syntax "\0" is the NUL character, and NULs were
        # already blanked by the first rule, so this rule can never fire. It was
        # presumably meant to be r"0s" -- confirm before changing.
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        # Collapse runs of whitespace left behind by the rules above.
        (r"\s{2,}", " "),
    )
)

# Lazily-filled cache for the NLTK resources clean_text uses by default.
_CLEAN_TEXT_DEFAULTS = {}


def _default_stopwords():
    """Return the NLTK English stopword set, loading it on first use only."""
    if "stops" not in _CLEAN_TEXT_DEFAULTS:
        _CLEAN_TEXT_DEFAULTS["stops"] = frozenset(stopwords.words("english"))
    return _CLEAN_TEXT_DEFAULTS["stops"]


def _default_stemmer():
    """Return a shared English Snowball stemmer, creating it on first use only."""
    if "stemmer" not in _CLEAN_TEXT_DEFAULTS:
        _CLEAN_TEXT_DEFAULTS["stemmer"] = SnowballStemmer('english')
    return _CLEAN_TEXT_DEFAULTS["stemmer"]


def clean_text(text, stops=None, stemmer=None):
    """Normalise one question string for tokenisation.

    Steps, in order: lowercase and split on whitespace; drop stopwords and
    tokens shorter than three characters; apply the regex rewrites in
    ``_CLEAN_TEXT_SUBSTITUTIONS``; stem every remaining token.

    Args:
        text: The raw question. ``None`` and float NaN are treated as missing.
        stops: Optional collection of lowercase words to drop. Defaults to the
            NLTK English stopword list, which is loaded once and reused.
        stemmer: Optional object exposing ``stem(word) -> str``. Defaults to a
            shared ``SnowballStemmer('english')``.

    Returns:
        The cleaned, space-joined, stemmed text; ``""`` for missing input.
    """
    # Missing values clean to an empty string instead of raising AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stops is None:
        stops = _default_stopwords()
    if stemmer is None:
        stemmer = _default_stemmer()

    # Punctuation is deliberately left in place at this point: the contraction
    # rules need the apostrophes. str.translate(string.punctuation) is not a
    # punctuation stripper in Python 3 -- it indexes that string by code point,
    # which leaves printable text alone but rewrites control characters
    # (e.g. '\n' becomes '+'), so it is not used here.
    kept = [w for w in text.lower().split() if w not in stops and len(w) >= 3]
    text = " ".join(kept)

    for pattern, replacement in _CLEAN_TEXT_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)

    return " ".join(stemmer.stem(word) for word in text.split())

# Replace each raw training question with its cleaned form (overwrites the column in place).
train['question_text'] = train['question_text'].map(clean_text)

In [None]:
# num_words caps the vocabulary the tokenizer uses when encoding text.
vocabulary_size = 20000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocabulary_size)

# Fit the vocabulary on the cleaned training questions, then encode them.
question_texts = train['question_text']
tokenizer.fit_on_texts(question_texts)
sequences = tokenizer.texts_to_sequences(question_texts)

# Pad/truncate every encoded question to 50 token ids.
data = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=50)

**Embeddings**

In [None]:
import zipfile

# Only the GloVe file is read further down, so unpack just that folder instead
# of everything else the archive contains. Files land in the current working
# directory; the loader below expects that to be /kaggle/working.
with zipfile.ZipFile("/kaggle/input/quora-insincere-questions-classification/embeddings.zip", 'r') as zip_ref:
    glove_members = [name for name in zip_ref.namelist() if name.startswith("glove.840B.300d/")]
    # If the archive layout is not what we expect, fall back to extracting
    # everything (members=None is extractall's "all members" default).
    zip_ref.extractall(".", members=glove_members or None)

In [None]:
# Map each GloVe token to its 300-dimensional float32 vector.
embeddings_index = {}
# The context manager closes the file even if a malformed line makes the
# float conversion raise part-way through.
with open('/kaggle/working/glove.840B.300d/glove.840B.300d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        # The last 300 fields are the vector; whatever precedes them is the
        # token (joined back together, presumably because some tokens contain
        # whitespace -- TODO confirm against the file).
        word = ''.join(values[:-300])
        coefs = np.asarray(values[-300:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Row i holds the GloVe vector of the token whose tokenizer id is i; tokens
# without a GloVe entry (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((vocabulary_size, 300))
for word, index in tokenizer.word_index.items():
    # Stop at the first id that falls outside the matrix.
    if index >= vocabulary_size:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

In [None]:
# Frozen GloVe embedding -> dropout -> 1D convolution -> max-pooling -> LSTM
# -> single sigmoid unit.
model_glove = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(
        vocabulary_size,
        300,
        input_length=50,
        weights=[embedding_matrix],
        trainable=False,
    ),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model_glove.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train for one epoch with validation_split=0.4 (40% of rows used for validation).
model_glove.fit(
    data,
    np.array(train["target"]),
    validation_split=0.4,
    epochs=1,
)

**Predicting**

In [None]:
# Put the test questions through the same cleaning, encoding and padding as
# the training questions, reusing the tokenizer fitted on the training set.
test['question_text'] = test['question_text'].map(clean_text)
sequences = tokenizer.texts_to_sequences(test['question_text'])
test_data = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=50,
)

In [None]:
# Label a question 1 when the model's output exceeds 0.35, otherwise 0.
predicted_scores = model_glove.predict(test_data)
preds = (predicted_scores > 0.35).astype("int32")

In [None]:
# One row per test question: its qid and the 0/1 prediction, written without
# the DataFrame index.
preds_df = pd.DataFrame(
    {
        "qid": test["qid"],
        "prediction": preds.flatten(),
    }
)
preds_df.to_csv("submission.csv", index=False)