# **Problem**

Each Quora question is either sincere (appropriate content) or insincere. Predictions must be the integer 0 for a sincere question and the integer 1 for an insincere one.

# **Import Libraries**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # librarie qui affiche les graphes
import seaborn as sns # Python data visualization library based on matplotlib
import re
import nltk 
from nltk.tokenize import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
import string
from string import punctuation
from sklearn.model_selection import train_test_split

# Load Train and Test Data-set

In [None]:
# Read the competition's training set and test set from the Kaggle input directory.
df_train = pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
df_test = pd.read_csv('../input/quora-insincere-questions-classification/test.csv') 

# File descriptions
* train.csv — the training set
* test.csv — the test set
* embeddings: glove

# Data Fields
* qid — unique question identifier
* question_text — Quora question text
* target — a question labeled “insincere” has a value of 1, otherwise 0

In [None]:
# Preview the first rows of the training set.
df_train.head()

In [None]:
# Preview the first rows of the test set.
df_test.head()

In [None]:
# Number of missing values in each column of the training set.
print(df_train.isnull().sum())

In [None]:
# Number of missing values in each column of the test set.
print(df_test.isnull().sum())

In [None]:
# Class balance: share of sincere (target == 0) and insincere (target == 1) questions.
total_questions = len(df_train['question_text'])
sincere_count = len(df_train.question_text[df_train['target'] == 0])
insincere_count = len(df_train.question_text[df_train['target'] == 1])
print(sincere_count / total_questions * 100,'percent of sincere')
print(insincere_count / total_questions * 100,'percent of insincere')

* 93.81298224821265 percent of sincere

* 6.187017751787352 percent of insincere

In [None]:
# Bar chart of the class distribution (0 = sincere, 1 = insincere).
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))
# The column is passed through the `x` keyword: seaborn deprecated positional
# data vectors and later releases read the first positional argument as `data`.
sns.countplot(x=df_train['target'], palette=['red', 'blue'], ax=ax)
# Label the figure so that it can be read on its own.
ax.set(title='Class distribution', xlabel='target', ylabel='number of questions')
plt.show()

# 1. Data Preprocessing (Cleaning)

The text data is not entirely clean, thus we need to apply some data preprocessing techniques.

# Convert questions to lower case

In [None]:
#Convert to lower case
def lower1(text):
    """Return ``text`` converted to lower case."""
    return text.lower()

# Lower-case the question text of both splits, then preview the result.
for frame in (df_train, df_test):
    frame["question_text"] = frame["question_text"].apply(lower1)

df_train.head()

# Removing Punctuation

In [None]:
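# Characters (and one multi-character sequence) that clean_punct below deletes from the questions.
# NOTE(review): the ASCII apostrophe is part of this list, so contractions lose their
# apostrophe ("won't" -> "wont") before the later contraction cell runs -- confirm this order is intended.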
puncts=[',', '.', '“', ':', ')', '(', '-', '!', '|', ';', '\'', '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', 
 '•', '~', '@', '£', '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 
 '█', '…', '“', '★', '”', '–', '●', '►', '−', '¢', '¬', '░', '¡', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', 
 ' — ', '‹', '─', '▒', '：', '⊕', '▼', '▪', '†', '■', '\'', '▀', '¨', '▄', '♫', '☆', '¯', '♦', '¤', '▲', '¸', '⋅', '\'', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '・', '╦', '╣', '╔', '╗', '▬', '❤', '≤', '‡', '√', '◄', '━', 
 '⇒', '▶', '≥', '╝', '♡', '◊', '。', '✈', '≡', '☺', '✔', '≈', '✓', '♣', '☎', '℃', '◦', '└', '‟', '～', '！', '○', 
 '◆', '№', '♠', '▌', '✿', '▸', '⁄', '□', '❖', '✦', '．', '÷', '｜', '┃', '／', '￥', '╠', '↩', '✭', '▐', '☼', '☻', '┐', 
 '├', '«', '∼', '┌', '℉', '☮', '฿', '≦', '♬', '✧', '〉', '－', '⌂', '✖', '･', '◕', '※', '‖', '◀', '‰', '\x97', '↺', 
 '∆', '┘', '┬', '╬', '،', '⌘', '⊂', '＞', '〈', '⎙', '☠', '⇐', '▫', '∗', '∈', '≠', '♀', '♔', '˚', '℗', '┗', '＊', 
 '┼', '❀', '＆', '∩', '♂', '‿', '∑', '‣', '➜', '┛', '⇓', '☯', '⊖', '☀', '┳', '；', '∇', '⇑', '✰', '◇', '♯', '☞', '´', 
 '↔', '┏', '｡', '◘', '∂', '✌', '♭', '┣', '┴', '┓', '✨', '\xa0', '˜', '❥', '┫', '℠', '✒', '［', '∫', '\x93', '≧', '］', 
 '\x94', '∀', '♛', '\x96', '∨', '◎', '↻', '⇩', '＜', '≫', '✩', '✪', '♕', '؟', '₤', '☛', '╮', '␊', '＋', '┈', '％', 
 '╋', '▽', '⇨', '┻', '⊗', '￡', '।', '▂', '✯', '▇', '＿', '➤', '✞', '＝', '▷', '△', '◙', '▅', '✝', '∧', '␉', '☭', 
 '┊', '╯', '☾', '➔', '∴', '\x92', '▃', '↳', '＾', '׳', '➢', '╭', '➡', '＠', '⊙', '☢', '˝', '∏', '„', '∥', '❝', '☐', 
 '▆', '╱', '⋙', '๏', '☁', '⇔', '▔', '\x91', '➚', '◡', '╰', '\x85', '♢', '˙', '۞', '✘', '✮', '☑', '⋆', 'ⓘ', '❒', 
 '☣', '✉', '⌊', '➠', '∣', '❑', '◢', 'ⓒ', '\x80', '〒', '∕', '▮', '⦿', '✫', '✚', '⋯', '♩', '☂', '❞', '‗', '܂', '☜', 
 '‾', '✜', '╲', '∘', '⟩', '＼', '⟨', '·', '✗', '♚', '∅', 'ⓔ', '◣', '͡', '‛', '❦', '◠', '✄', '❄', '∃', '␣', '≪', '｢', 
 '≅', '◯', '☽', '∎', '｣', '❧', '̅', 'ⓐ', '↘', '⚓', '▣', '˘', '∪', '⇢', '✍', '⊥', '＃', '⎯', '↠', '۩', '☰', '◥', 
 '⊆', '✽', '⚡', '↪', '❁', '☹', '◼', '☃', '◤', '❏', 'ⓢ', '⊱', '➝', '̣', '✡', '∠', '｀', '▴', '┤', '∝', '♏', 'ⓐ', 
 '✎', ';', '␤', '＇', '❣', '✂', '✤', 'ⓞ', '☪', '✴', '⌒', '˛', '♒', '＄', '✶', '▻', 'ⓔ', '◌', '◈', '❚', '❂', '￦', 
 '◉', '╜', '̃', '✱', '╖', '❉', 'ⓡ', '↗', 'ⓣ', '♻', '➽', '׀', '✲', '✬', '☉', '▉', '≒', '☥', '⌐', '♨', '✕', 'ⓝ', 
 '⊰', '❘', '＂', '⇧', '̵', '➪', '▁', '▏', '⊃', 'ⓛ', '‚', '♰', '́', '✏', '⏑', '̶', 'ⓢ', '⩾', '￠', '❍', '≃', '⋰', '♋', 
 '､', '̂', '❋', '✳', 'ⓤ', '╤', '▕', '⌣', '✸', '℮', '⁺', '▨', '╨', 'ⓥ', '♈', '❃', '☝', '✻', '⊇', '≻', '♘', '♞', 
 '◂', '✟', '⌠', '✠', '☚', '✥', '❊', 'ⓒ', '⌈', '❅', 'ⓡ', '♧', 'ⓞ', '▭', '❱', 'ⓣ', '∟', '☕', '♺', '∵', '⍝', 'ⓑ', 
 '✵', '✣', '٭', '♆', 'ⓘ', '∶', '⚜', '◞', '்', '✹', '➥', '↕', '̳', '∷', '✋', '➧', '∋', '̿', 'ͧ', '┅', '⥤', '⬆', '⋱', 
 '☄', '↖', '⋮', '۔', '♌', 'ⓛ', '╕', '♓', '❯', '♍', '▋', '✺', '⭐', '✾', '♊', '➣', '▿', 'ⓑ', '♉', '⏠', '◾', '▹', 
 '⩽', '↦', '╥', '⍵', '⌋', '։', '➨', '∮', '⇥', 'ⓗ', 'ⓓ', '⁻', '⎝', '⌥', '⌉', '◔', '◑', '✼', '♎', '♐', '╪', '⊚', 
 '☒', '⇤', 'ⓜ', '⎠', '◐', '⚠', '╞', '◗', '⎕', 'ⓨ', '☟', 'ⓟ', '♟', '❈', '↬', 'ⓓ', '◻', '♮', '❙', '♤', '∉', '؛', 
 '⁂', 'ⓝ', '־', '♑', '╫', '╓', '╳', '⬅', '☔', '☸', '┄', '╧', '׃', '⎢', '❆', '⋄', '⚫', '̏', '☏', '➞', '͂', '␙', 
 'ⓤ', '◟', '̊', '⚐', '✙', '↙', '̾', '℘', '✷', '⍺', '❌', '⊢', '▵', '✅', 'ⓖ', '☨', '▰', '╡', 'ⓜ', '☤', '∽', '╘', 
 '˹', '↨', '♙', '⬇', '♱', '⌡', '⠀', '╛', '❕', '┉', 'ⓟ', '̀', '♖', 'ⓚ', '┆', '⎜', '◜', '⚾', '⤴', '✇', '╟', '⎛', 
 '☩', '➲', '➟', 'ⓥ', 'ⓗ', '⏝', '◃', '╢', '↯', '✆', '˃', '⍴', '❇', '⚽', '╒', '̸', '♜', '☓', '➳', '⇄', '☬', '⚑', 
 '✐', '⌃', '◅', '▢', '❐', '∊', '☈', '॥', '⎮', '▩', 'ு', '⊹', '‵', '␔', '☊', '➸', '̌', '☿', '⇉', '⊳', '╙', 'ⓦ', 
 '⇣', '｛', '̄', '↝', '⎟', '▍', '❗', '״', '΄', '▞', '◁', '⛄', '⇝', '⎪', '♁', '⇠', '☇', '✊', 'ி', '｝', '⭕', '➘', 
 '⁀', '☙', '❛', '❓', '⟲', '⇀', '≲', 'ⓕ', '⎥', '\u06dd', 'ͤ', '₋', '̱', '̎', '♝', '≳', '▙', '➭', '܀', 'ⓖ', '⇛', '▊', 
 '⇗', '̷', '⇱', '℅', 'ⓧ', '⚛', '̐', '̕', '⇌', '␀', '≌', 'ⓦ', '⊤', '̓', '☦', 'ⓕ', '▜', '➙', 'ⓨ', '⌨', '◮', '☷', 
 '◍', 'ⓚ', '≔', '⏩', '⍳', '℞', '┋', '˻', '▚', '≺', 'ْ', '▟', '➻', '̪', '⏪', '̉', '⎞', '┇', '⍟', '⇪', '▎', '⇦', '␝', 
 '⤷', '≖', '⟶', '♗', '̴', '♄', 'ͨ', '̈', '❜', '̡', '▛', '✁', '➩', 'ா', '˂', '↥', '⏎', '⎷', '̲', '➖', '↲', '⩵', '̗', '❢', 
 '≎', '⚔', '⇇', '̑', '⊿', '̖', '☍', '➹', '⥊', '⁁', '✢']

def clean_punct(x):
    """Delete every entry of the module-level ``puncts`` list from ``x``.

    The entries are processed one after another in list order, each on the
    string left over by the previous removals.

    x -- the text to clean.
    Returns the text with all listed entries removed.
    """
    cleaned = x
    for mark in puncts:
        if mark not in cleaned:
            continue
        cleaned = cleaned.replace(mark, '')
    return cleaned
# Strip the listed symbols from both splits, then preview the result.
for frame in (df_train, df_test):
    frame["question_text"] = frame["question_text"].apply(clean_punct)

df_train.head()

In [None]:
# df_train.target

# Removing Numbers

In [None]:
#Cleaning numbers
def remove_numbers(x): 
    """Return ``str(x)`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', str(x))

# Drop digit runs from both splits, then preview the result.
for frame in (df_train, df_test):
    frame["question_text"] = frame["question_text"].apply(remove_numbers)

df_train.head()

# Expanding Contractions

In [None]:
# Ordered (pattern, replacement) rules. They are applied one after another on
# the running text, so the order below is significant.
_CONTRACTION_RULES = (
    (r"he's", "he is"),
    (r"there's", "there is"),
    (r"We're", "We are"),
    (r"That's", "That is"),
    (r"won't", "will not"),
    (r"they're", "they are"),
    (r"Can't", "Cannot"),
    (r"wasn't", "was not"),
    (r"don\x89Ûªt", "do not"),
    (r"aren't", "are not"),
    (r"isn't", "is not"),
    (r"What's", "What is"),
    (r"haven't", "have not"),
    (r"hasn't", "has not"),
    (r"There's", "There is"),
    (r"He's", "He is"),
    (r"It's", "It is"),
    (r"You're", "You are"),
    (r"I'M", "I am"),
    (r"shouldn't", "should not"),
    (r"wouldn't", "would not"),
    (r"i'm", "I am"),
    (r"I\x89Ûªm", "I am"),
    (r"I'm", "I am"),
    (r"Isn't", "is not"),
    (r"Here's", "Here is"),
    (r"you've", "you have"),
    (r"you\x89Ûªve", "you have"),
    (r"we're", "we are"),
    (r"what's", "what is"),
    (r"couldn't", "could not"),
    (r"we've", "we have"),
    (r"it\x89Ûªs", "it is"),
    (r"doesn\x89Ûªt", "does not"),
    (r"It\x89Ûªs", "It is"),
    (r"Here\x89Ûªs", "Here is"),
    (r"who's", "who is"),
    (r"I\x89Ûªve", "I have"),
    (r"y'all", "you all"),
    (r"can\x89Ûªt", "cannot"),
    (r"would've", "would have"),
    (r"it'll", "it will"),
    (r"we'll", "we will"),
    (r"wouldn\x89Ûªt", "would not"),
    (r"We've", "We have"),
    (r"he'll", "he will"),
    (r"Y'all", "You all"),
    (r"Weren't", "Were not"),
    (r"Didn't", "Did not"),
    (r"they'll", "they will"),
    (r"they'd", "they would"),
    (r"DON'T", "DO NOT"),
    (r"That\x89Ûªs", "That is"),
    (r"they've", "they have"),
    (r"i'd", "I would"),
    (r"should've", "should have"),
    (r"You\x89Ûªre", "You are"),
    (r"where's", "where is"),
    (r"Don\x89Ûªt", "Do not"),
    (r"we'd", "we would"),
    (r"i'll", "I will"),
    (r"weren't", "were not"),
    (r"They're", "They are"),
    (r"Can\x89Ûªt", "Cannot"),
    (r"you\x89Ûªll", "you will"),
    (r"I\x89Ûªd", "I would"),
    (r"let's", "let us"),
    (r"it's", "it is"),
    (r"can't", "cannot"),
    # Apostrophe-less spellings are anchored on word boundaries; unanchored,
    # "dont" also fired inside longer words ("orthodontist" -> "orthodo notist").
    (r"\bdont\b", "do not"),
    (r"don't", "do not"),
    (r"you're", "you are"),
    (r"i've", "I have"),
    (r"that's", "that is"),
    (r"doesn't", "does not"),
    (r"didn't", "did not"),
    (r"ain't", "am not"),
    (r"you'll", "you will"),
    (r"I've", "I have"),
    (r"Don't", "do not"),
    (r"I'll", "I will"),
    (r"I'd", "I would"),
    (r"Let's", "Let us"),
    (r"you'd", "You would"),
    (r"Ain't", "am not"),
    (r"Haven't", "Have not"),
    (r"Could've", "Could have"),
    (r"\byouve\b", "you have"),
    (r"donå«t", "do not"),
)


def remove_abbreviation(data):
    """Expand English contractions in ``data`` ("won't" -> "will not").

    Every rule of ``_CONTRACTION_RULES`` is applied in turn with ``re.sub``.
    Rules that contain an apostrophe are deliberately left unanchored, so
    "he's" still expands the tail of "she's". The two apostrophe-less rules
    ("dont", "youve") only match whole words.

    data -- the text to expand.
    Returns the expanded text.

    NOTE(review): in this notebook the text has already been lower-cased and
    stripped of apostrophes (the punctuation list contains "'") before this
    function is applied, so in practice only the apostrophe-less rules can
    match. Confirm whether this step should run before punctuation removal.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        data = re.sub(pattern, replacement, data)
    return data
    
# Expand contractions in both splits.
for frame in (df_train, df_test):
    frame["question_text"] = frame["question_text"].apply(remove_abbreviation)

# Save the intermediate frames as CSV files, then preview the training set.
for frame, csv_path in ((df_train, "df_train.csv"), (df_test, "df_test.csv")):
    frame.to_csv(csv_path, index = False)
df_train.head()

# remove stopwords
Stopwords are common English words that add little meaning to a sentence, such as "the", "he" and "have". They can usually be dropped without changing what the sentence means.

In [None]:
# remove stopwords function 
# Filled on first use and then reused. The function is applied to every
# question, so rebuilding the stop-word set and the tokenizer on each call
# repeated the same work for every row.
_STOPWORD_CACHE = {}


def _stopword_resources():
    """Return the cached (stop-word set, tokenizer) pair, building it once."""
    if not _STOPWORD_CACHE:
        _STOPWORD_CACHE['words'] = frozenset(stopwords.words('english'))
        _STOPWORD_CACHE['tokenizer'] = ToktokTokenizer()
    return _STOPWORD_CACHE['words'], _STOPWORD_CACHE['tokenizer']


def remove_stopwords(x):
    """Return ``x`` without its English stop words.

    The text is tokenized with ``ToktokTokenizer``, each token is stripped of
    surrounding whitespace, tokens found in NLTK's English stop-word list are
    dropped, and the remaining tokens are joined with single spaces.

    x -- the text to filter.
    Returns the filtered text.
    """
    stopword_list, tokenizer = _stopword_resources()
    tokens = (token.strip() for token in tokenizer.tokenize(x))
    return ' '.join(token for token in tokens if token not in stopword_list)

# Drop stop words from both splits, then preview the result.
for frame in (df_train, df_test):
    frame["question_text"] = frame["question_text"].apply(remove_stopwords)

df_train.head()

#  Lemmatization

A verb such as 'to walk' may appear as 'walk', 'walked', 'walks' or 'walking'. The base form, 'walk', that one might look up in a dictionary, is called the lemma of the word.

In [None]:
#Lemmatization
# Filled on first use and then reused. The function is applied to every
# question, so creating a lemmatizer and a tokenizer on each call repeated
# the same work for every row.
_LEMMA_CACHE = {}


def _lemma_resources():
    """Return the cached (lemmatizer, tokenizer) pair, building it once."""
    if not _LEMMA_CACHE:
        _LEMMA_CACHE['lemmatizer'] = WordNetLemmatizer()
        _LEMMA_CACHE['tokenizer'] = ToktokTokenizer()
    return _LEMMA_CACHE['lemmatizer'], _LEMMA_CACHE['tokenizer']


def lemma_text(x):
    """Return ``x`` with every token replaced by its WordNet lemma.

    The text is tokenized with ``ToktokTokenizer``, each token is stripped of
    surrounding whitespace and lemmatized, and the results are joined with
    single spaces.

    x -- the text to lemmatize.
    Returns the lemmatized text.

    NOTE(review): ``lemmatize`` is called without a ``pos`` argument. NLTK's
    default part of speech is presumably noun, in which case verb forms such
    as "walked" are left unchanged -- confirm against the NLTK documentation.
    """
    wordnet_lemmatizer, tokenizer = _lemma_resources()
    tokens = (token.strip() for token in tokenizer.tokenize(x))
    return ' '.join(wordnet_lemmatizer.lemmatize(token) for token in tokens)

# Lemmatize both splits, then preview the result.
for frame in (df_train, df_test):
    frame["question_text"] = frame["question_text"].apply(lemma_text)

df_train.head()

In [None]:
import zipfile
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras import callbacks
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers


# **Split the dataset into train, validation**
* 80% for training data
* 20% for validation (test) data

In [None]:
# Hold out 20% of the labelled data for validation (fixed seed for repeatability).
# NOTE(review): df_train is rebound to the 80% split, so re-running this cell
# without re-running the earlier cells splits the already reduced frame again.
df_train, val_df = train_test_split(df_train, test_size=0.2, random_state=42)

## Configuration values
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
### maxlen: length every sequence is brought to by pad_sequences below
maxlen = 100 # max number of words in a question to use


## Pull the question texts out as arrays (no missing values are filled here).
train_X =  df_train["question_text"].values
val_X = val_df["question_text"].values
test_X = df_test["question_text"].values


## Standard Keras preprocessing: turn each question into a list of word indices.
## The vocabulary is fitted on the training and validation texts.
## NOTE(review): fitting on the validation texts lets validation vocabulary
## influence the word indices -- confirm this is acceptable.
## val_X is rebound in place: raw strings -> index lists -> padded array.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X)+list(val_X))
train_X_tokens = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X_tokens = tokenizer.texts_to_sequences(test_X)


## Pad (or truncate) every sequence to maxlen so that all inputs have the same length.
train_X_pad = pad_sequences(train_X_tokens, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X_pad = pad_sequences(test_X_tokens, maxlen=maxlen)

## Target labels of the training and validation splits.
train_y = df_train['target'].values
val_y = val_df['target'].values

train_X_pad

# test_X_pad
# train_X_pad

In [None]:
#train_y

# Embedding

In [None]:
# Unpack only the GloVe files: './glove.840B.300d/...' is the only embedding
# path read in the visible cells, and the archive may bundle other embeddings.
GLOVE_PREFIX = "glove.840B.300d/"
with zipfile.ZipFile("../input/quora-insincere-questions-classification/embeddings.zip","r") as z:
    glove_members = [name for name in z.namelist() if name.startswith(GLOVE_PREFIX)]
    # members=None extracts everything, which keeps the previous behaviour if
    # the archive layout is not the expected one.
    z.extractall(".", members=glove_members or None)

In [None]:
### GloVe is a distributed word representation in which the distance between
### word vectors is related to semantic similarity.
# Path of the GloVe text file. The loading code below splits every line on
# spaces into a word followed by its vector components.
EMBEDDING_FILE ='./glove.840B.300d/glove.840B.300d.txt' 


def get_coefs(word,*arr):
    """Pair a word with its embedding vector.

    word -- the token.
    *arr -- the vector components, as numbers or numeric strings.
    Returns a ``(word, vector)`` tuple where ``vector`` is a float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Build the word -> vector dictionary. The file is opened in a `with` block so
# that the handle is closed once the dictionary has been built.
with open(EMBEDDING_FILE, encoding="utf8") as glove_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in glove_file)
# np.stack expects a sequence of arrays; a dict view is not one, so it is
# materialised as a list first.
all_embs = np.stack(list(embeddings_index.values()))
# Mean and standard deviation over all components, used later to initialise
# the rows of words that have no GloVe vector.
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

In [None]:
word_index = tokenizer.word_index

# Keras word indices start at 1 (0 is reserved, e.g. for padding), so a
# vocabulary of N words needs N + 1 rows. The matrix stays capped at
# max_features rows.
nb_words = min(max_features, len(word_index) + 1)


### embedding_matrix: row i holds the vector of the word whose index is i.
### Rows start from a random draw with the mean and standard deviation of the
### GloVe vectors; words that GloVe knows are then overwritten with their vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    # Bound the index by the real number of rows. Comparing against
    # max_features overran the matrix whenever the vocabulary was smaller
    # than max_features.
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [None]:
#inp = Input(shape=(maxlen,))
#x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
#x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
#x = GlobalMaxPool1D()(x)
#x = Dense(50, activation="relu")(x)
#x = Dropout(0.1)(x)
#x = Dense(1, activation="sigmoid")(x)
#model = Model(inputs=inp, outputs=x)
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#print(model.summary())


# Create a model

In [None]:
### A model is a function which gives us a near correct information (output) given that we've provided input data
from keras.models import Sequential
model = Sequential()
model.add(Embedding(max_features, embed_size , weights=[embedding_matrix], trainable=False))
model.add(LSTM(32
              ))
model.add(Dense(32,activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation="sigmoid"
         ))

### Loss function is a measure of how well our model’s outputs match the targets which we are trying to minimize.
### Regression => mean squared error  (mse = sum(y - y(predict))²), mean absolute error (MAE)
### Classfication => categorical cross entropy (multi-class data), binary cross entropy (for binary prediction)
### In our poblem  we use  binary crossentropy 

### Optimizer : algorithm that helps our loss function reach its convergence point with minimum
### the optimization algorithm varies the parameters of the model (weights and bias ).
### This operation is repeated until we find the values of the parameters, for which the Loss function is optimal.

model.compile(loss='binary_crossentropy', optimizer='adam' ,metrics=['accuracy'])
model.summary()


In [None]:
### Early stopping interrupts training once the model starts overfitting:
### after 5 epochs without improvement of the monitored validation quantity, training stops.
es = callbacks.EarlyStopping( patience=5 )

### The checkpoint callback keeps saving the model during training.
### save_best_only=True, save_weights_only=True => only the weights of the best epoch are kept on disk.
mc = callbacks.ModelCheckpoint('./w.h5', save_best_only=True, save_weights_only=True)

### Train in batches of 512 samples. `epochs` is only an upper bound: the run
### is expected to be ended by early stopping long before 20000 epochs.
### validation_data provides the held-out data that both callbacks monitor.
history = model.fit(train_X_pad, train_y, batch_size=512, epochs=20000, callbacks=[es, mc ], validation_data=(val_X, val_y))

### Early stopping halts `patience` epochs after the best one, so the weights
### left in memory are not the best ones. Reload the checkpointed best weights
### so that later predictions use them.
model.load_weights('./w.h5')

In [None]:
#y_test = model.predict([test_X_pad], batch_size=1024, verbose=1)
#y_test

In [None]:
#sample_submission = pd.read_csv('../input/quora-insincere-questions-classification/sample_submission.csv')
#sample_submission['prediction'] = y_test
#sample_submission.to_csv('submission.csv', index=False)
#sample_submission

In [None]:
## Predict on the test set, threshold the scores and write the submission CSV.
out = model.predict(test_X_pad,batch_size=256)
# A score above 0.35 is labelled 1 (insincere), anything else 0.
out_pred = (out>0.35).astype(int)

out_df = pd.DataFrame({"qid":df_test["qid"].values})
out_df['prediction'] = out_pred
out_df.to_csv("submission.csv", index=False)
out_df