In [None]:
# MSSV: 19020063
# Name: Cao Đình Hoàng Minh

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plot some graphs
from wordcloud import WordCloud, STOPWORDS
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/quora-insincere-questions-classification/train.csv",
        "../input/quora-insincere-questions-classification/test.csv",
    )
)

In [None]:
# Print a concise summary of each frame (columns, dtypes, non-null counts),
# separated by blank lines so the two reports are easy to tell apart.
train_data.info()
print("\n")
test_data.info()

In [None]:
# Preview the first rows of the training frame (rendered as the cell output).
train_data.head()

In [None]:
# Preview the first rows of the test frame (rendered as the cell output).
test_data.head()

In [None]:
# Split the training frame by label: target 0 is sincere, target 1 is insincere.
sincereQuestions = train_data.loc[train_data['target'] == 0]
insincereQuestions = train_data.loc[train_data['target'] == 1]

# Row counts of each class, reused by later cells.
numOfSincere = len(sincereQuestions)
numOfInsincere = len(insincereQuestions)

print("Number Of Sincere Questions:", numOfSincere)
print("Number Of Insincere Questions:", numOfInsincere)

In [None]:
# Pie chart of the class balance in the training data.
labels = ['Sincere Questions', 'Insincere Questions']
quantity = [numOfSincere, numOfInsincere]

fig, ax = plt.subplots()
ax.pie(quantity, labels=labels, autopct='%1.2f%%', shadow=False)
ax.set_title('Target Distribution')
plt.show()


In [None]:
# Word cloud built from every sincere question joined into one text.
sincere_corpus = " ".join(sincereQuestions.question_text)
sincereWordcloud = WordCloud(
    width=800,
    height=450,
    background_color='white',
    min_font_size=10,
).generate(sincere_corpus)

plt.figure(figsize=(16, 9), facecolor=None)
plt.imshow(sincereWordcloud)
plt.axis("off")
plt.title("Common Words in Sincere Questions", fontsize=30, color='k')
plt.tight_layout(pad=0)
plt.show();

In [None]:
# Word cloud built from every insincere question joined into one text.
insincere_corpus = " ".join(insincereQuestions.question_text)
insincereWordcloud = WordCloud(
    width=800,
    height=450,
    background_color='white',
    min_font_size=10,
).generate(insincere_corpus)

plt.figure(figsize=(16, 9), facecolor=None)
plt.imshow(insincereWordcloud)
plt.axis("off")
plt.title("Common Words in Insincere Questions", fontsize=30, color='k')
plt.tight_layout(pad=0)
plt.show();

In [None]:
# Positional train/validation split: the first `train_ratio` share of the rows
# of train.csv is used for training and the remainder for validation.
# The rows are taken in file order; no shuffling is applied here.
train_ratio = 0.8
valid_ratio = 1 - train_ratio
numOfTrain = int(train_ratio * (numOfSincere + numOfInsincere))

# Slice the columns once instead of looking rows up one at a time with `.loc[i]`:
# `.iloc` is positional, so this also works when the frame's index is not 0..n-1,
# and `.tolist()` yields plain Python objects exactly like the per-row loop did.
train_sen = train_data['question_text'].iloc[:numOfTrain].tolist()
val_sen = train_data['question_text'].iloc[numOfTrain:].tolist()
test_sen = test_data['question_text'].tolist()

# Labels are stored as Python floats, split at the same position as the text.
train_label = train_data['target'].iloc[:numOfTrain].astype(float).tolist()
val_label = train_data['target'].iloc[numOfTrain:].astype(float).tolist()
        

In [None]:
allStopWords = "a abaft abafter abaftest about abouter aboutest above abover abovest accordingly aer aest afore after afterer afterest afterward afterwards again against aid ain albeit all aller allest alls allyou almost along alongside already also although always am amid amidst among amongst an and andor anear anent another any anybody anyhow anyone anything anywhere apart aparter apartest appear appeared appearing appears appropriate appropriated appropriater appropriates appropriatest appropriating are aren aren't ares around as ases aside asides aslant astraddle astraddler astraddlest astride astrider astridest at athwart atop atween aught aughts available availabler availablest awfully b be became because become becomes becoming becominger becomingest becomings been before beforehand beforehander beforehandest behind behinds being below beneath beside besides better bettered bettering betters between betwixt beyond bist both but buts by by-and-by byandby c can cannot canst cant canted cantest canting cants cer certain certainer certainest cest chez circa co come-on come-ons comeon comeons concerning concerninger concerningest consequently considering could couldn couldn't couldst cum d dday ddays describe described describes describing despite despited despites despiting did didn didn't different differenter differentest do doe does doesn doesn't doing doings don don't done doner dones donest dos dost doth down downs downward downwarder downwardest downwards during e each eg eight either else elsewhere enough ere et etc even evened evenest evens evenser evensest ever every everybody everyone everything everywhere ex except excepted excepting excepts exes f fact facts failing failings few fewer fewest figupon figuponed figuponing figupons five followthrough for forby forbye fore forer fores forever former formerer formerest formerly formers fornenst forwhy four fourscore frae from fs further furthered furtherer furtherest furthering furthermore furthers g get 
gets getting go gone good got gotta gotten h had hadn hadn't hadst hae hardly has hasn hasn't hast hath have haven haven't haves having he hence her here hereafter hereafters hereby herein hereupon hers herself him himself his hither hitherer hitherest hoo hoos how how-do-you-do howbeit howdoyoudo however huh humph i idem idemer idemest ie if ifs immediate immediately immediater immediatest in inasmuch inc indeed indicate indicated indicates indicating info information insofar instead into inward inwarder inwardest inwards is isn isn't it it's its itself j just k l latter latterer latterest latterly latters layabout layabouts less lest ll lot lots lotted lotting m ma main make many mauger maugre mayest me meanwhile meanwhiles midst midsts might mightn mightn't mights more moreover most mostly much mucher muchest must musth musths mustn mustn't musts my myself n natheless nathless neath neaths necessarier necessariest necessary needn needn't neither nethe nethermost never nevertheless nigh nigher nighest nine no no-one nobodies nobody noes none noone nor nos not nothing nothings notwithstanding now nowhere nowheres o of off offest offs often oftener oftenest oh on once one oneself onest only ons onto or orer orest other others otherwise otherwiser otherwisest ought oughts our ours ourself ourselves out outed outest outs outside outwith over overall overaller overallest overalls overs own owned owning owns owt p particular particularer particularest particularly particulars per perhaps plaintiff please pleased pleases plenties plenty pro probably provide provided provides providing q qua que quite r rath rathe rather rathest re really regarding relate related relatively res respecting respectively s said saider saidest same samer sames samest sans sanserif sanserifs sanses saved sayid sayyid seem seemed seeminger seemingest seemings seems send sent senza serious seriouser seriousest seven several severaler severalest shall shalled shalling shalls shan shan't she 
she's should should've shoulded shoulding shouldn shouldn't shoulds since sine sines sith six so sobeit soer soest some somebody somehow someone something sometime sometimer sometimes sometimest somewhat somewhere stop stopped such summat sup supped supping sups syn syne t ten than that that'll the thee their theirs them themselves then thence thener thenest there thereafter thereby therefore therein therer therest thereupon these they thine thing things this thises thorough thorougher thoroughest thoroughly those thou though thous thouses three thro through througher throughest throughout thru thruer thruest thus thy thyself till tilled tilling tills to together too toward towarder towardest towards two u umpteen under underneath unless unlike unliker unlikest until unto up upon uponed uponing upons upped upping ups us use used usedest username usually v various variouser variousest ve verier veriest versus very via vis-a-vis vis-a-viser vis-a-visest viz vs w was wasn wasn't wast we were weren weren't wert what whatever whateverer whateverest whatsoever whatsoeverer whatsoeverest wheen when whenas whence whencesoever whenever whensoever where whereafter whereas whereby wherefrom wherein whereinto whereof whereon wheresoever whereto whereupon wherever wherewith wherewithal whether which whichever whichsoever while whiles whilst whither whithersoever who whoever whom whomever whose whoso whosoever why will with withal within without won won't would woulded woulding wouldn wouldn't woulds x y ye yet yon yond yonder you you'd you'll you're you've your yours yourself yourselves z zillion"
stopWords = allStopWords.split(" ")
# Set built from the module-level `stopWords` list, remembered together with the
# list object it was built from so it is rebuilt if `stopWords` is reassigned.
# (In-place edits of the same list object are not detected.)
_stop_word_cache = {"source": None, "lookup": frozenset()}


def _default_stop_lookup():
    """Return the global ``stopWords`` as a frozenset for O(1) membership tests."""
    if _stop_word_cache["source"] is not stopWords:
        _stop_word_cache["source"] = stopWords
        _stop_word_cache["lookup"] = frozenset(stopWords)
    return _stop_word_cache["lookup"]


def removeStop(sentence, stop_words=None):
    """Drop stop words from ``sentence``.

    The sentence is split on single spaces and every token that exactly
    (case-sensitively) equals a stop word is removed. Each kept token is
    followed by one space, so a non-empty result always ends with a space and
    empty tokens produced by consecutive spaces are preserved.

    Args:
        sentence: Text to filter; must be a ``str``.
        stop_words: Optional iterable of words to remove. When omitted, the
            module-level ``stopWords`` list is used.

    Returns:
        The filtered text as a ``str``.
    """
    if stop_words is None:
        lookup = _default_stop_lookup()
    elif isinstance(stop_words, (set, frozenset)):
        lookup = stop_words
    else:
        lookup = frozenset(stop_words)
    # A set membership test replaces the former list.index()/bare-except probe.
    return "".join(word + " " for word in sentence.split(" ") if word not in lookup)

In [None]:
def _clean_in_place(sentences):
    """Replace every entry of ``sentences`` with its stop-word-filtered text.

    Entries that are not ``str`` are converted with ``str()`` first, then each
    entry is passed through ``removeStop``. The list object itself is kept and
    its contents are overwritten.
    """
    sentences[:] = [
        removeStop(entry if isinstance(entry, str) else str(entry))
        for entry in sentences
    ]


# Apply the same cleaning to all three splits instead of repeating the loop.
for sentence_split in (train_sen, val_sen, test_sen):
    _clean_in_place(sentence_split)

In [None]:
# Settings shared by the tokenizer, the padding step and the embedding layer.
vocab_size = 30_000     # tokenizer `num_words` and Embedding input size
embedding_dim = 64      # Embedding output size
max_length = 64         # `maxlen` used when padding/truncating sequences

# Both padding and truncation are applied at the end of a sequence.
pad_type = 'post'
trunc_type = 'post'

# Placeholder token the tokenizer uses for out-of-vocabulary words.
oov_tok = "<OOV>"

In [None]:
# Fit the vocabulary on the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sen)

# Turn the training sentences into integer sequences of a fixed length.
train_seq = tokenizer.texts_to_sequences(train_sen)
train_pad = pad_sequences(
    train_seq,
    maxlen=max_length,
    padding=pad_type,
    truncating=trunc_type,
)

In [None]:
# Encode the validation and test sentences with the already-fitted tokenizer
# and pad/truncate them the same way as the training sequences.
val_seq = tokenizer.texts_to_sequences(val_sen)
val_pad = pad_sequences(val_seq, maxlen=max_length, padding=pad_type, truncating=trunc_type)

test_seq = tokenizer.texts_to_sequences(test_sen)
test_pad = pad_sequences(test_seq, maxlen=max_length, padding=pad_type, truncating=trunc_type)

# `shape` is documented as a tuple that excludes the batch dimension, so the
# padded sequence length is passed as a 1-tuple rather than a bare int.
input1 = keras.Input(shape=(train_pad.shape[1],))

# Token ids -> dense vectors -> one LSTM feature vector per question.
embedded_vector = layers.Embedding(vocab_size, embedding_dim)(input1)
lstm_features = layers.LSTM(64)(embedded_vector)

# Sub-model exposing the LSTM features; the classifier head is stacked on its output.
hidden_class_1 = keras.Model(inputs=input1, outputs=lstm_features)

hidden_class = hidden_class_1.output
# Two-way softmax: index 0 = sincere, index 1 = insincere (matches the `target` values).
output = layers.Dense(2, activation='softmax')(hidden_class)

model = keras.Model(inputs=[hidden_class_1.input], outputs=output)

model.summary()

# Keras expects array-like labels; convert the Python lists once here.
train_label = np.array(train_label)
val_label = np.array(val_label)

In [None]:
# Integer class labels with a softmax output -> sparse categorical cross-entropy.
loss_function = keras.losses.SparseCategoricalCrossentropy()

model.compile(
    optimizer='Adam',
    loss=loss_function,
    metrics=['accuracy'],
)

# Train on the padded training sequences; the per-epoch metrics are kept in `history`.
history = model.fit(
    x=[train_pad],
    y=train_label,
    batch_size=32,
    epochs=5,
)


In [None]:
# The model is compiled with a loss plus the 'accuracy' metric, so `evaluate`
# returns [loss, accuracy]. The data scored here is the validation split.
val_loss, val_acc = model.evaluate(x=[val_pad], y=val_label, verbose=1)

# Legacy names kept so any code reading them keeps working.
# Note that `test_recall` holds the loss value, not a recall score.
test_recall, test_acc = val_loss, val_acc

In [None]:
# Class probabilities for the test questions; column 1 is the insincere class.
pred_y = model.predict([test_pad], batch_size=1024, verbose=1)

# Label a question insincere (1) when its insincere probability is at least 0.5.
# The previous `(p * 2).astype(int)` truncation produced the invalid label 2
# whenever the probability was exactly 1.0; a direct threshold only yields 0 or 1.
# `ans` stays a plain list because a later cell calls `ans.count(1)`.
ans = (pred_y[:, 1] >= 0.5).astype(int).tolist()
    

In [None]:
# Assemble the submission table: one row per test question id with its predicted label.
data_submit = pd.DataFrame(
    {
        "qid": test_data["qid"].values,
        "prediction": ans,
    }
)
data_submit.to_csv("submission.csv", index=False)
data_submit

In [None]:
# Number of test questions whose predicted label is 1.
predicted_insincere = ans.count(1)
print(predicted_insincere)