In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/embeddings.zip
/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/test.csv


In [2]:
%time !unzip /kaggle/input/quora-insincere-questions-classification/embeddings.zip

Archive:  /kaggle/input/quora-insincere-questions-classification/embeddings.zip
   creating: GoogleNews-vectors-negative300/
   creating: glove.840B.300d/
   creating: paragram_300_sl999/
   creating: wiki-news-300d-1M/
  inflating: glove.840B.300d/glove.840B.300d.txt  
  inflating: GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin  
  inflating: wiki-news-300d-1M/wiki-news-300d-1M.vec  
  inflating: paragram_300_sl999/README.txt  
  inflating: paragram_300_sl999/paragram_300_sl999.txt  
CPU times: user 2.21 s, sys: 498 ms, total: 2.71 s
Wall time: 2min 59s


In [115]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import fbeta_score

import tensorflow as tf
import nltk
from nltk.corpus import stopwords
import re
import string
from keras.models import Sequential
from keras.preprocessing import text
from keras.utils import pad_sequences
from keras.layers import Embedding, Dense , SimpleRNN , Dropout , LSTM , Bidirectional
from tensorflow.keras.optimizers.schedules import PolynomialDecay

import plotly.express as px
from tqdm._tqdm_notebook import tqdm_notebook as tqdm

In [4]:
import operator 

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [5]:
def clean_text(text):
#    Make the test lowercase
    text = text.lower()
    
#     Remove Text in Square Brackets
    text = re.sub("\[.*\]",'',text)
#     print("text after removing characters in square brackets is  ",text)
#     Remove links
    text = re.sub("https?://\S+|www\.\S+",'',text)
    text = re.sub('<.*?>+', '', text)
#     Remove punctuation
    text = text.translate(str.maketrans('','',string.punctuation))
#     print("text after removing punc  is ",text)
#     remove new line
    text = re.sub('\n','',text)
#     Remove words containing numbers
    text = re.sub('\w*\d\w*','',text)
#     Replace double spaces with single space
    text = re.sub('\s+'," ",text)
    return text

stemmer = nltk.SnowballStemmer("english")
def stemm_text(text):
    text = ' '.join([stemmer.stem(word) for word in text.split(" ")])
    return text


# Removing Stop Words
stopwords_list = stopwords.words('english')
def remove_stop_words(text):
    text = ' '.join([word  for word in text.split(" ") if word not in stopwords_list]  )
    return text

def preprocess_text(text):
    text = clean_text(text)
#     text = remove_stop_words(text)
#     text = stemm_text(text)
    return text


In [None]:
def get_word_embed_from_glove(file_path):
    w2v_map = {}
    with open(file_path,'r',encoding='UTF-8') as f :
        for line in f :
            line_values = line.split(" ")
            curr_word = line_values[0]
            curr_word_embed = line_values[1:]
            w2v_map[curr_word] = np.array(curr_word_embed  ,dtype=np.float64)
    return w2v_map

In [None]:
%time w2v_map = get_word_embed_from_glove("glove.840B.300d/glove.840B.300d.txt")

In [30]:
train_df = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/train.csv")
test_df = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/test.csv")

In [16]:
latin_similar = "’'‘ÆÐƎƏƐƔĲŊŒẞÞǷȜæðǝəɛɣĳŋœĸſßþƿȝĄƁÇĐƊĘĦĮƘŁØƠŞȘŢȚŦŲƯY̨Ƴąɓçđɗęħįƙłøơşșţțŧųưy̨ƴÁÀÂÄǍĂĀÃÅǺĄÆǼǢƁĆĊĈČÇĎḌĐƊÐÉÈĖÊËĚĔĒĘẸƎƏƐĠĜǦĞĢƔáàâäǎăāãåǻąæǽǣɓćċĉčçďḍđɗðéèėêëěĕēęẹǝəɛġĝǧğģɣĤḤĦIÍÌİÎÏǏĬĪĨĮỊĲĴĶƘĹĻŁĽĿʼNŃN̈ŇÑŅŊÓÒÔÖǑŎŌÕŐỌØǾƠŒĥḥħıíìiîïǐĭīĩįịĳĵķƙĸĺļłľŀŉńn̈ňñņŋóòôöǒŏōõőọøǿơœŔŘŖŚŜŠŞȘṢẞŤŢṬŦÞÚÙÛÜǓŬŪŨŰŮŲỤƯẂẀŴẄǷÝỲŶŸȲỸƳŹŻŽẒŕřŗſśŝšşșṣßťţṭŧþúùûüǔŭūũűůųụưẃẁŵẅƿýỳŷÿȳỹƴźżžẓ"
white_list = string.ascii_letters + string.digits + latin_similar + ' '
white_list += "'"
white_list

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789’'‘ÆÐƎƏƐƔĲŊŒẞÞǷȜæðǝəɛɣĳŋœĸſßþƿȝĄƁÇĐƊĘĦĮƘŁØƠŞȘŢȚŦŲƯY̨Ƴąɓçđɗęħįƙłøơşșţțŧųưy̨ƴÁÀÂÄǍĂĀÃÅǺĄÆǼǢƁĆĊĈČÇĎḌĐƊÐÉÈĖÊËĚĔĒĘẸƎƏƐĠĜǦĞĢƔáàâäǎăāãåǻąæǽǣɓćċĉčçďḍđɗðéèėêëěĕēęẹǝəɛġĝǧğģɣĤḤĦIÍÌİÎÏǏĬĪĨĮỊĲĴĶƘĹĻŁĽĿʼNŃN̈ŇÑŅŊÓÒÔÖǑŎŌÕŐỌØǾƠŒĥḥħıíìiîïǐĭīĩįịĳĵķƙĸĺļłľŀŉńn̈ňñņŋóòôöǒŏōõőọøǿơœŔŘŖŚŜŠŞȘṢẞŤŢṬŦÞÚÙÛÜǓŬŪŨŰŮŲỤƯẂẀŴẄǷÝỲŶŸȲỸƳŹŻŽẒŕřŗſśŝšşșṣßťţṭŧþúùûüǔŭūũűůųụưẃẁŵẅƿýỳŷÿȳỹƴźżžẓ '"

In [17]:
glove_chars = ''.join([c for c in tqdm(w2v_map) if len(c) == 1])
glove_symbols = ''.join([c for c in glove_chars if not c in white_list])
glove_symbols

  0%|          | 0/2196016 [00:00<?, ?it/s]

',.":)(-!?|;$&/[]>%=#*+\\•~@£·_{}©^®`<→°€™›♥←×§″′█½…“★”–●►−¢²¬░¡¶↑±¿▾═¦║―¥▓—‹─▒：¼⊕▼▪†■▀¨▄♫☆¯♦¤▲¸¾⋅∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤¹≤‡√◄━⇒▶º≥╝♡◊。✈≡☺✔↵≈✓♣☎℃◦└‟～！○◆№♠▌✿▸⁄□❖✦．÷｜┃／￥╠↩✭▐☼µ☻┐├«∼┌℉☮฿≦♬✧〉－⌂✖･◕※‖◀‰\x97↺∆┘┬╬،⌘⊂ª＞〈⎙Å？☠⇐▫∗∈≠♀ƒ♔˚℗┗＊┼❀＆∩♂‿∑‣➜┛⇓☯⊖☀┳；∇⇑✰◇♯☞´↔┏｡◘∂✌♭┣┴┓✨\xa0ˈ˜❥┫℠✒［∫\x93≧］\x94∀♛\x96∨◎ˑ↻⅓⇩＜≫✩ˆ✪♕؟₤☛╮␊＋┈ɡ％╋▽⇨┻⊗￡।▂✯▇＿➤₂✞＝▷△◙▅✝ﾟ∧␉☭┊╯☾➔∴\x92▃↳＾׳➢╭➡＠⊙☢˝⅛∏„①๑∥❝☐▆╱⋙๏☁⇔▔\x91②➚◡╰٠\x85♢˙۞✘✮☑⋆ℓⓘ❒☣✉⌊➠∣❑⅔◢ⓒ\x80〒∕▮⦿✫✚⋯♩☂ˌ❞‗܂☜‾✜╲∘⟩＼⟨·⅜✗♚∅ⓔ◣͡‛❦⑨③◠✄❄１∃␣≪｢≅◯☽２∎｣⁰❧̅ǡⒶ↘⚓▣˘∪⇢✍⊥＃⅝⎯↠۩☰◥⊆✽ﬁ⚡↪ở❁☹◼☃◤❏ⓢ⊱α➝̣✡∠｀▴┤Ȃ∝♏ⓐ✎;３④␤＇❣⅞✂✤ⓞ☪✴⌒˛♒＄ɪ✶▻Ⓔ◌◈۲Ʈ❚ʿ❂￦◉╜̃ν✱╖❉₃ⓡℝ٤↗❶ʡ۰ˇⓣ♻➽۶₁ʃ׀✲ʤ✬☉▉≒☥⌐♨✕ⓝ⊰❘＂⇧̵➪４▁β۱▏⊃ⓛ‚♰́✏⏑̶٩Ⓢー⩾日￠❍≃⋰♋ɿ､̂❋✳ⓤ╤▕⌣✸℮⁺▨⑤╨Ⓥ♈❃☝５✻⊇≻♘♞◂７✟⌠✠☚✥❊ƂⒸ⌈❅Ⓡ♧Ⓞɑλ۵▭❱Ⓣ∟☕♺∵⍝ⓑɔ✵✣ℤ年ℕ٭♆Ⓘⅆ∶⚜◞்✹Ǥȡ➥ᴥ↕ɂ̳∷✋➧∋̿ͧʘ┅⥤⬆ǀμ₄⋱ʔ☄↖⋮۔♌Ⓛ╕♓ـ⁴❯♍▋✺⭐６✾♊➣▿Ⓑ♉Ａ⏠◾▹⑥⩽в↦╥⍵⌋։➨и∮⇥ⓗⒹ⁻ʊ⎝⌥⌉◔◑ǂ✼♎ℂ♐╪ɨ⊚☒⇤θВⓜ⎠Ｏ◐ǰ⚠╞ﬂ◗⎕ⓨ☟Ｉⓟ♟❈↬ⓓ◻♮❙а♤∉؛⁂例Ⓝ־♑╫╓╳⬅☔πɒɹ߂☸ɐʻ┄╧ʌ׃８ʒ⎢❆⋄⚫̏☏➞͂␙Ⓤ◟Ƥʕ̊Ȥ⚐✙は↙̾ωΔ℘ﾞ✷⑦φ⍺❌⊢▵✅ｗ９ⓖ☨▰ʹ╡Ⓜ☤∽╘˹↨ȿ♙⬇♱⌡Ω⠀╛❕┉Ⓟ̀Ǩ♖ⓚ┆⑧⎜ǹ◜⚾⤴✇╟⎛☩➲➟ⓥⒽ⏝◃０₀╢月↯✆˃⍴❇⚽╒Ｃɻɤ̸♜☓Ｔ➳⇄γ☬⚑✐⁵δȭ⌃◅▢ｓȸ❐∊☈ⅇℜ॥σ⎮ȣ▩のτεＳு⊹‵␔☊➸̌☿⇉➊⊳╙⁶ⓦ⇣｛̄↝⎟ℳ▍❗ℑＭɾｍ״Γ΄▞◁⛄⇝⎪ˤ♁ｖ⇠☇✊位ℒạி｝๐⭕➘Ｂ❺ɸˡ⁀⑩ｃ⅕Ƽ۳☙❛₆ƪ❓⟲Ʒ⇀≲Ｐ❷١ⓕ⎥Ｄс\u06ddǥͤ₋̱̎♝≳▙Ｒʹ➭ℰ܀ʺȫ

In [19]:
quora_chars = build_vocab(list(train_df["question_text"]))
quora_symbols = ''.join([c for c in quora_chars if not c in white_list])
quora_symbols

  0%|          | 0/1306122 [00:00<?, ?it/s]

'?,./-()"$=…*&+′[ɾ̃]ɖ%:^\xa0\\{}–“”ऋॠऌॡ\u200b;!谢六佬<`मिलगईकेजोठंड®ạệ°#صور²|~₹√_α→>—£啧，您这口味奇特也就罢了非要以此为依据对人家批判一番不地道啊。️´×@π÷\ufeff？ʿ€有毒黎氏玉英欢迎入坑へのとも↑∞ʻ℅нмѕтвυι•−Еелядурак新年快乐！学业进步身体健康们读我的翻译篇章لكمقا∈∩⊆अनुस्वारत§℃標點符號\xadθ±≤उदयबट͜͡ʖ⁴™спибог≠∂आहभी³च能化生水谷精微ฉันจะทำให้ดี่สุΠपऊ请国知识产权局خبஜோடி大「寧可錯殺千絕放過」之勢ẽ½△¿북한ЬЪ\u202a¼∆≥⇒¬∨∫▾Ω＾江青γ鸡汤文粵拼µ\u202cशथझौूओºष♭封柏荣़ルージュ⃗̂ɔ∑\u2061রড়ঢ়સંઘરાજ્યεντσ配信開始日商品発売分子Ψ创意梦工坊کپڤ蘭花羡慕和嫉妒是样ןΓ∪φψ⊨βैごめなさいすみません音红宝书ستطيعةد支那¹マリ仲直りした主席\u2060血∘¨″⅓ढ漢髪金茶訓読黒あわるɑː胡南✅✓수능（广电总）κί서로가를행복하게기乡故∠«»爾汝言حஆய்தஎழு得理让λ∧∀骂،＝野比び太ɦʏɨʀօʍռքʋ兰ϵδɒ後宮甄嬛傳ᠠᡳ᠌ᠰᠨᡤᡠᡵếधड़¸ч☹ễộफμ做莫你酱紫Δ내제ʃɸợ甲骨陳宗陈什么说ㅜΟςοηΣण伊藤長₅₆ﷺ僕だけが街◦火团表·औछए看他顺眼中華民國فنḵПьВΦỵ許自東☺ℇ❤♨✌儿臣惶恐っ木ホج⧼⧽মহাবিশ্পতনয়সচছেষয়টউথকῥζὤ教官국고등학교는몇시간업니까ಸ್ವರಕಷಗಳು本語上手でねἈχῆύኤልሮኢየኝንአሁ台湾最美风景≡皎滢杨\u200e∛簡訊短送發お早う朝شه饭乱吃话讲ấ눈치男女授受亲یМюʌʊיהודת好心没报கவாம≈⁰⁷攻克ख禮儀統已經失存٨八‛字：别高兴还几个条件呢ਨਾਮੁ觀《》ﬁて„宋楚瑜孫瀛枚无挑剔¾ЯхОз聖部頭合約ρ⌚⟨⟩∖˂油腻邋遢ٌ射籍贯ό老常谈ⁿ⅔నీకెందుాగరిచ族伟复平天下悠堵阻≅ϕ‑愛\x7f过会ả￼ֿかくれ－俄罗斯茹西亚싱관없어나이키夢彩蛋︠︡鰹節狐狸鳳凰露ἰή＞ξ王晓菲ணபளன恋に落ちらよ悲反清復明肉希望沒公病តើបង្អូនមាធយវីខលះដរកឃញឯសំពិៃទគ¢つや記คณกลงอไร㏒㏑⁸구경용方记账数与贵致核误ʒ감겨드려유أضئزىघはじゅ眞선배

In [23]:
symbols_to_delete = ''.join([c for c in quora_symbols if not c in glove_symbols])
symbols_to_delete

'ɖऋॠऌॡ\u200b谢六佬मिलगईकेजोठंडệصور₹啧您这口味奇特也就罢了非要以此为依据对人家批判一番不地道啊️\ufeff有毒黎氏玉英欢迎入坑へともнмѕтυιЕелядурк新快乐学业进步身体健康们读我的翻译篇章لكمقاअनुस्वारत標點符號\xadउदयबट͜ʖпбогआहभीच能化生水谷精微ฉันจะทำให้ดี่สุΠपऊ请国知识产权局خبஜோட大「寧可錯殺千絕放過」之勢ẽ북한ЬЪ\u202a江青鸡汤文粵拼\u202cशथझौूओष封柏荣़ルジュ⃗\u2061রড়ঢ়સંઘરાજ્ય配信開始商品発売分子Ψ创意梦工坊کپڤ蘭花羡慕和嫉妒是样ןψ⊨ैごめなさいすみません音红宝书ستطيعةد支那マリ仲直りした主席\u2060血ढ漢髪金茶訓読黒あわる胡南수능广电总κί서로가를행복하게기乡故爾汝言حஆயதஎழ得理让骂野比び太ɦʏʀօʍռք兰ϵ後宮甄嬛傳ᠠᡳ᠌ᠰᠨᡤᡠᡵếधड़чễộफ做莫你酱紫내제ợ甲骨陳宗陈什么说ㅜΟςοण伊藤長ﷺ僕だけが街火团表औछए看他顺眼中華民國فنḵПьỵ許自東ℇ儿臣惶恐っ木ホج⧼⧽মহাবিশ্পতনয়সচছেষয়টউথকῥζὤ教官국고등학교는몇시간업니까ಸ್ವರಕಷಗಳು本語上手でねἈχῆύኤልሮኢየኝንአሁ台湾最美风景皎滢杨\u200e∛簡訊短送發お早う朝شه饭乱吃话讲ấ눈치男女授受亲یМюיהודת好心没报கவம⁷攻克ख禮儀統已經失存٨八字别高兴还几个条件呢ਨਾਮੁ觀《》て宋楚瑜孫瀛枚无挑剔ЯхОз聖部頭合約⌚∖油腻邋遢ٌ射籍贯ό老常谈ⁿనీకెందుాగరిచ族伟复平天下悠堵阻ϕ‑愛\x7f过会ả￼ֿかくれ俄罗斯茹西亚싱관없어나이키夢彩蛋︠︡鰹節狐狸鳳凰露ἰήξ王晓菲ணபளன恋に落ちらよ悲反清復明肉希望沒公病តើបង្អូនមាធយវីខលះដរកឃញឯសំពិៃទគつや記คณกลงอไร㏒㏑구경용方记账数与贵致核误감겨드려유أضئزىघじゅ眞선배님ủậờố్ల虞蝴蝶篆隶小煮立沸騰害怕毛悚然ṛ实名买票制Ʃௌரइ唐樂えநஉறைசଭଜାכבםנא败胜队Кыйщ正弦歪果仁研究协河湖ữ謝汽车ṃ₱どぜき独善ゆそ唯尊雾草殤飯糰ृứ杯懸命פרΝ포술刀এখঅধীদইওো陆同胞问题只针外友次统计并裂图来坚持原则灣如達願傾聽但堅個則決對獨論ῤСШㄹ性ਲੜ镇観世菩薩ぼ脚踏\n母词屠城ъʰ五党军२機嫌悪オラ出話今冗談笑気ゃ何聞メエ苦痛

In [25]:
symbols_to_isolate = ''.join([c for c in quora_symbols if c in glove_symbols])
symbols_to_isolate

'?,./-()"$=…*&+′[ɾ̃]%:^\xa0\\{}–“”;!<`®ạ°#²|~√_α→>—£，。´×@π÷？ʿ€の↑∞ʻ℅в•−а年！∈∩⊆§℃θ±≤͡⁴™си≠∂³ி½△¿¼∆≥⇒¬∨∫▾Ω＾γµº♭ー̂ɔ∑εντσ日Γ∪φβ¹∘¨″⅓ɑː✅✓（）∠«»்ுλ∧∀،＝ɨʋδɒ¸☹μΔʃɸηΣ₅₆◦·ВΦ☺❤♨✌≡ʌʊா≈⁰‛：ﬁ„¾ρ⟨⟩˂⅔≅－＞¢⁸ʒは⬇♀؟¡⋅ɪ₁₂ɤ◌ʱ、▒ْ；☉＄∴✏ωɹ̅।ـ☝♏̉̄♡₄∼́̀⁶⁵¦¶ƒˆ‰©¥∅・ﾟ⊥ª†ℕ│ɡ∝♣／☁✔❓∗➡ℝ位⎛⎝¯⎞⎠↓ɐ∇⋯˚⁻ˈ₃⊂˜̸̵̶̷̴̡̲̳̱̪̗̣̖̎̿͂̓̑̐̌̾̊̕\x92'

In [26]:
isolate_dict = {ord(c):f' {c} ' for c in symbols_to_isolate}
remove_dict = {ord(c):f'' for c in symbols_to_delete}


def handle_punctuation(x):
    x = x.translate(remove_dict)
    x = x.translate(isolate_dict)
    return x

In [32]:
train_df["clean_text"] = train_df['question_text'].apply(lambda x:handle_punctuation(x))
test_df['clean_text'] = test_df['question_text'].apply(lambda x:handle_punctuation(x))

In [40]:
from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

def handle_contractions(x):
    x = tokenizer.tokenize(x)
    x = ' '.join(x)
    return x

train_df['clean_text'] = train_df['clean_text'].apply(lambda x:handle_contractions(x))
test_df['clean_text'] = test_df['clean_text'].apply(lambda x:handle_contractions(x))

In [48]:
def fix_quote(x):
    x = [x_[1:] if x_.startswith("'") else x_ for x_ in x]
    x = ' '.join(x)
    return x

train_df['clean_text'] = train_df['clean_text'].apply(lambda x:fix_quote(x.split()))
test_df['clean_text'] = test_df['clean_text'].apply(lambda x:fix_quote(x.split()))

In [5]:
# %time train_df['clean_text']=train_df['question_text'].apply(preprocess_text)
# train_1df = train_df[train_df['target']==1]
# train_0df = train_df[train_df['target']==0].sample(100000)
# train_df = pd.concat([train_1df,train_0df],axis=0)

CPU times: user 47 s, sys: 84.2 ms, total: 47.1 s
Wall time: 47.1 s


In [6]:
# %time test_df['clean_text']=test_df['question_text'].apply(preprocess_text)

CPU times: user 13.5 s, sys: 28.9 ms, total: 13.5 s
Wall time: 13.5 s


In [50]:
test_df.shape,train_df.shape,train_df['qid'].nunique()

((375806, 3), (1306122, 4), 1306122)

In [51]:
X = train_df.drop(['target'],axis=1)
y = train_df['target']

x_train,x_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2,stratify=y)
x_train.shape,x_valid.shape,y_train.shape,y_valid.shape

((1044897, 3), (261225, 3), (1044897,), (261225,))

# Text Preprocessing

In [52]:
text_feat= "clean_text"

tokenizer = text.Tokenizer()
max_len = 50

tokenizer.fit_on_texts(list(x_train[text_feat])+list(x_valid[text_feat]))
x_train_seq = tokenizer.texts_to_sequences(x_train[text_feat])
x_valid_seq = tokenizer.texts_to_sequences(x_valid[text_feat])

x_train_pad = pad_sequences(x_train_seq,maxlen=max_len)
x_valid_pad = pad_sequences(x_valid_seq,maxlen=max_len)

word_index = tokenizer.word_index

In [57]:
words_without_embedding = []
embedding_matrix = np.zeros((len(word_index)+1,len(w2v_map['moon'])))
for word, index in word_index.items():
    w2v_embedding = w2v_map.get(word)
    if w2v_embedding is not None : 
        embedding_matrix[index,:] = w2v_embedding       
    else :
        words_without_embedding.append(word)
#         and len(w2v_embedding)==len(w2v_map['moon'])

In [58]:
len(words_without_embedding)/len(word_index)

0.3584614116774486

In [131]:
vocab = build_vocab(list(x_train['clean_text'].apply(lambda x : x.split(" "))))
oov = check_coverage(vocab,w2v_map)
oov[20:30]

  0%|          | 0/1044897 [00:00<?, ?it/s]

  0%|          | 0/210118 [00:00<?, ?it/s]

Found embeddings for 77.59% of vocab
Found embeddings for  99.54% of all text


[("Qur'an", 57),
 ('Upwork', 53),
 ('Zerodha', 51),
 ('Doklam', 51),
 ('LNMIIT', 50),
 ('MUOET', 49),
 ('bhakts', 49),
 ('Kavalireddi', 49),
 ('NICMAR', 46),
 ('Vajiram', 46)]

# with LSTM

In [154]:
model = Sequential()
model.add(Embedding(len(word_index)+1, 300 ,input_length= max_len,weights=[embedding_matrix]))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(rate=0.1))
model.add(Dense(1,activation='sigmoid'))

In [155]:
batch_size = 64*8
no_of_batches = (len(x_train)//(64*8))+1
epochs = 3
num_train_steps =  no_of_batches* epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=1e-3, end_learning_rate=5e-5, decay_steps=num_train_steps
)

adam = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

model.compile(optimizer=adam,loss='binary_crossentropy',metrics=['accuracy'])
model.fit(x_train_pad,y_train, epochs=epochs,batch_size= batch_size)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f953e1c20b0>

In [156]:
y_proba = model.predict(x_valid_pad)
y_proba = y_proba.reshape(len(y_proba),)
y_proba



array([0.00899371, 0.00087061, 0.00017448, ..., 0.0009018 , 0.06950182,
       0.00025556], dtype=float32)

In [157]:
thresholds = np.linspace(0,1,num=101)

f1_scores = []
for thres in tqdm(thresholds) :
    y_pred = [1 if val > thres else 0  for val in y_proba   ]
    f1_score = fbeta_score(y_valid,y_pred,beta=1)
    f1_scores.append(f1_score)

    
df = pd.DataFrame()
df['thres'] = thresholds
df['f1_scores'] = f1_scores
px.line(df,x='thres',y='f1_scores')

  0%|          | 0/101 [00:00<?, ?it/s]

In [158]:
best_proba_threshold = thresholds[np.argmax(f1_scores)]
best_proba_threshold,np.max(f1_scores)

(0.34, 0.6694969467868567)

In [159]:
x_test_seq = tokenizer.texts_to_sequences(test_df["clean_text"])
x_test_pad = pad_sequences(x_test_seq,maxlen=max_len)
y_test_proba = model.predict(x_test_pad)
y_test_proba = y_test_proba.reshape(len(y_test_proba),1)
y_test_pred = [1 if val > best_proba_threshold else 0 for val in y_test_proba]



In [160]:
np.sum(y_test_pred)/len(y_test_pred)

0.0689557910198347

In [161]:
sub = test_df.copy()
sub['prediction']=np.array(y_test_pred)
sub.drop(["question_text","clean_text"],axis=1,inplace=True)
sub.head()

Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,1
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0


In [162]:
sub.to_csv("submission.csv",index=False)