# Refactoring

This is a refactoring and cleanup of the [previous kernel](https://www.kaggle.com/marcocarnini/a-minimalistic-approach).

## Read Input

In [None]:
import pandas as pd
import numpy as np

def get_train(howManyLines, path="../input/train.csv"):
    """Load the first rows of the training CSV and print a short summary.

    Args:
        howManyLines: Maximum number of rows to read (passed to pandas as
            ``nrows``). The file may contain fewer rows than requested.
        path: Location of the training CSV. Defaults to the competition
            input file so existing callers keep working.

    Returns:
        The loaded ``DataFrame``; it must contain a ``target`` column.
    """
    train = pd.read_csv(path, nrows=howManyLines)
    # Report what was actually loaded, not what was asked for: the two differ
    # whenever the file is shorter than howManyLines.
    print("Rows read: ", len(train))
    print("Insincere ratio: ", np.mean(train.target))
    return train
    
# Work on a 10,000-row sample of the training file; get_train forwards this
# number to pandas as the row limit.
train = get_train(10000)

## Get embedding

In [None]:
def get_coefs(word, *arr):
    """Return ``(word, vector)`` where the vector is ``arr`` as float32."""
    return word, np.asarray(arr, dtype='float32')


def get_embedding(embeddingFile):
    """Read a text embedding file into a ``{word: float32 vector}`` dict.

    Each non-blank line is expected to hold a token followed by its
    coefficients, separated by single spaces.

    Args:
        embeddingFile: Path to the embedding text file.

    Returns:
        Dictionary mapping each token to its coefficient array.
    """
    # The context manager closes the (multi-gigabyte) file deterministically;
    # the explicit encoding keeps parsing independent of the platform default.
    with open(embeddingFile, encoding="utf-8") as handle:
        embeddings_index = dict(
            # Split on a literal space only: the token itself may contain
            # other whitespace characters. rstrip() drops the line terminator
            # so it never reaches the float conversion.
            get_coefs(*line.rstrip().split(" "))
            for line in handle
            if line.strip()
        )
    print("Loaded keys: ", len(embeddings_index))
    return embeddings_index

embeddings_index = get_embedding('../input/embeddings/glove.840B.300d/glove.840B.300d.txt')
# Read the vector width from whichever entry comes first instead of probing a
# hard-coded token, which would raise KeyError on embeddings lacking that key.
embedding_size = len(next(iter(embeddings_index.values())))

## Get vocabulary

In [None]:
# Ordered (old, new) substitutions applied to every question before it is
# split on spaces. ORDER MATTERS: each rule sees the output of all earlier
# rules, so entries must not be sorted, de-duplicated or regrouped without
# re-checking the resulting tokens.
_TOKEN_REPLACEMENTS = (
    ("?", " ? "),
    ("…", " … "),
    ("/", " / "),
    (":", " : "),
    (",", " , "),
    ("(", " ( "),
    (")", " ) "),
    ("%", " % "),
    ("$", " $ "),
    ("'s", " 's "),
    ("’s", " ’s "),
    ("!!!", " !!! "),
    ("I’m", " I'm "),
    ('"The', " The "),
    # Fixed: this used to emit " con't ", which the later "con't" rule then
    # rewrote to "can't", changing the meaning of the sentence.
    ('don’t', " don't "),
    ("What's", " What is "),
    (" so, ", " so , "),
    (" you've ", " you have "),
    (" aren't ", " are not "),
    (" won't ", " will not "),
    (" me, ", " me , "),
    (" Isn't ", " Is not "),
    (" they're ", " they are "),
    ("haven't", ' have not '),
    ("can’t", " can't "),
    ("isn't", " is not "),
    ("con't", " can't "),
    ("India,", " India , "),
    ("win,", " win , "),
    ("life,", " life , "),
    ("time,", " time , "),
    ("better,", " better , "),
    ("math]", " math ] "),
    ("people,", " people, "),
    ('doesn’t', " doesn't "),
    ("shouldn't", " should not "),
    ("them,", " them , "),
    ('"I', ' " I '),
    ("shouldn't", " should not "),
    ("shouldn't", " should not "),
    ("Isn't", " Is not "),
    ("years,", " years , "),
    ("wasn't", " was not "),
    ("couldn't", " could not "),
    ("wouldn't", " would not "),
    ("Quora,", " Quora , "),
    ("I’ve", " I've "),
    ("world,", " world , "),
    ("now,", " now , "),
    ("US,", " US , "),
    ("country,", " country , "),
    ("person,", " person , "),
    ('you’ve', " you have "),
    ('didn’t', " didn't "),
    ('do,', " do , "),
    ('fight,', " fight , "),
    ('day,', " day , "),
    ('"the', ' " the '),
    ('school,', ' school , '),
    ('China,', ' China , '),
    ("Shouldn't", " Should not "),
    ("Trump,", " Trump , "),
    ("year,", " year , "),
    ("him,", " him , "),
    ("Also,", " Also , "),
    ("isn’t", " is not "),
    ("work,", " work , "),
    ("today,", " today , "),
    ("that,", " that , "),
    ("Wouldn't", " Would not "),
    ("Aren't", " Are not "),
    ("Doesn't", " Does not "),
    ("money,", " money , "),
    ("ago,", " ago , "),
    ("say,", " say , "),
    ("is,", " is , "),
    ("(if", " ( if "),
    ("up,", " up , "),
    ("we're", " we are "),
    ("Can't", " Can not "),
    ("weren't", " were not "),
    ('won’t', ' will not '),
    ("aren’t", " are not "),
    ("2017.", " 2017 . "),
    ('Quorans', ' Quoran '),
    ("hasn't", ' has not '),
    ("cryptocurrencies", ' cryptocurrency '),
    ("Brexit", ' British exit '),
    ("Redmi", "Xiaomi "),
    ("they’re", " they are "),
    ("you’re", " you're "),
    ("they've", " they have"),
    ('"', ' " '),
    ('C#', ' C++ '),
    ('Isn’t', ' Is not '),
    ('haven’t', ' have not '),
    ("'The", " ' The "),
    ("they've", " they have"),
    ('"', ' " '),
    ('Isn’t', ' Is not '),
    ('haven’t', ' have not '),
    ("'The", " ' The "),
    ("hadn't", " had not "),
    ("2018.", " 2018 . "),
    ("^2", " squared "),
    ("OnePlus", " Chinese Smartphone "),
    ('"what', ' " what'),
    ('UCEED', ' CEED '),
    ('engineering.', ' engineering .'),
    ('wouldn’t', ' would not '),
    ("We're", "We are"),
    ('it"', ' it " '),
    ("wasn’t", "was not"),
    ("GDPR", " General Data Protection Regulation "),
    ("we've", " we have "),
    ("Blockchain", " blockchain "),
    ('"Why', ' " Why '),
    ('Coinbase', ' cryptocurrency '),
    ('shouldn’t', ' should not '),
    ('Adityanath', ' yogi '),
    ('Don’t', " Don't "),
    ('Machedo', " Strategist Influencer "),
    ('BNBR', " Be Nice, Be Respectful "),
    ("'I", " ' I "),
    ("'the", " ' the "),
    ('“The', ' " The '),
    ('Boruto', ' Naruto '),
    ('"You', ' " You '),
    ("would've", " would have "),
    ("You're", " You are "),
    ("ethereum", " cryptocurrency "),
    ("ethereum", " cryptocurrency "),
    ("DCEU", " DC Extended Universe "),
    ('"in', ' " in '),
    ('“I', ' " I'),
    ('"to"', ' " to'),
    ('IIEST', "Indian Institute of Engineering Science and Technology"),
    ('couldn’t', ' could not'),
    ("they'll", " they will "),
    ("Wasn't", " Was not "),
    ('"My', ' " My'),
    ('Qoura', ' Quora '),
    ('up"', ' up " '),
    ('10+2', ' 10 + 2 '),
    ('"no', ' " no '),
    ("parents'", " parents 's "),
    ("SJWs", " SJW "),
    ('life"', ' life " '),
    ('\\frac"', ' fraction '),
    ('"not', ' not '),
    ("Qur'an", "Quran"),
    ('"When', ' " When '),
    ('people"', ' people " '),
    ('depression.', ' depression . '),
    ('..', ' ... '),
    ('x+1', ' x + 1 '),
    ('we’re', ' we are '),
    ('"good', ' " good '),
    ('Trump.', ' Trump . '),
    ('to"', ' to " '),
    ("he'll", " he will "),
    ('time"', ' time "'),
    ('"No', ' " No'),
    ('10+', ' 10 +'),
    ('LNMIIT', ' Institute Of Information Technology '),
    ("Won't", ' Will not '),
    ("Upwork", ' Freelancing Platform '),
    ('out"', ' out " '),
    ("We've", ' We have '),
    ('"We', ' " We '),
    ('anti-Trump', ' anti Trump '),
    ('Kavalireddi', ' Market Expert '),
    ('C#', ' dot net '),
    ('.net', ' dot net '),
    ('.NET', ' dot net '),
    ('"This', ' " This '),
    ('I’ll', "I'll"),
    ('Zerodha', " discount broker "),
    ('university.', " university . "),
    ("They're", " They are "),
    ('world"', ' world " '),
    ('bhakts', ' bhakt '),
    ("you'd", "you could"),
    ("Quora.", " Quora ."),
    ("Smartphone", " smartphone "),
    ('hasn’t', " has not "),
    ('demonetisation', " demonetization "),
    ('Nice,', " Nice , "),
    ('Strategist', " strategist "),
    ("Didn't", " Did not "),
    ("alt right", " Alt Right "),
    ("\\frac", " fraction "),
    ("Freelancing", " freelancing "),
    ("A+", " A + "),
    ("100+", " 100 + "),
    ("Wouldn’t", " Would not "),
    ("Unacademy", " Indian learning platform "),
    ("anxiety.", " anxiety . "),
    ("Vajiram", " Indian institute "),
    ("Doklam", " disputed territory "),
    # Fixed: the replacement used to misspell the city as "Dehli".
    ("Delhi.", " Delhi . "),
    ("NICMAR", " Indian National Institute of Construction Management and Research "),
    ("Couldn't", " Could not "),
    ("engineer.", " engineer . "),
    ("I’d", " I'd "),
    ("others'", " others 's "),
    ("countries'", " countries 's "),
    ('Shouldn’t', 'Should not'),
    ('chsl', 'Combined Higher Secondary Level'),
    ('AlShamsi', ' Investor on Quora '),
    ("they'd", "they would"),
    ("HackerRank", "Hacker Rank"),
    ("visa.", " visa . "),
    ("^3", " ^ 3 "),
    ("weren’t", " were not "),
    ("Doesn’t", " Does not "),
    ("MUOET", " Manipal University Online Entrance Test "),
    ("Help!", " Help ! "),
    ("Awdhesh", " Indian writer on Quora "),
    ("“the", ' " the '),
    ("A2A'd", ' asked to answer '),
    ("Litecoin", ' cryptocurrency '),
    ("suicide.", ' suicide . '),
    ("eLitmus", ' Indian recruiter company '),
    ("Jiren", ' Dragonball character '),
    ("it'll", ' it will '),
    ("Bhakts", ' bhakt '),
    ("he'd", ' he would '),
    ("abroad.", ' abroad . '),
    ("Cryptocurrency", ' cryptocurrency '),
    ("b.SC", ' BSc '),
    ("could've", ' could have '),
    ("#1", ' # 1 '),
    ("Ryzen", ' AMD processors'),
    ("[\\", ' [ \\ '),
    ("+y", ' + y '),
    ("=1", ' = 1 '),
    ("Jesus'", " Jesus ' "),
    ("[\\", ' [ \\ '),
    ("+y", ' + y '),
    ("=1", ' = 1 '),
    ("Jesus'", " Jesus ' "),
    ("5'4", " 5 inches 4 "),
    ("you'", " you ' "),
    ("altcoins", " cryptocurrency "),
    ("altcoin ", " cryptocurrency "),
    ("Mumbai.", " Mumbai . "),
    ("developer.", " developer . "),
    ("2+", " 2 + "),
    ("disorder.", " disorder . "),
    ('they’ve', " they have "),
    ('Baahubali', " Indian epic action film "),
    ('-x', " - x "),
    ('1+x', " 1 + x "),
    ("US'", " US ' "),
    ("r-aping", " raping "),
    ("SRMJEE", " Indian university level entrance test "),
    ("=0", " = 0 "),
    ("B+", " B + "),
    ("SGSITS", " Indian public engineering institution "),
    ("B+", " B + "),
    # Fixed: the exclamation mark used to be turned into a plus sign.
    ("Thanks!", " Thanks ! "),
    ("90+", " 90 + "),
    ("e^", " exp ^ "),
    ("MU-OET", " Indian Manipal University Online Entrance Test "),
    ("Pakistan.", " Pakistan . "),
    ("Beerus", ' Dragonball character '),
    ("Skripal", ' Russian military intelligence '),
    ("it!", ' it ! '),
    ("coinbase", ' cryptocurrency '),
    ("me !", ' me ! '),
    ("Hons.", ' honors degree '),
    ("hons.", ' honors degree '),
    ("don't.", " don't. "),
    ("000.", " 0 0 0 . "),
    ("20+", " 20 + "),
    ("{x", " { x "),
    ("5+", " 5 + "),
    ("1+", " 1 + "),
    ("{1}", " { 1 } "),
    ("you’d", " you would "),
    ("we'll", " we will "),
    ("ain't", " be not "),
    ("BMSCE", " Indian College of Engineering "),
    ("Aren’t", " Are not "),
    ("you!", " you ! "),
    ("a+b", " a + b "),
    ("Gurugram", " Gurgaon "),
    ("2016.", " 2016 . "),
    ("+c", " + c "),
    ("Amazon.in", " Amazon India "),
    ("\\sqrt", " \\ sqrt "),
    ("me!", " me ! "),
    ("don't.", " don't . "),
    ("3+", " 3 + "),
    ('bahubali', " Indian epic action film "),
    ("200+", " 200 + "),
    ("What\u200b", " What "),
    ("=x", " = x "),
    ("Zebpay", " cryptocurrency "),
    ("litecoin", " cryptocurrency "),
    ("5'10", " 5 inches 10 "),
    ("5'11", " 5 inches 11 "),
    ("5'5", " 5 inches 5 "),
    ("5'6", " 5 inches 6 "),
    ("5'7", " 5 inches 7 "),
    ("5'8", " 5 inches 8 "),
    ("5'9", " 5 inches 9 "),
    ("5'3", " 5 inches 3 "),
    ("5'2", " 5 inches 2 "),
    ("5'", " 5 inches "),
    ("graduate.", " graduate . "),
    ("Alshamsi", " investor in Quora "),
    ("Ravula", " trainer on youtube "),
    ("Binance", " cryptocurrency "),
    ("C#", " C++ "),
    ("Instagram.", " Instagram . "),
    ("Golang", " Google programming language "),
    ("BIPC", " university course "),
    ("x^", " x ^ "),
    ("^1", " ^ 1 "),
    ("^n", " ^ n "),
    ("You've", " You have "),
    ("OBC-NCL", " Indian cast religion income "),
    ("1+1", " 1 + 1 "),
    ("\\to", " \\ to "),
    ("graduation.", " graduation . "),
    ("}[", " } [ "),
    (".I", " . I "),
    ("\\lim_", " \\ limit "),
    ("Swachh", " Indian campaign "),
    ("MHCET", " Indian University Entrance Test "),
    ("Bangalore.", " Bangalore . "),
    ("who've", " who have "),
    ("josaa", " Indian University Entrance Test "),
    ("passport.", " passport . "),
    ("'a", " ' a "),
    ("\\displaystyle", " \\ display "),
    (".How", " . How "),
    ("Truecaller", " mobile app "),
    ("non-", " non - "),
    ("studying.", " studying . "),
    ("startup.", " startup . "),
    ("EVM'S", " Indian Electronic Voting Machines 's "),
    ("divorce.", " divorce . "),
    ("50+", " 50 + "),
    ("40+", " 40 + "),
    ("salary.", " salary . "),
    ("OBOR", " One Belt One Road "),
    ("300+", " 300 + "),
    ("m.SC", " MSc "),
    ("Google+", " Google + "),
    ("\\int", " \\ integral "),
    ("upwork", " Freelancing Platform "),
    ("\\right", " \\ right "),
    ("Chromecast", " Google digital media player "),
    ("500+", " 500 + "),
    ("TensorFlow", " Google machine learning platform "),
    ("n+1", " n + 1 "),
    ("#MeToo", " Movement against sexual harassment "),
    ("Indian.", " Indian . "),
    ("60+", " 60 + "),
    ("friends'", " friends 's "),
    ("iisc", " Indian public institute "),
    ("'to", " ' to "),
    ("What're", " What are "),
    ("0+", " 0 + "),
    ("Muslim.", " Muslim . "),
    (".what", " . what "),
    ("adhaar", " Indian identity number "),
    ("Adhaar", " Indian identity number "),
    ("y'all", " you all "),
    ("engg.", " engineering "),
    ("Islam.", " Islam . "),
    ("\\left", " \\ left "),
    ("+b", " + b "),
    ("Pune.", " Pune . "),
    ("clickbait", " attention grabbing "),
    ("USICT", " Indian University "),
    ("8+", " 8 + "),
    ("Explain.", " Explain . "),
    ("major.", " major . "),
    ("PESSAT", " Indian University "),
    ("demonitisation", " demonetization "),
    ("Muslims.", " Muslims . "),
    ("help!", " help ! "),
    ("\u200b", " "),
    ("Hyderabad.", " Hyderabad . "),
    ("30+", " 30 + "),
    ("Patreon", " membership platform "),
    ("'a'", " ' a ' "),
    ("maths.", " mathematics . "),
    ("x[", " x [ "),
    ("racist.", " racist . "),
    ("dx[", " dx ["),
    ("Dies™", " Dies trademark "),
    ("microservices", " micro services "),
    ("400+", " 400 + "),
    ("LGBT+", " LGBT "),
    ("unacademy", " Indian learning platform "),
    ("'not", " ' not "),
    ("teenager.", " teenager . "),
    ("pro-Trump", " pro Trump "),
    ("A-", " A - "),
    ("Whst", " What "),
    ("Demonetization", " demonetization "),
    ("y=", " y = "),
    # Fixed: brace and digit used to be emitted in swapped order (" 1 { ").
    ("{1", " { 1 "),
    ("{2}", " { 2 } "),
    ("{n", " { n "),
    ("c#", " c++ "),
    ("4+", " 4 + "),
    ("O+", " O + "),
    ("O+", " O + "),
    ("\\dfrac", " \\ fraction "),
    ("would’ve", " would have "),
    ("Koinex", " cryptocurrency "),
    ("Alt-Right", " Alt Right "),
    ("IISERs", " Indian Institute of Science Education and Research "),
    ("}}", " } } "),
    ("fortnite", " Fortnite "),
    ("Williams'", " Williams 's "),
    ("hyperloop", " Hyperloop "),
    ("ReactJS", " JavaScript Library "),
    ("worthless.", " worthless . "),
    ("Didn’t", " Did not "),
    ("Bittrex", " blockchain "),
    ("LGBTQ+", " LGBTQ + "),
    ("Java.", " Java . "),
    ("dating.", " dating . "),
    ("Byju", " learning app "),
    ("“What", ' " What '),
    ("MBA.", " MBA . "),
    ("x+2", " x + 2 "),
    ("LBSNAA", " Indian research and training institute "),
    ("\\infty}", " \\ infinity } "),
    ("=3", " = 3 "),
    ("2015.", " 2015 . "),
    ("H+", " Transhumanist party "),
    ("Zamasu", " Dragonball character "),
    ("UPSE", " UPSC "),
    (".What", " . What "),
    ("Irodov", " Difficult Physics book "),
    ("tensorflow", " Google machine learning platform "),
    ("FTRE", " FIITJEE "),
    ("Fiitjee", " FIITJEE "),
    ("SSC-CGL", " Indian Graduate Level Exam "),
    ("CSE.", " CSE ."),
    ("uncomfortable.", " uncomfortable . "),
    ("{2", " { 2 "),
    ("'white", " ' white "),
    ("JoSAA", "  Indian University Entrance Test  "),
    ("overweight.", " overweight . "),
    ("[0", " [ 0 "),
    ("Chennai.", " Chennai . "),
    ("₹", " Indian Rupee "),
    ("=2", " = 2 "),
    ("Kalpit", " best participant to JEE "),
    ("Haven't", " Have not "),
    ("crush.", " crush . "),
    ("suicidal.", " suicidal . "),
    ("grades.", " grades . "),
    ("IIITH", " Indian Research University "),
    ("Trump-Russia", " Trump - Russia "),
    ("Codeforces", " website for competive programming contests "),
    ("NLUs", " NLU "),
    ("£1000", " £ 1000 "),
    ("hadn’t", " had not "),
    ("\\sin", " \\ sin "),
)


def get_sentences_list(questions):
    """Normalise each question and split it into a list of tokens.

    Every question is passed through the ordered substitutions in
    ``_TOKEN_REPLACEMENTS`` (punctuation spacing, contraction expansion,
    rewriting of rare terms into more common words) and then split on single
    spaces; empty fragments are discarded.

    Args:
        questions: Iterable of question strings.

    Returns:
        A list with one token list per question, in input order.
    """
    sentences = []
    for question in questions:
        text = question
        # Sequential application is required: later rules operate on the
        # text produced by earlier ones.
        for old, new in _TOKEN_REPLACEMENTS:
            text = text.replace(old, new)
        sentences.append([token for token in text.split(" ") if token != ""])
    return sentences

def get_vocab(sentences):
    """Return the set of distinct tokens found across all token lists.

    Also prints how many distinct tokens were collected.
    """
    words = set()
    for tokens in sentences:
        words.update(tokens)
    print("Number of words:", len(words))
    return words

# Tokenise every question, then derive the longest sentence length (used
# later as the padded sequence length) and the vocabulary of distinct tokens.
sentences = get_sentences_list(train.question_text)
max_words = np.max(list(map(len, sentences)))
words = get_vocab(sentences)
vocab_size = len(words)

## Make the embedding

In [None]:
def get_embedding_matrix(words_training, embeddings=None):
    """Build a dense embedding matrix and a word-to-row mapping.

    Args:
        words_training: Sized iterable of vocabulary words; the position of a
            word in the iteration order becomes its row index.
        embeddings: Optional ``{word: vector}`` lookup. When omitted, the
            module-level ``embeddings_index`` is used, as before.

    Returns:
        ``(mapping, embedding_matrix)``. ``mapping`` holds only the words that
        have a vector; ``embedding_matrix`` has one row per input word, and
        rows of words without a vector stay all-zero.
    """
    if embeddings is None:
        embeddings = embeddings_index
    # Infer the vector width from the lookup rather than assuming 300; fall
    # back to the historical 300 only when the lookup is empty.
    dimension = len(next(iter(embeddings.values()))) if embeddings else 300
    embedding_matrix = np.zeros((len(words_training), dimension))
    mapping = {}
    # NOTE(review): row indices start at 0. If sequences built from `mapping`
    # are later zero-padded, padding is indistinguishable from the word stored
    # at row 0 -- confirm whether index 0 should be reserved.
    for index, word in enumerate(words_training):
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
            mapping[word] = index
    return (mapping, embedding_matrix)
           
mapping, embedding_matrix = get_embedding_matrix(words)
# Encode each sentence as embedding-row indices; tokens without an embedding
# are dropped from the sequence.
input_sequences = [
    [mapping[token] for token in sentence if token in mapping]
    for sentence in sentences
]
vocab_size = len({token for sentence in sentences for token in sentence})

## Assess coverage

In [None]:
# Report how much of the vocabulary has a pre-trained vector; the second
# figure is printed as a fraction of vocab_size.
print("Number of words with embedding:", len(mapping))
print("Percentage of embedded:", len(mapping) / vocab_size)

## Prepare train/validation split

In [None]:
from math import ceil

# Split the sample in two: the first `half` rows train the model, the rest
# validate it. The validation slice starts exactly at `half`; starting at
# `half + 1` would silently drop one sample from both sets.
labels = train.target
half = ceil(len(labels)/2)
print("Size training:", half)
print("Insincere train:", np.sum(labels[:half]))
print("Insincere validation:", np.sum(labels[half:]))

train_sentences = input_sequences[:half]
validation_sentences = input_sequences[half:]
train_labels = labels[:half]
validation_labels = labels[half:]

## Train the model

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import LSTM
from keras.preprocessing.sequence import pad_sequences

def create_model(vocab_size, embedding_size, max_words, embedding_matrix, input_sequences, labels):
    """Build, compile and fit the Embedding -> Conv1D -> LSTM classifier.

    Args:
        vocab_size: Number of rows of the embedding layer.
        embedding_size: Width of each embedding vector.
        max_words: Length every input sequence is padded to.
        embedding_matrix: Initial weights for the embedding layer, which is
            kept frozen during training.
        input_sequences: Sequences of word indices.
        labels: Binary targets aligned with ``input_sequences``.

    Returns:
        The model after 10 epochs of training.
    """
    frozen_embedding = Embedding(
        vocab_size,
        embedding_size,
        input_length=max_words,
        weights=[embedding_matrix],
        trainable=False,
    )
    model = Sequential([
        frozen_embedding,
        Dropout(0.2),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(300),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Pad at the end of each sequence so every row has max_words entries.
    padded_docs = pad_sequences(input_sequences, maxlen=max_words, padding='post')
    model.fit(padded_docs, labels, epochs=10)
    return model

# Train on the first half of the sample only; the second half is kept for
# threshold selection and validation below.
model = create_model(vocab_size, embedding_size, max_words, embedding_matrix, 
                     train_sentences, train_labels)

In [None]:
from sklearn.metrics import f1_score

# Pad both splits with the same maxlen/padding settings that create_model
# uses for training, then score them with the fitted model.
trainset = pad_sequences(train_sentences, maxlen=max_words, padding='post')
validationset = pad_sequences(validation_sentences, maxlen=max_words, padding='post')
predictions_train = model.predict(trainset)
predictions_validation = model.predict(validationset)

In [None]:
# Sweep 1000 candidate decision thresholds in [0, 0.6] and record the F1
# score obtained on each split when predictions above the cutoff count as 1.
threshold = np.linspace(0, 0.6, num=1000)
score_train = [
    f1_score(train_labels, (predictions_train > cutoff).astype(int))
    for cutoff in threshold
]
score_validation = [
    f1_score(validation_labels, (predictions_validation > cutoff).astype(int))
    for cutoff in threshold
]

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# NOTE(review): newer matplotlib releases renamed the bundled seaborn style
# sheets (e.g. 'seaborn-v0_8-whitegrid') -- confirm this name is available in
# the installed version.
plt.style.use('seaborn-whitegrid')

# Draw F1 against threshold for both splits: the first curve is the training
# score, the second the validation score.
plt.plot(threshold, score_train, threshold, score_validation)

In [None]:
# Report the highest validation F1 and the threshold at which it occurs
# (the first such threshold when several tie).
print("F1 score best:", np.max(score_validation))
print("Threshold:", threshold[np.argmax(score_validation)])