## 1001 Final Project: Quora Insincere Questions Classification
#### Data: https://www.kaggle.com/c/quora-insincere-questions-classification
#### Sheetal Laad, Elliot Silva, Esteban Navarro, Adrian Pearl

1. Run "Simple Model" grid search on training/validation set
2. Use highest performing "Simple Model" for test set
3. Neural Network and Embedding modeling and evaluation

##### Import necessary packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.feature_extraction import stop_words
from sklearn.metrics import f1_score
from nltk.stem.snowball import SnowballStemmer
from sklearn.utils import resample
from sklearn.metrics import precision_recall_curve
import string, re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
import seaborn as sn
%matplotlib inline

### 1. Run "Simple Model" grid search on training/validation set

In [None]:
# Load the training CSV; the first column of the file becomes the row index.
data = pd.read_csv("../input/train.csv", index_col = 0)

In [None]:
def data_prep(dataset, training_split, test_split):
    """Randomly split question text and labels into train and test parts.

    Parameters
    ----------
    dataset : DataFrame with 'question_text' and 'target' columns.
    training_split : forwarded to ``train_test_split`` as ``train_size``.
    test_split : forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    questions, targets = dataset['question_text'], dataset['target']
    pieces = train_test_split(
        questions, targets, train_size=training_split, test_size=test_split
    )
    return tuple(pieces)

def data_prep_training_nodownsample(dataset, training_split, test_split):
    """Split whole rows first, then pull text and label columns from each part.

    No downsampling is applied to the training part.

    Parameters
    ----------
    dataset : DataFrame with 'question_text' and 'target' columns.
    training_split : forwarded to ``train_test_split`` as ``train_size``.
    test_split : forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    train_rows, test_rows = train_test_split(
        dataset, train_size=training_split, test_size=test_split
    )
    text_col, label_col = 'question_text', 'target'
    return (
        train_rows[text_col],
        test_rows[text_col],
        train_rows[label_col],
        test_rows[label_col],
    )


def model_vectorize(data_used, vectorizer_type, binary_type, ngram, stop_word, model_type):
    """Vectorize the text, fit a model and return its best F-score and threshold.

    Parameters
    ----------
    data_used : sequence ``(X_train, X_test, Y_train, Y_test)``.
    vectorizer_type : vectorizer class, called with ``binary``, ``stop_words``
        and ``ngram_range`` keyword arguments.
    binary_type, ngram, stop_word : values forwarded to the vectorizer.
    model_type : already-constructed estimator exposing ``fit`` and
        ``predict_proba``; it is fitted in place.

    Returns
    -------
    tuple ``(max_fscore, threshold_max)`` taken from the precision/recall
    curve of the positive-class scores on the test part.
    """
    X_train, X_test = data_used[0], data_used[1]
    Y_train, Y_test = data_used[2], data_used[3]

    vectorizer = vectorizer_type(binary=binary_type, stop_words=stop_word, ngram_range=ngram)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    model = model_type
    model.fit(X_train_vectorized, Y_train)

    scores = model.predict_proba(X_test_vectorized)[:, 1]
    precision, recall, thresholds = precision_recall_curve(Y_test, scores)
    # The curve has one more precision/recall point than thresholds; drop it.
    precision, recall = precision[:-1], recall[:-1]

    # precision + recall can be 0, which yields NaN; silence the warning and
    # handle the NaNs explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        fscores = 2 * np.divide(np.multiply(precision, recall), np.add(precision, recall))

    # A plain argmax returns the position of the first NaN, so the reported
    # threshold would not belong to the reported maximum. Rank NaNs last.
    ind_max = int(np.argmax(np.where(np.isnan(fscores), -np.inf, fscores)))
    max_fscore = fscores[ind_max]
    threshold_max = thresholds[ind_max]

    return (max_fscore, threshold_max)


def downsample(df):
    """Balance the classes by shrinking the majority (target == 0) class.

    Rows with ``target == 0`` are sampled without replacement, with a fixed
    seed, down to the number of ``target == 1`` rows; the result is the
    sampled majority rows followed by all minority rows.
    """
    minority_rows = df[df.target == 1]
    majority_rows = df[df.target == 0]

    majority_sample = resample(
        majority_rows,
        replace=False,                 # sample without replacement
        n_samples=len(minority_rows),  # to match minority class
        random_state=123,              # reproducible results
    )

    return pd.concat([majority_sample, minority_rows])

In [None]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Substring -> replacement table applied by replace_typical_misspell.
# (The duplicated 'wouldnt' entry has been removed; the mapping is unchanged.)
mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'wouldnt':'would not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'neighbour': 'neighbor',
                'humour': 'humor',
                'apologise': 'apologize',
                'travelling':'traveling',
                'counselling':'counseling',
                'recognise': 'recognize',
                'theatre':'theater',
                'cancelled':'canceled',
                'travelled': 'traveled',
                'offence': 'offense',
                'licence': 'license',
                'labour':'labor',
                'behaviour': 'behavior',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social_media',
                'whatsapp': 'social_media',
                'snapchat': 'social_media',
                'facebook': 'social_media',
                }
# Module-level lookup table and compiled pattern read by replace_typical_misspell.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Return *text* with each ``mispellings_re`` match replaced by its
    ``mispellings`` entry."""
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)


def clean(text):
    """Normalize one question string for the bag-of-words models.

    Steps, in order: lower-case, expand common contractions, strip ASCII
    punctuation, mask digit runs with '#', apply ``replace_typical_misspell``
    and lemmatize every space-separated token with ``WordNetLemmatizer``.

    Contractions are expanded before punctuation is stripped: the apostrophe
    is part of ``string.punctuation``, so stripping first would remove the
    character every contraction pattern relies on and none would ever match.
    """
    # Convert to lower case so "What's" and "what's" are handled alike
    text = text.lower()

    # Expand contractions while the apostrophes are still present
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"'m", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    # Remove punctuation
    text = ''.join(ch for ch in text if ch not in string.punctuation)

    # Mask digit runs by length; this runs after punctuation removal so the
    # '#' placeholders are kept
    text = re.sub('[0-9]{5,}', '#####', text)
    text = re.sub('[0-9]{4}', '####', text)
    text = re.sub('[0-9]{3}', '###', text)
    text = re.sub('[0-9]{2}', '##', text)
    text = re.sub('[0-9]{1}', '#', text)

    # Replace typical misspellings
    text = replace_typical_misspell(text)

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split(' ')])

    return text

In [None]:
import itertools

data['question_text'] = data["question_text"].apply(clean)
noDS_noStem_lem = data_prep_training_nodownsample(data, .9, .1)

#create models
dataset = [noDS_noStem_lem]
vectorizer_type = [CountVectorizer, TfidfVectorizer]
binary_type = [True, False]
ngram = [(1,2), (1,3), (1,4)]
stop_word = [None]
model_type = [LogisticRegression(), BernoulliNB()]

# Walk the grid in a fixed axis order (dataset, vectorizer, binary, ngram,
# model, stop words); the labels below use the same order so that result i
# always belongs to label i.
model_initialize = [
    model_vectorize(h, i, j, k, m, l)
    for h, i, j, k, l, m in itertools.product(
        dataset, vectorizer_type, binary_type, ngram, model_type, stop_word)
]

#create labels
dataset_label = ['No Downsample No Stemmed Lemmitized']
vectorizer_type_label = ['CountV', 'TFIDV']
binary_type_label = ['T', 'F']
ngram_label = [(1,2), (1,3), (1,4)]
stop_word_label = ['None']
model_type_label = ['LR', 'NB']

labels = [
    '%s %s %s %s %s %s' %(h, i, j, k, m, l)
    for h, i, j, k, l, m in itertools.product(
        dataset_label, vectorizer_type_label, binary_type_label,
        ngram_label, model_type_label, stop_word_label)
]

fscores = [result[0] for result in model_initialize]
thresholds = [result[1] for result in model_initialize]

# Named grid_results rather than `metrics`: the latter would rebind the
# sklearn `metrics` module imported at the top of the notebook.
grid_results = pd.DataFrame({'label': labels,'fscore': fscores, 'threshold': thresholds})
grid_results.sort_values(by = ['fscore'], ascending=False)

### 2. Use highest performing "Simple Model" for test set

##### Functions for Feature Engineering and Modeling

In [None]:
def data_prep_training_nodownsample(dataset, training_split, test_split):
    """Seeded row split, returning text and label columns of each part.

    The split uses ``random_state = 0``; no downsampling is applied.

    Parameters
    ----------
    dataset : DataFrame with 'question_text' and 'target' columns.
    training_split : forwarded to ``train_test_split`` as ``train_size``.
    test_split : forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    train_rows, test_rows = train_test_split(
        dataset,
        train_size=training_split,
        test_size=test_split,
        random_state=0,
    )
    text_col, label_col = 'question_text', 'target'
    return (
        train_rows[text_col],
        test_rows[text_col],
        train_rows[label_col],
        test_rows[label_col],
    )


def model_vectorize(data_used, vectorizer_type, binary_type, ngram, stop_word, c_val, model_type):
    """Vectorize the text, fit a model and return its best F-score and threshold.

    Parameters
    ----------
    data_used : sequence ``(X_train, X_test, Y_train, Y_test)``.
    vectorizer_type : vectorizer class, called with ``binary``, ``stop_words``
        and ``ngram_range`` keyword arguments.
    binary_type, ngram, stop_word : values forwarded to the vectorizer.
    c_val : exponent; ``LogisticRegression`` is built with ``C = 10**c_val``.
        Ignored for every other model class.
    model_type : estimator class (not an instance).

    Returns
    -------
    tuple ``(max_fscore, threshold_max)`` taken from the precision/recall
    curve of the positive-class scores on the test part.
    """
    X_train, X_test = data_used[0], data_used[1]
    Y_train, Y_test = data_used[2], data_used[3]

    vectorizer = vectorizer_type(binary=binary_type, stop_words=stop_word, ngram_range=ngram)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    if model_type is LogisticRegression:
        model = model_type(C=10**c_val)
    else:
        model = model_type()

    model.fit(X_train_vectorized, Y_train)
    scores = model.predict_proba(X_test_vectorized)[:, 1]
    precision, recall, thresholds = precision_recall_curve(Y_test, scores)
    # The curve has one more precision/recall point than thresholds; drop it.
    precision, recall = precision[:-1], recall[:-1]

    # precision + recall can be 0, which yields NaN; silence the warning and
    # handle the NaNs explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        fscores = 2 * np.divide(np.multiply(precision, recall), np.add(precision, recall))

    # A plain argmax returns the position of the first NaN, so the reported
    # threshold would not belong to the reported maximum. Rank NaNs last.
    ind_max = int(np.argmax(np.where(np.isnan(fscores), -np.inf, fscores)))
    max_fscore = fscores[ind_max]
    threshold_max = thresholds[ind_max]

    return (max_fscore, threshold_max)

##### Functions for Data Cleaning and Preparation

In [None]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Substring -> replacement table applied by replace_typical_misspell.
# (The duplicated 'wouldnt' entry has been removed; the mapping is unchanged.)
mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'wouldnt':'would not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'neighbour': 'neighbor',
                'humour': 'humor',
                'apologise': 'apologize',
                'travelling':'traveling',
                'counselling':'counseling',
                'recognise': 'recognize',
                'theatre':'theater',
                'cancelled':'canceled',
                'travelled': 'traveled',
                'offence': 'offense',
                'licence': 'license',
                'labour':'labor',
                'behaviour': 'behavior',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social_media',
                'whatsapp': 'social_media',
                'snapchat': 'social_media',
                'facebook': 'social_media',
                }
# Module-level lookup table and compiled pattern read by replace_typical_misspell.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Substitute every ``mispellings_re`` match in *text* with the value
    stored for it in ``mispellings``."""
    def _lookup(hit):
        matched = hit.group(0)
        return mispellings[matched]

    corrected = mispellings_re.sub(_lookup, text)
    return corrected


def clean(text):
    """Normalize one question string for the bag-of-words models.

    Steps, in order: lower-case, expand common contractions, strip ASCII
    punctuation, mask digit runs with '#', apply ``replace_typical_misspell``
    and lemmatize every space-separated token with ``WordNetLemmatizer``.

    Contractions are expanded before punctuation is stripped: the apostrophe
    is part of ``string.punctuation``, so stripping first would remove the
    character every contraction pattern relies on and none would ever match.
    """
    # Convert to lower case so "What's" and "what's" are handled alike
    text = text.lower()

    # Expand contractions while the apostrophes are still present
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"'m", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    # Remove punctuation
    text = ''.join(ch for ch in text if ch not in string.punctuation)

    # Mask digit runs by length; this runs after punctuation removal so the
    # '#' placeholders are kept
    text = re.sub('[0-9]{5,}', '#####', text)
    text = re.sub('[0-9]{4}', '####', text)
    text = re.sub('[0-9]{3}', '###', text)
    text = re.sub('[0-9]{2}', '##', text)
    text = re.sub('[0-9]{1}', '#', text)

    # Replace typical misspellings
    text = replace_typical_misspell(text)

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split(' ')])

    return text

In [None]:
# Load the training set (first CSV column becomes the index) ...
train_data = pd.read_csv('../input/train.csv', index_col=0)

# ... and run every question through clean()
cleaned_questions = train_data["question_text"].apply(clean)
train_data['question_text'] = cleaned_questions

In [None]:
#Feature Engineering
X, y = train_data.question_text, train_data.target
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state = 0)
cv = CountVectorizer(ngram_range=(1,4), binary=True)
cv.fit(X_train)
X_train, X_val = cv.transform(X_train), cv.transform(X_val)

#Modeling
lr = LogisticRegression().fit(X_train, y_train)
scores = lr.predict_proba(X_val)

# Use the directly imported precision_recall_curve: the name `metrics` may
# have been rebound to a DataFrame by the grid-search cell.
# NOTE: `re` below rebinds the name of the imported `re` module; it is kept
# because the plotting cell that follows reads `pr`, `re`, `th` and `fs`.
pr, re, th = precision_recall_curve(y_val, scores[:,1])
pr, re, th = pr[:-2], re[:-2], th[:-1]
with np.errstate(divide='ignore', invalid='ignore'):
    fs = 2*np.divide(np.multiply(pr, re), np.add(pr, re))

# Positive-class scores of the model fitted above, on the validation part
# (the labels built from them are compared against y_val below).
predictions = scores[:,1]
# Threshold at which the F-score peaks (not the F-score value itself);
# nanargmax skips entries where precision + recall was 0.
max_threshold = th[np.nanargmax(fs)]

label = [0 if p <= max_threshold else 1 for p in predictions]

tn, fp, fn, tp = confusion_matrix(y_val, label).ravel()

print('Max Threshold: %f' %max_threshold)
print('TN: %d, FP: %d, FN: %d, TP: %d' %(tn, fp, fn, tp))

In [None]:
sn.set()
sn.palplot(sn.color_palette("RdBu", n_colors=7))

# A single figure for the curves; an extra plt.figure() call here would only
# leave an empty figure behind.
plt.figure(figsize=(12,8))
# nanargmax skips F-scores that are NaN (precision + recall == 0).
best = np.nanargmax(fs)
plt.scatter(th[best], fs[best], color = 'black', label= 'max fscore')
plt.plot(th, pr, label = 'precision')
plt.plot(th, re, label = 'recall')
plt.plot(th, fs, label = 'f score')
plt.title('Best Simple Model')
plt.xlabel('Threshold')
plt.legend()

### 3. Neural Network and Embedding modeling and evaluation

In [None]:
# Passed to the Keras Tokenizer as num_words.
MAX_NB_WORDS = 120000
# Directory that holds the glove.840B.300d.txt embedding file.
GLOVE_DIR = "../input/embeddings/glove.840B.300d"
# Width of embedding_matrix and output size of the Embedding layer.
EMBEDDING_DIM = 300
# Sequence length used for pad_sequences(maxlen=...) and Embedding(input_length=...).
MAX_LEN = 75

#### Read Data

In [None]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
# Stack the train questions (label column removed) on top of the test questions.
train_without_target = train.drop(columns='target')
df = pd.concat([train_without_target, test])
# Comment after debugging, before committing:
#df = df.sample(150000)

#### Feature Selection

In [None]:
# word -> float32 vector, one entry per line of the GloVe text file.
embeddings_index = {}
# `with` closes the file even if a line fails to parse; the encoding is
# spelled out so the read does not depend on the platform default.
with open(os.path.join(GLOVE_DIR, 'glove.840B.300d.txt'), encoding='utf8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..." separated by single spaces.
        values = line.split(" ")
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
def build_vocab(texts):
    """Count whitespace-separated tokens over a Series of strings.

    Returns a plain dict ``{token: occurrences}`` in first-seen order.
    """
    tokenized = texts.apply(lambda question: question.split()).values
    counts = {}
    for tokens in tokenized:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [None]:
def check_coverage(vocab, embeddings_index):
    """Report how much of *vocab* has an embedding and list what does not.

    Parameters
    ----------
    vocab : dict ``{word: count}``.
    embeddings_index : mapping ``{word: vector}``.

    Prints the share of distinct words and the share of all word occurrences
    that are found in ``embeddings_index`` (0% for an empty vocab).

    Returns
    -------
    list of ``(word, count)`` pairs for the words without an embedding,
    ordered by descending count.
    """
    # Local import: `operator` is not imported at the top of the notebook.
    from operator import itemgetter

    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word, count in vocab.items():
        try:
            vector = embeddings_index[word]
        except KeyError:
            unknown_words[word] = count
            nb_unknown_words += count
        else:
            known_words[word] = vector
            nb_known_words += count

    # Guard the ratios so an empty vocab reports 0% instead of dividing by 0.
    nb_all_words = nb_known_words + nb_unknown_words
    vocab_share = len(known_words) / len(vocab) if vocab else 0.0
    text_share = nb_known_words / nb_all_words if nb_all_words else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    unknown_words = sorted(unknown_words.items(), key=itemgetter(1))[::-1]

    return unknown_words

In [None]:
# Token counts over the combined train + test questions, before any cleaning.
vocab = build_vocab(df["question_text"])
print("Glove : ")
# check_coverage returns the uncovered words, most frequent first.
oov_glove = check_coverage(vocab, embeddings_index)
# Show the ten most frequent words that have no GloVe vector.
oov_glove[:10]

In [None]:
# Contraction -> expanded form. clean_contractions looks up whole
# space-separated tokens in this table, so keys are matched case-sensitively.
contraction_mapping = {"ain't": "is not", 
                       "aren't": "are not",
                       "can't": "cannot", 
                       "'cause": "because", 
                       "could've": "could have",
                       "couldn't": "could not", 
                       "didn't": "did not",  
                       "doesn't": "does not", 
                       "don't": "do not", 
                       "hadn't": "had not", 
                       "hasn't": "has not", 
                       "haven't": "have not", 
                       "he'd": "he would",
                       "he'll": "he will", 
                       "he's": "he is", 
                       "how'd": "how did", 
                       "how'll": "how will", 
                       "how's": "how is",  
                       "I'd": "I would", 
                       "I'll": "I will", 
                       "I'm": "I am", 
                       "I've": "I have", 
                       "i'd": "i would", 
                       "i'd've": "i would have", 
                       "i'll": "i will",  
                       "i'm": "i am", 
                       "i've": "i have", 
                       "isn't": "is not", 
                       "it'd": "it would", 
                       "it'll": "it will", 
                       "it's": "it is", 
                       "let's": "let us", 
                       "might've": "might have",
                       "must've": "must have", 
                       "needn't": "need not", 
                       "o'clock": "of the clock", 
                       "shan't": "shall not", 
                       "she'd": "she would",
                       "she'll": "she will", 
                       "she's": "she is", 
                       "should've": "should have", 
                       "shouldn't": "should not", 
                       "that'd": "that would", 
                       "there's": "there is", 
                       "here's": "here is",
                       "they'd": "they would",
                       "they'll": "they will", 
                       "they're": "they are", 
                       "they've": "they have", 
                       "wasn't": "was not", 
                       "we'd": "we would", 
                       "we'll": "we will", 
                       "we're": "we are", 
                       "we've": "we have", 
                       "weren't": "were not", 
                       "what'll": "what will", 
                       "what're": "what are",  
                       "what's": "what is", 
                       "what've": "what have", 
                       "when's": "when is", 
                       "when've": "when have", 
                       "where'd": "where did", 
                       "where's": "where is", 
                       "where've": "where have", 
                       "who'll": "who will", 
                       "who's": "who is", 
                       "who've": "who have", 
                       "why's": "why is", 
                       "won't": "will not", 
                       "would've": "would have", 
                       "wouldn't": "would not", 
                       "y'all": "you all", 
                       "you'd": "you would", 
                       "you'd've": "you would have", 
                       "you'll": "you will", 
                       "you'll've": "you will have", 
                       "you're": "you are", 
                       "you've": "you have" }
def known_contractions(embed):
    """List the ``contraction_mapping`` keys that are present in *embed*,
    in mapping order."""
    return [contract for contract in contraction_mapping if contract in embed]
# Show which contractions already have their own GloVe vector.
print("- Known Contractions -")
print("   Glove :")
print(known_contractions(embeddings_index))

def clean_contractions(text, mapping):
    """Expand contractions in *text* using *mapping*.

    Curly quotes, acute accents and backticks are first normalized to a
    plain apostrophe; then every single-space-separated token that is a key
    of *mapping* is replaced by its value. Other tokens are kept as they are.
    """
    for quote_variant in ("’", "‘", "´", "`"):
        text = text.replace(quote_variant, "'")
    tokens = text.split(" ")
    expanded = [mapping.get(token, token) for token in tokens]
    return ' '.join(expanded)

In [None]:
# First cleaning pass: expand contractions into a new 'treated_question' column.
df['treated_question'] = df['question_text'].apply(lambda x: clean_contractions(x, contraction_mapping))

In [None]:
# Re-measure GloVe coverage on the contraction-expanded text.
vocab = build_vocab(df['treated_question'])
print("Glove : ")
oov_glove = check_coverage(vocab, embeddings_index)

In [None]:
# Characters that clean_special_chars pads with spaces. The backslash before
# '×' is written as '\\' so the literal holds the same characters without
# relying on an invalid escape sequence.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

In [None]:
def unknown_punct(embed, punct):
    """Return the characters of *punct* that are not in *embed*, each
    followed by one space, as a single string."""
    missing = [mark for mark in punct if mark not in embed]
    return ''.join(mark + ' ' for mark in missing)

# List the punctuation characters that have no GloVe vector.
print("Glove :")
print(unknown_punct(embeddings_index, punct))

# Special character -> replacement text, applied by clean_special_chars
# before punctuation is padded with spaces.
# (The duplicated '“' entry has been removed; the mapping is unchanged.)
punct_mapping = {"‘": "'",
                 "₹": "e",
                 "´": "'",
                 "°": "",
                 "€": "e",
                 "™": "tm",
                 "√": " sqrt ",
                 "×": "x",
                 "²": "2",
                 "—": "-",
                 "–": "-",
                 "’": "'",
                 "_": "-",
                 "`": "'",
                 '“': '"',
                 '”': '"',
                 "£": "e",
                 '∞': 'infinity',
                 'θ': 'theta',
                 '÷': '/',
                 'α': 'alpha',
                 '•': '.',
                 'à': 'a',
                 '−': '-',
                 'β': 'beta',
                 '∅': '',
                 '³': '3',
                 'π': 'pi'}
def clean_special_chars(text, punct, mapping):
    """Normalize special characters and isolate punctuation in *text*.

    Three passes, in this order:
    1. every key of *mapping* is replaced by its value;
    2. every character of *punct* is surrounded by single spaces;
    3. a fixed set of leftover characters is replaced or removed.
    """
    for symbol, replacement in mapping.items():
        text = text.replace(symbol, replacement)

    for mark in punct:
        text = text.replace(mark, ' ' + mark + ' ')

    # Remaining special characters, handled last
    leftovers = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for symbol, replacement in leftovers.items():
        text = text.replace(symbol, replacement)

    return text

In [None]:
# Second cleaning pass: map special characters and space out punctuation,
# then re-measure GloVe coverage.
df['treated_question'] = df['treated_question'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))
vocab = build_vocab(df['treated_question'])
print("Glove : ")
oov_glove = check_coverage(vocab, embeddings_index)
# Show the 100 most frequent words that still have no GloVe vector.
oov_glove[:100]

In [None]:
# Substring -> replacement table passed to correct_spelling.
# (The duplicated 'Qoura' entry has been removed; the mapping is unchanged.)
mispell_dict = {'Quorans': 'Quora',
                'cryptocurrencies': 'cryptocurrency',
                'Redmi' :'cellphone',
                'OnePlus' :'cellphone',
                'Blockchain':'blockchain',
                'Pokémon':'Pokemon',
                'ethereum':'Ethereum',
                'Qoura':'Quora',
                'fiancé':'fiance',
                'Litecoin': 'cryptocurrency',
                'altcoin': 'cryptocurrency',
                'Cryptocurrency': 'cryptocurrency',
                'altcoins' : 'cryptocurrency',
                'litecoin' : 'cryptocurrency',
                'colour': 'color',
                'centre': 'center',
                'favourite': 'favorite',
                'travelling': 'traveling',
                'counselling': 'counseling',
                'theatre': 'theater',
                'cancelled': 'canceled',
                'labour': 'labor',
                'organisation': 'organization',
                'wwii': 'world war 2',
                'citicise': 'criticize',
                'youtu ': 'youtube ',
                'sallary': 'salary',
                'Whta': 'What',
                'narcisist': 'narcissist',
                'howdo': 'how do',
                'whatare': 'what are',
                'howcan': 'how can',
                'howmuch': 'how much',
                'howmany': 'how many',
                'whydo': 'why do',
                'doI': 'do I',
                'theBest': 'the best',
                'howdoes': 'how does',
                'mastrubation': 'masturbation',
                'mastrubate': 'masturbate',
                "mastrubating": 'masturbating',
                'pennis': 'penis',
                'Etherium': 'Ethereum',
                'narcissit': 'narcissist',
                'bigdata': 'big data',
                '2k17': '2017',
                '2k18': '2018',
                'qouta': 'quota',
                'exboyfriend': 'ex boyfriend',
                'airhostess': 'air hostess',
                "whst": 'what',
                'watsapp': 'whatsapp',
                'demonitisation': 'demonetization',
                'demonitization': 'demonetization',
                'demonetisation': 'demonetization'}

def correct_spelling(x, dic):
    """Replace every occurrence of each key of *dic* in *x* with its value.

    Keys are applied longest first (ties keep dictionary order), so a key
    that is a prefix of another one cannot clobber it: with both 'altcoin'
    and 'altcoins' mapped to 'cryptocurrency', "altcoins" becomes
    "cryptocurrency" rather than "cryptocurrencys".
    """
    for word in sorted(dic, key=len, reverse=True):
        x = x.replace(word, dic[word])
    return x

# Third cleaning pass: apply the spelling/alias table, then re-measure coverage.
df['treated_question'] = df['treated_question'].apply(lambda x: correct_spelling(x, mispell_dict))

vocab = build_vocab(df['treated_question'])
print("Glove : ")
oov_glove = check_coverage(vocab, embeddings_index)

#### Modeling

In [None]:
# Labels come straight from the training CSV; the first len(y) rows of df
# are the training questions because df was built as train followed by test.
y = pd.read_csv('../input/train.csv')['target']
n_train_rows = len(y)
X = df.iloc[:n_train_rows]['treated_question']

%%time
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# tokenizer: words --> word indices

# `labels` feeds the train/validation split further down; `texts` is not
# referenced again in the visible part of this notebook.
texts, labels = X, y

# Fit the tokenizer on the cleaned training questions, then turn each
# question into a list of word indices.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(list(X))
sequences = tokenizer.texts_to_sequences(X)

# Bring every sequence to the same length, MAX_LEN.
sequences = pad_sequences(sequences, maxlen=MAX_LEN)

# word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
from keras import backend as K
# Private Keras/TensorFlow helper; presumably reports the GPUs Keras can see.
K.tensorflow_backend._get_available_gpus()

# Example sequence
sequences[0]

from sklearn.model_selection import train_test_split
# Seeded like the splits in section 2 so the validation set, and therefore
# the reported F-scores and threshold, are reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(sequences, labels, test_size=0.2, random_state=0)
# One-hot encode the 0/1 targets for the two-unit softmax output layer.
y_train_bin = to_categorical(y_train)
y_val_bin = to_categorical(y_val)

print('Vector for \'apple\' =')
print(str(embeddings_index['apple'][0:10]) + '... plus 290 more features')

In [None]:
%%time
# embedding_matrix: row i holds the GloVe vector of the word with index i
matrix_shape = (len(word_index) + 1, EMBEDDING_DIM)
embedding_matrix = np.zeros(matrix_shape)
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # no GloVe vector for this word: its row stays all-zeros
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Sanity check: look up the tokenizer index of 'apple' and print the first
# ten values of the matching embedding_matrix row.
ai = word_index['apple']
print('\'apple\' is word number: ', ai)
print('The vector for word number ', ai, ' is:')
print(str(embedding_matrix[ai][0:10]) + '... plus 290 more features')

In [None]:
from keras.layers import Embedding

# Embedding layer initialised with the GloVe-based embedding_matrix.
# trainable=False keeps those weights fixed during model.fit.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_LEN,
                            trainable=False)

In [None]:
from keras.models import Sequential
from keras.layers import Input, Bidirectional, CuDNNGRU, GlobalMaxPool1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.models import Model
from keras import regularizers

# Frozen embeddings -> bidirectional GRU -> max over time -> two small
# dense layers with dropout -> two-way softmax.
model = Sequential([
    embedding_layer,
    Bidirectional(CuDNNGRU(64, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(32, activation="sigmoid"),
    Dropout(0.3),
    Dense(32, activation="sigmoid"),
    Dropout(0.3),
    Dense(2, activation="softmax"),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
from keras import callbacks
from sklearn import metrics
class cbmetrics(callbacks.Callback):
    """Keras callback that records the best validation F-score after each epoch.

    After every epoch the model scores the module-level ``X_val``; the
    precision/recall curve against ``y_val`` is scanned for the highest
    F-score, and that F-score together with its precision and recall is
    appended to ``val_f1s``, ``val_precisions`` and ``val_recalls``.
    """

    def on_train_begin(self, logs=None):
        # logs defaults to None rather than a shared mutable dict.
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        # Column 1 of the softmax output is used as the positive-class score.
        scores = self.model.predict(X_val, batch_size=1024, verbose=1)[:,1]
        pr, rec, th = metrics.precision_recall_curve(y_val, scores)
        pr, rec = pr[:-2], rec[:-2]
        # precision + recall can be 0, which yields NaN; handled below.
        with np.errstate(divide='ignore', invalid='ignore'):
            fs = 2*np.divide(np.multiply(pr, rec), np.add(pr, rec))
        # Rank NaNs last so one undefined point cannot hide the real maximum
        # (a plain argmax/max would return the NaN).
        i = int(np.argmax(np.where(np.isnan(fs), -np.inf, fs)))
        self.val_f1s.append(fs[i])
        self.val_recalls.append(rec[i])
        self.val_precisions.append(pr[i])
        print("f score: ", fs[i])
        
# Train for three epochs; the cbmetrics callback prints the best validation
# F-score after each one.
f1 = cbmetrics()
history = model.fit(X_train, y_train_bin, validation_data=(X_val, y_val_bin),
          epochs=3, batch_size=256, verbose=1, callbacks=[f1])

In [None]:
# Per-epoch losses recorded by model.fit
loss_by_epoch = history.history
training_loss, val_loss = loss_by_epoch['loss'], loss_by_epoch['val_loss']

# Epoch numbers start at 1
epoch_count = range(1, len(training_loss) + 1)

# Training loss as a red dashed line, validation loss as a solid blue line
for series, line_style in ((training_loss, 'r--'), (val_loss, 'b-')):
    plt.plot(epoch_count, series, line_style)
plt.legend(['Training Loss', 'Test Loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
# Validation scores: column 1 of the two-way softmax output
# (presumably the target == 1 class, given the to_categorical encoding).
scores = model.predict(X_val, batch_size=1024, verbose=1)[:,1]

In [None]:
from sklearn import metrics
# Recall is stored in `rec`, not `re`, so the imported `re` module name is
# not rebound by this cell.
pr, rec, th = metrics.precision_recall_curve(y_val, scores)
pr, rec, th = pr[:-2], rec[:-2], th[:-1]
# precision + recall can be 0; the resulting NaN is simply not drawn.
with np.errstate(divide='ignore', invalid='ignore'):
    fs = 2*np.divide(np.multiply(pr, rec), np.add(pr, rec))

plt.figure(figsize=(12,8))
plt.plot(th, pr, label = 'precision')
plt.plot(th, rec, label = 'recall')
plt.plot(th, fs, label = 'f score')
plt.xlabel('Threshold')
plt.legend()

In [None]:
# NaN-aware arg-max: a plain argmax/max would return a NaN entry (where
# precision + recall == 0) instead of the real optimum.
opt_thr = th[np.nanargmax(fs)]
print(opt_thr, np.nanmax(fs))