In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import zipfile 
import gensim
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, GRU, LSTM,Dropout,Input,Bidirectional,GlobalMaxPool1D,Reshape,Conv1D,Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow_addons as tfa
import tensorflow as tf
# Read the Kaggle input CSVs, in order, into the train (`trn`) and test (`tst`) frames.
trn, tst = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

In [None]:
# Class balance of the training labels: `target` is summed to count the
# positive rows, and the remainder of the frame is reported as negative.
positive_count = trn.target.sum()
negative_count = trn.shape[0] - positive_count
print('Number of Positive Cases:', positive_count)
print('Number of Negative Cases:', negative_count)

In [None]:
# Per-row length of the `text` column, measured in characters and in
# whitespace-separated words.
num_chars = trn.text.str.len()
num_words = trn.text.str.split().str.len()

fig, axs = plt.subplots(1, 2)

# Left panel: distribution of word counts.
axs[0].hist(num_words, edgecolor='black', bins=10)
axs[0].set_xlabel('Number of Words per Case')
axs[0].set_ylabel('Frequency')

# Right panel: distribution of character counts. The axis is labelled in
# characters because this histogram plots `num_chars`, not `num_words`.
axs[1].hist(num_chars, edgecolor='black', bins=25)
axs[1].set_xlabel('Number of Characters per Case')
axs[1].set_ylabel('Frequency')

# Widen the figure area and add horizontal space between the two panels.
# The trailing semicolon keeps the cell from echoing a return value.
plt.subplots_adjust(bottom=0, left=0, right=1.5, top=0.66, wspace=0.25);


### Split and Tokenize the Text

In [None]:
# Text of the test rows; only the `text` column is taken from `tst` here.
X_tst = tst.text

# Features are the raw `text` column; labels are the `target` column.
X, y = trn.text, trn.target

# Set aside 10% of the labelled rows as a hold-out set. The fixed
# random_state makes the split repeatable across runs.
X_trn, X_hld, y_trn, y_hld = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=123123,
)

In [None]:
# Tunables for tokenisation: vocabulary cap handed to the Tokenizer, and the
# fixed length every sequence is padded to.
max_vocab_size = 10000
max_seq_len = 145

# The tokenizer is fitted on the training split only, then reused unchanged
# for the hold-out and test texts.
tokenized_vocab = Tokenizer(num_words=max_vocab_size)
tokenized_vocab.fit_on_texts(X_trn)

# Convert each split's texts into lists of integer token ids.
tokenized_vocab_trn, tokenized_vocab_hld, tokenized_vocab_tst = (
    tokenized_vocab.texts_to_sequences(texts)
    for texts in (X_trn, X_hld, X_tst)
)

# V counts every entry in the fitted word_index.
V = len(tokenized_vocab.word_index)
print(f'number of unique tokens is {V}')

# Pad each split to max_seq_len; padding='post' appends the padding at the
# end of each sequence.
trn_padded, hld_padded, tst_padded = (
    pad_sequences(sequences, maxlen=max_seq_len, padding='post')
    for sequences in (tokenized_vocab_trn, tokenized_vocab_hld, tokenized_vocab_tst)
)

### Load Embedding Matrix

In [None]:
# Open the embeddings archive and the word2vec binary inside it with context
# managers so both handles are released once the vectors have been loaded.
with zipfile.ZipFile('/kaggle/input/quora-insincere-questions-classification/embeddings.zip', 'r') as archive:
    with archive.open('GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin', 'r') as news_path:
        word2vec_embeddings = gensim.models.KeyedVectors.load_word2vec_format(news_path, binary=True)

# Build a (nb_words, embed_dim) matrix whose row i holds the pretrained vector
# for the token with index i in the fitted tokenizer.
# NOTE(review): the extra row (V + 1) presumably exists because word_index
# starts at 1, leaving row 0 for the padding id -- confirm.
nb_words = V+1
embed_dim = 300
embedding_matrix = np.zeros((nb_words, embed_dim))
word_index = tokenized_vocab.word_index
for word, i in word_index.items():
    # Tokens missing from the pretrained vocabulary keep their all-zero row.
    if word in word2vec_embeddings.key_to_index:
        embedding_matrix[i] = word2vec_embeddings.get_vector(word)

#### Metrics to Measure

In [None]:
# Metrics reported during training. Every metric gets an explicit name so the
# keys in the training history stay stable: the plotting cell reads 'auc',
# 'val_auc', 'f1_score' and 'val_f1_score'. Without a pinned name, an AUC
# metric created again in the same kernel can be auto-named with a numeric
# suffix (e.g. 'auc_1'), which breaks those lookups on a cell re-run.
model_metrics = [tf.keras.metrics.AUC(name='auc'),
                 tf.keras.metrics.BinaryAccuracy(name='binary_accuracy'),
                 tfa.metrics.F1Score(num_classes=1, average='macro',threshold=0.5, name='f1_score')]

#### Test Set Generation

In [None]:
def get_tst_preds(model,output_name):
    """Predict on the padded test set, threshold at 0.5, and write a CSV.

    Parameters
    ----------
    model : object exposing ``predict``
        Called as ``model.predict(tst_padded)``; the result is flattened to
        one score per test row.
    output_name : str
        Path of the CSV file to write (comma separated, no index column).

    Returns
    -------
    pandas.DataFrame
        Two columns: ``id`` (taken from the module-level ``tst`` frame) and
        ``target`` (integer 1 where the score is > 0.5, otherwise 0).

    Notes
    -----
    Reads the module-level ``tst`` and ``tst_padded`` objects.
    """
    scores = np.asarray(model.predict(tst_padded))
    # Collapse the prediction array to one score per row.
    scores = scores.reshape(scores.shape[0],)
    # Build the frame positionally and store the thresholded labels as a real
    # integer column. Assigning to a new attribute (``frame.prediction = ...``)
    # does not create a column, so the labels would be written as floats.
    final_predictions = pd.DataFrame({
        'id': tst.id.to_numpy(),
        'target': (scores > 0.5).astype('int'),
    })
    final_predictions.to_csv(output_name, sep=',', index=False)
    return final_predictions

### Basic Model

In [None]:
# Input: one padded token-id sequence per example. The shape is passed as a
# one-element tuple, which is the documented form for Input(shape=...).
input_layer = Input(shape = (trn_padded.shape[1],))
# Frozen embedding layer initialised from the pretrained embedding matrix.
embed_layer = Embedding(nb_words,embed_dim,weights = [embedding_matrix],input_length = max_seq_len,trainable = False) (input_layer)
# Bidirectional LSTM that returns only its final output (return_sequences=False).
lstm_layer = Bidirectional(LSTM(50,dropout = 0.25,return_sequences = False))(embed_layer)
# Single sigmoid unit producing the binary-class score.
dense_layer = Dense(1,activation = 'sigmoid') (lstm_layer)

build_model = Model(inputs = input_layer, outputs = dense_layer)
build_model.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics = model_metrics)
# Train for 10 epochs, evaluating on the hold-out split after each epoch.
build_model.fit(trn_padded,y_trn,batch_size = 128, validation_data = (hld_padded,y_hld),epochs = 10)

In [None]:
# Plot hold-out vs. train curves for F1, AUC and loss, one panel per metric.
# Each panel gets a title, axis labels and a legend so the line labels are
# actually rendered and the figure can be read on its own.
fit_history = build_model.history.history
fig,axs = plt.subplots(1,3,figsize=(20,5))
curve_panels = [('f1_score', 'F1'), ('auc', 'AUC'), ('loss', 'Loss')]
for panel_ax, (history_key, display_name) in zip(axs, curve_panels):
    # Validation values are stored under the same key prefixed with 'val_'.
    panel_ax.plot(fit_history['val_' + history_key], label = 'Hold ' + display_name)
    panel_ax.plot(fit_history[history_key], label = 'Train ' + display_name)
    panel_ax.set_title(display_name + ' per Epoch')
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(display_name)
    panel_ax.legend()

In [None]:
# Score the test set with the trained model and write the result to 'submission.csv'.
get_preds = get_tst_preds(
    model=build_model,
    output_name='submission.csv',
)

In [None]:
]