### 1. Read data
Read the data from CSV and pull out the question text and the integer target label for each row.

In [None]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
from imblearn.under_sampling import RandomUnderSampler

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

In [None]:
# Location of the training CSV (relative to the working directory).
path = r"train.csv"

In [None]:
# Load the whole CSV into a DataFrame.
df = pd.read_csv(path)

In [None]:
# Display (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first few rows.
df.head()

In [None]:
# Raw question strings (model input) as a NumPy array.
docs = df["question_text"].values
# Matching target labels, one per question.
labels = df["target"].values

In [None]:
# Class balance check: distinct label values and how often each occurs.
np.unique(labels,return_counts=True)

### 2. Preprocessing
Tokenize text, convert words / tokens to indexed integers. Take each document and convert to a sequence of max length 20 (pad with zeroes if shorter).

In [None]:
# prepare tokenizer
t = Tokenizer()
# Build the word -> integer index from the full corpus.
t.fit_on_texts(docs)
# +1 because Keras word indices start at 1; index 0 is left free for padding.
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(vocab_size)

In [None]:
# t.word_index

In [None]:
# encoded_docs

In [None]:
# text_lens=[]
# for title in docs:
#     text_lens.append(len(word_tokenize(title)))

In [None]:
# max(text_lens)

In [None]:
# np.quantile(text_lens,0.85)

In [None]:
# Pad (with trailing zeros) or truncate every document to exactly
# max_length tokens. With pad_sequences' default truncating='pre',
# over-long documents lose tokens from the front.
max_length = 20
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(len(padded_docs))

In [None]:
# Expect (number of documents, max_length).
padded_docs.shape

In [None]:
# Balance the classes by random under-sampling; the fixed seed keeps the
# selection reproducible.
# NOTE(review): resampling happens before the train/test split, so the
# held-out set is also artificially balanced - confirm this is intended.
rus = RandomUnderSampler(random_state=42)
padded_docs_rus,labels_rus = rus.fit_resample(padded_docs,labels)

In [None]:
# Compare shapes after vs. before under-sampling.
padded_docs_rus.shape,labels_rus.shape,padded_docs.shape,labels.shape

In [None]:
# Label counts after under-sampling.
np.unique(labels_rus,return_counts=True)

### 3. Import embeddings
The clever part: import a dictionary of pre-trained GloVe word embeddings that maps each word to a 300-dimensional vector.

In [None]:
# load the whole embedding into memory
EMBEDDING_FILE = r"glove.840B.300d.txt"
# Must match the vector width stored in EMBEDDING_FILE (the "300d" release).
EMBEDDING_DIM = 300

def get_embedding(embedding_file=None):
    """Load word vectors from a GloVe-style text file into a dict.

    Every line is expected to hold a token followed by its
    whitespace-separated float components.

    Args:
        embedding_file: Path of the vectors file. When omitted, the
            module-level ``EMBEDDING_FILE`` is used.

    Returns:
        dict mapping each token (str) to a float32 ``np.ndarray``.
        Blank lines and lines whose components cannot be parsed as
        floats (e.g. tokens that themselves contain spaces) are skipped.
    """
    if embedding_file is None:
        embedding_file = EMBEDDING_FILE

    embeddings_index = {}
    # `with` guarantees the handle is closed even if parsing raises.
    with open(embedding_file, 'r', errors='ignore', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to index.
                continue
            try:
                coefs = np.asarray(values[1:], dtype="float32")
            except ValueError:
                # Non-numeric component; best-effort loader, so skip it.
                continue
            embeddings_index[values[0]] = coefs
    return embeddings_index

# Parse the embedding file once and keep the token -> vector dict at module level.
embeddings_index = get_embedding()

In [None]:
# create a weight matrix for words in training docs

def create_embedding_weights(vocab_size, t, embeddings=None, embedding_dim=300):
    """Build the weight matrix for an Embedding layer from pre-trained vectors.

    Row ``i`` receives the pre-trained vector of the word whose tokenizer
    index is ``i``. Words without a usable vector keep an all-zero row.

    Args:
        vocab_size: Number of rows in the matrix.
        t: Fitted tokenizer; only its ``word_index`` mapping
            (word -> integer index) is read.
        embeddings: Mapping of word -> vector. When omitted, the
            module-level ``embeddings_index`` is used.
        embedding_dim: Width of each vector, i.e. number of matrix columns.

    Returns:
        ``np.ndarray`` of shape ``(vocab_size, embedding_dim)``.
    """
    if embeddings is None:
        embeddings = embeddings_index

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in t.word_index.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is None:
            continue
        # A wrong-length vector would either raise or (for length 1) be
        # silently broadcast across the whole row, so skip it instead.
        if np.shape(embedding_vector) != (embedding_dim,):
            continue
        embedding_matrix[i] = embedding_vector
    print(embedding_matrix.shape)
    return embedding_matrix



In [None]:
# Pre-trained weights, one row per tokenizer index, used to initialise the Embedding layers.
embedding_matrix = create_embedding_weights(vocab_size,t)

### 4. Network architecture

##### Simple model

In [None]:
# define the model: frozen pre-trained embeddings -> flatten -> single sigmoid unit
model = Sequential()
# input_length follows max_length so the model always matches the padded input width.
model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None").
model.summary()

##### LSTM model

In [None]:
## create model: frozen GloVe embeddings -> dropout -> 1D conv -> max-pool -> LSTM -> sigmoid
model_glove = Sequential()
# input_length follows max_length so the model always matches the padded input width.
model_glove.add(Embedding(vocab_size, 300, input_length=max_length, weights=[embedding_matrix], trainable=False))
model_glove.add(Dropout(0.2))
model_glove.add(Conv1D(64, 5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(100))
# Single sigmoid unit for the binary target.
model_glove.add(Dense(1, activation='sigmoid'))
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the Conv1D + LSTM model.
model_glove.summary()

### 5. Training and Evaluation
Is it any good? Let's find out.
Divide our dataset using a holdout strategy:

In [None]:
# split dataset into train and test
from sklearn.model_selection import train_test_split
# Hold out 20% of the under-sampled data for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(padded_docs_rus, labels_rus, test_size=0.2, random_state=42)

In [None]:
# fit the model
# Train for 5 epochs on the training split; verbose=0 suppresses the per-epoch progress output.
model_glove.fit(X_train, y_train, epochs=5, verbose=0)

In [None]:
# evaluate the model
# Returns the loss followed by the compiled metric (accuracy) on the held-out split.
loss, accuracy = model_glove.evaluate(X_test, y_test, verbose=0)
# accuracy is a fraction; scale to a percentage for display.
print('Accuracy: %f' % (accuracy*100))

In [None]:
# Sigmoid outputs of the model for each held-out question.
model_glove.predict(X_test)

In [None]:
# Two ad-hoc questions used to try the trained model on new text.
ques = ['Has the United States become the largest dictatorship in the world?','How should I prepare for IIT K/IIM C/ ISI K PGDBA course exam and interview?']

In [None]:
# Encode the new questions with the tokenizer exactly as it was fitted on the
# training docs. Do NOT call t.fit_on_texts() here: refitting updates the word
# counts and rebuilds word_index, which can renumber known words and assign
# unseen words indices >= vocab_size, i.e. outside the trained Embedding.
# Words the tokenizer has never seen are simply dropped by texts_to_sequences.
encoded_ques = t.texts_to_sequences(ques)
# Same padded width as the training data.
max_length = 20
padded_ques = pad_sequences(encoded_ques, maxlen=max_length, padding='post')
print(len(padded_ques))

In [None]:
# Sigmoid outputs of the model for the two ad-hoc questions.
model_glove.predict(padded_ques)