# **Import libraries**

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.downloader

In [1]:
# NLTK setup: tokenizer function and the English stop-word list
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the tokenizer model first, then the stop-word corpus
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

stop_words = stopwords.words('english')

In [1]:
##Importing Libraries for Neural Nets
import tensorflow as tf
import keras
from keras.models import Sequential, model_from_json
from keras import layers
from keras.layers import Input, Dense, Dropout, Activation, LSTM, GRU, GlobalAvgPool1D, GlobalMaxPool1D
import math
from tqdm import tqdm
import pickle 
import tensorflow as tf
from keras.utils.np_utils import to_categorical
from tqdm.keras import TqdmCallback

In [1]:
!pip install keras-self-attention
from keras_self_attention import SeqSelfAttention

# **Download embeddings**

In [1]:
# List the names of the pre-trained models known to gensim's downloader
# (nothing is downloaded here; this only prints the catalogue keys)
available_models = gensim.downloader.info()['models']
pprint(list(available_models))

In [1]:
# Load the pre-trained GloVe Twitter word vectors.
# Takes about 5 minutes to execute, for 100-dim twitter vectors
# Takes about 10+ minutes to execute, for 200-dim twitter vectors
glove_vectors = gensim.downloader.load('glove-twitter-100')
glove = glove_vectors  # short alias used by the embedding cells

# Read the dimensionality from the loaded model instead of hard-coding it,
# so it stays correct if a different GloVe variant is loaded above.
embedding_length = glove_vectors.vector_size

# **Pre-process the text**

In [1]:
# Read the competition CSV files from the Kaggle input directory
path = '/kaggle/input/nlp-getting-started/'
train = pd.read_csv(f'{path}train.csv')
test = pd.read_csv(f'{path}test.csv')
for frame in (train, test):
    print(frame.shape)

# Stack the tweets of both splits (train first, then test) so that they go
# through the same pre-processing
text_train = np.array(train['text'])
text_test = np.array(test['text'])
text = np.concatenate([text_train, text_test], axis = 0)
print(text.shape)

In [1]:
# Clean every tweet in `text` in place:
#   tokenize -> keep alphabetic tokens -> lowercase -> drop stop words -> re-join.
# The first tweet is printed after every stage as a spot check.

# word tokenization
for i in range(len(text)):
    text[i] = word_tokenize(text[i])
print('After tokenization:')
print(text[0])

# filter out punctuation (any token that is not purely alphabetic is dropped)
for i in range(len(text)):
    text[i] = [word for word in text[i] if word.isalpha()]
print('After filtering out punctuation:')
print(text[0])

# make words lowercase
for i in range(len(text)):
    text[i] = [word.lower() for word in text[i]]
print('After making lowercase:')
print(text[0])

# remove stopwords; a set gives constant-time membership tests instead of
# scanning the stop-word list once per token
stop_word_set = set(stop_words)
for i in range(len(text)):
    text[i] = [word for word in text[i] if word not in stop_word_set]
print('After removing stopwords:')
print(text[0])

# concatenate list of words; every word (including the last) is followed by a
# single space, so the resulting strings are unchanged from the earlier version
for i in range(len(text)):
    text[i] = ''.join(word + ' ' for word in text[i])
print('After concatenating words:')
print(text[0])  # show the result announced by the label above
text = np.array(text)

# **Get an embedding for each tweet (average of its word embeddings)**





In [1]:
# Bind the short alias `glove` to the loaded word vectors; the embedding cells below use this name
glove = glove_vectors

In [1]:
# Deep-averaging features: one vector per tweet, the mean of the GloVe vectors
# of its in-vocabulary words (an all-zero vector when no word is known).
embeddings = []
for sentence in text:
    embedding = np.zeros(embedding_length)
    word_count = 0
    # Each entry of `text` is a space-joined string, so it has to be split into
    # words; iterating the string directly would walk over single characters.
    for word in sentence.split():
        if word in glove.key_to_index:
            embedding += glove.get_vector(word)
            # Only words that contributed a vector count towards the mean,
            # matching how the keyword/location features are averaged.
            word_count += 1
    if word_count != 0:
        embedding /= word_count
    embeddings.append(embedding)
embeddings = np.array(embeddings)

In [1]:
# Display the shape of the averaged-embedding matrix (one row per tweet in `text`)
embeddings.shape

In [1]:
# Rows of `embeddings` follow the order of `text`: training tweets first, then
# test tweets. Derive the split point from the data instead of hard-coding it.
n_train = len(train)
X_train = embeddings[:n_train]
X_test = embeddings[n_train:]

y_train = train['target'].values

In [1]:
# Print the shapes of the train features, test features and train labels
for split_array in (X_train, X_test, y_train):
    print(split_array.shape)

# **Create Model**
#### (Average the embeddings of the words in each tweet and classify the result with a feed-forward neural network)

In [1]:
# Hyper-parameters for the averaged-embedding network
act = 'tanh'      # activation of the hidden Dense layers
opt = 'adam'      # optimizer passed to model.compile
epoch = 10        # number of training epochs
batch_len = 32    # mini-batch size
val_split = 0.2   # fraction of the training data used for validation

In [1]:
# Feed-forward classifier on the averaged tweet embeddings
keras.backend.clear_session()
# `shape` must be a tuple; `(n)` without the trailing comma is just an int
inputs = keras.Input(shape = (X_train.shape[1],))
x = layers.Dense(32, activation = act)(inputs)
x = layers.Dense(32, activation = act)(x)
x = layers.Dense(16, activation = act)(x)
# Sigmoid output: the model is trained with 'binary_crossentropy', which (with
# its default settings) expects probabilities rather than raw logits
outputs = layers.Dense(1, activation = 'sigmoid')(x)
model = keras.Model(inputs = inputs, outputs = outputs, name = 'Deep-Averaging-Network')

In [1]:
# Compile with binary cross-entropy and train, tracking accuracy on a validation split
model.compile(loss = 'binary_crossentropy', optimizer = opt, metrics = ['accuracy'])
model.fit(
    X_train,
    y_train,
    batch_size = batch_len,
    epochs = epoch,
    validation_split = val_split,
    verbose = 1,
)

# **Get an embedding for each tweet (concatenation of its word embeddings)**

In [1]:
# Concatenated features: the GloVe vectors of the first `max_words` words of a
# tweet laid end to end, giving a flat vector of max_words * embedding_length.
max_words = 22
embeddings = []
for sentence in text:
    # Pre-zeroed buffer: rows for out-of-vocabulary words and for the padding
    # after the last word simply stay zero.
    word_vectors = np.zeros((max_words, embedding_length))
    # Truncate to `max_words` so every tweet yields the same length; without
    # this a longer tweet would make the stacked array ragged.
    for position, word in enumerate(sentence.split()[:max_words]):
        if word in glove.key_to_index:
            word_vectors[position] = glove.get_vector(word)
    embeddings.append(word_vectors.reshape(-1))
embeddings = np.array(embeddings)

In [1]:
# Training rows come first in `embeddings`, followed by the test rows; take the
# split point from the training frame instead of hard-coding the row count.
n_train = len(train)
X_train = embeddings[:n_train]
X_test = embeddings[n_train:]
y_train = train['target'].values

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

# **Create Model (Concatenated Embeddings Model)**

In [1]:
# Hyper-parameters for the concatenated-embedding network
act = 'tanh'      # activation of the hidden Dense layers
opt = 'adam'      # optimizer passed to model.compile
epoch = 10        # number of training epochs
batch_len = 32    # mini-batch size
val_split = 0.2   # fraction of the training data used for validation

In [1]:
# Feed-forward classifier with dropout on the concatenated word embeddings
keras.backend.clear_session()
# `shape` must be a tuple; `(n)` without the trailing comma is just an int
inputs = keras.Input(shape = (X_train.shape[1],))
# The input size is already fixed by `inputs`, so no `input_dim` is passed here
x = layers.Dense(32, activation = act)(inputs)
x = layers.Dropout(0.3)(x)
x = layers.Dense(32, activation = act)(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(16, activation = act)(x)
x = layers.Dropout(0.2)(x)
# Sigmoid output: the model is trained with 'binary_crossentropy', which (with
# its default settings) expects probabilities rather than raw logits
outputs = layers.Dense(1, activation = 'sigmoid')(x)
model = keras.Model(inputs = inputs, outputs = outputs, name = 'Glove-FFN-Concatenated')

In [1]:
# Compile with binary cross-entropy and train, tracking accuracy on a validation split
model.compile(loss = 'binary_crossentropy', optimizer = opt, metrics = ['accuracy'])
model.fit(
    X_train,
    y_train,
    batch_size = batch_len,
    epochs = epoch,
    validation_split = val_split,
    verbose = 1,
)

# **Create Embeddings (for LSTM)**

Understanding input sizes for using LSTM: https://stackoverflow.com/questions/50418973/how-lstm-work-with-word-embeddings-for-text-classification-example-in-keras

In [1]:
# Sequence features for the recurrent models: one (max_words, embedding_length)
# matrix per tweet, holding the GloVe vector of each word in order.
max_words = 22
# Pre-zeroed tensor: out-of-vocabulary words and the padding positions after
# the last word keep their zero rows.
embeddings = np.zeros((len(text), max_words, embedding_length))
for row, sentence in enumerate(text):
    # Truncate to `max_words` so that a longer tweet cannot break the fixed
    # (max_words, embedding_length) shape expected by the models.
    for position, word in enumerate(sentence.split()[:max_words]):
        if word in glove.key_to_index:
            embeddings[row, position] = glove.get_vector(word)
print(embeddings.shape)

In [1]:
# Training rows come first in `embeddings`, followed by the test rows; take the
# split point from the training frame instead of hard-coding the row count.
n_train = len(train)
X_train = embeddings[:n_train]
X_test = embeddings[n_train:]
y_train = train['target'].values

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

# **Create and train LSTM model**
#### (Using only the text feature)

In [1]:
# Hyper-parameters for the LSTM model
act = 'tanh'      # activation of the hidden Dense layers
opt = 'adam'      # optimizer passed to model.compile
epoch = 10        # number of training epochs
batch_len = 32    # mini-batch size
val_split = 0.2   # fraction of the training data used for validation

In [1]:
# LSTM encoder over the word-vector sequence, followed by a small dense head
keras.backend.clear_session()
inputs = keras.Input(shape = (22, embedding_length))
x = layers.LSTM(64)(inputs)
x = layers.Dense(32, activation = act)(x)
x = layers.Dense(16, activation = act)(x)
# Sigmoid output: the model is trained with 'binary_crossentropy', which (with
# its default settings) expects probabilities rather than raw logits
outputs = layers.Dense(1, activation = 'sigmoid')(x)
model = keras.Model(inputs = inputs, outputs = outputs, name = 'Glove-LSTM')

In [1]:
# Compile with binary cross-entropy and train, tracking accuracy on a validation split
model.compile(loss = 'binary_crossentropy', optimizer = opt, metrics = ['accuracy'])
model.fit(
    X_train,
    y_train,
    batch_size = batch_len,
    epochs = epoch,
    validation_split = val_split,
    verbose = 1,
)

# **Create and train GRU model**

In [1]:
# Hyper-parameters for the GRU model
act = 'tanh'      # activation of the hidden Dense layers
opt = 'adam'      # optimizer passed to model.compile
epoch = 10        # number of training epochs
batch_len = 16    # mini-batch size
val_split = 0.2   # fraction of the training data used for validation

In [1]:
# GRU encoder over the word-vector sequence, followed by a small dense head
keras.backend.clear_session()
inputs = keras.Input(shape = (22, embedding_length))
x = layers.GRU(64)(inputs)
x = layers.Dense(32, activation = act)(x)
x = layers.Dense(16, activation = act)(x)
# Sigmoid output: the model is trained with 'binary_crossentropy', which (with
# its default settings) expects probabilities rather than raw logits
outputs = layers.Dense(1, activation = 'sigmoid')(x)
model = keras.Model(inputs = inputs, outputs = outputs, name = 'Glove-GRU')

In [1]:
# Compile with binary cross-entropy and train, tracking accuracy on a validation split
model.compile(loss = 'binary_crossentropy', optimizer = opt, metrics = ['accuracy'])
model.fit(
    X_train,
    y_train,
    batch_size = batch_len,
    epochs = epoch,
    validation_split = val_split,
    verbose = 1,
)

# **GRU model with self-attention**

In [1]:
# Hyper-parameters for the GRU + self-attention model
act = 'tanh'      # activation of the hidden Dense layers
opt = 'adam'      # optimizer passed to model.compile
epoch = 10        # number of training epochs
batch_len = 16    # mini-batch size
val_split = 0.2   # fraction of the training data used for validation

In [1]:
# GRU that returns the full sequence, self-attention over its outputs, max-pooling
# over the sequence axis, then a small dense head
keras.backend.clear_session()
inputs = keras.Input(shape = (22, embedding_length))
x = layers.GRU(64, return_sequences = True)(inputs)
x = SeqSelfAttention(attention_activation = 'tanh')(x)
x = layers.GlobalMaxPool1D()(x)
x = layers.Dense(32, activation = act)(x)
x = layers.Dense(16, activation = act)(x)
# Sigmoid output: the model is trained with 'binary_crossentropy', which (with
# its default settings) expects probabilities rather than raw logits
outputs = layers.Dense(1, activation = 'sigmoid')(x)
model = keras.Model(inputs = inputs, outputs = outputs, name = 'GRU-self-attention')

In [1]:
# Compile with binary cross-entropy and train, tracking accuracy on a validation split
model.compile(loss = 'binary_crossentropy', optimizer = opt, metrics = ['accuracy'])
model.fit(
    X_train,
    y_train,
    batch_size = batch_len,
    epochs = epoch,
    validation_split = val_split,
    verbose = 1,
)

# **Preparing "keyword" feature**

In [1]:
# Inspect the distinct values of the 'keyword' column in the training data
train['keyword'].unique()

In [1]:
def embed_keywords(frame):
    """Return one averaged GloVe vector per row of ``frame['keyword']``.

    Missing keywords are replaced by the string 'none', "%20" is replaced by a
    space, and the text is lower-cased before lookup. Each keyword is then
    represented by the mean of the GloVe vectors of its in-vocabulary words
    (an all-zero vector when none of its words is known).

    Parameters
    ----------
    frame : pandas.DataFrame
        Data with a 'keyword' column.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(frame), embedding_length).
    """
    # replace "%20" with " "; lower-case so the lookup matches the lower-cased
    # tokens used for the tweet text
    keywords = (
        frame['keyword']
        .fillna('none')
        .replace('%20', ' ', regex = True)
        .str.lower()
        .tolist()
    )

    # replace keywords with embeddings
    keyword_embeddings = []
    for keyword in keywords:
        embedding = np.zeros(embedding_length)
        word_count = 0
        for word in keyword.split():
            if word in glove.key_to_index:
                embedding += glove.get_vector(word)
                word_count += 1
        if word_count != 0:
            embedding /= word_count
        keyword_embeddings.append(embedding)
    return np.array(keyword_embeddings)


## TRAIN DATA "KEYWORDS" PROCESSING
keyword_embeddings_train = embed_keywords(train)

## TEST DATA "KEYWORDS" PROCESSING
keyword_embeddings_test = embed_keywords(test)

In [1]:
# Print the shapes of the train and test keyword feature matrices
for keyword_matrix in (keyword_embeddings_train, keyword_embeddings_test):
    print(keyword_matrix.shape)

# **Preparing "location" feature**

In [1]:
# Print how many distinct locations the training data has, then display them
train_locations = train['location']
print(train_locations.nunique())
train_locations.unique()

In [1]:
def embed_locations(frame):
    """Return one averaged GloVe vector per row of ``frame['location']``.

    Missing locations are replaced by the string 'none', every character that
    is not an ASCII letter or a space is replaced by a space, and the text is
    lower-cased before lookup. Each location is then represented by the mean of
    the GloVe vectors of its in-vocabulary words (an all-zero vector when none
    of its words is known).

    Parameters
    ----------
    frame : pandas.DataFrame
        Data with a 'location' column.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(frame), embedding_length).
    """
    # Lower-case before the lookup: the tweet text is lower-cased before it is
    # matched against the vocabulary, and location strings keep mixed case
    # (the regex allows A-Z), so capitalised tokens would otherwise be looked
    # up as-is.
    locations = (
        frame['location']
        .fillna('none')
        .replace('[^a-zA-Z ]', ' ', regex = True)
        .str.lower()
        .tolist()
    )

    # replace locations with embeddings
    location_embeddings = []
    for location in locations:
        embedding = np.zeros(embedding_length)
        word_count = 0
        for word in location.split():
            if word in glove.key_to_index:
                embedding += glove.get_vector(word)
                word_count += 1
        if word_count != 0:
            embedding /= word_count
        location_embeddings.append(embedding)
    return np.array(location_embeddings)


## TRAIN DATA "LOCATION" PROCESSING
location_embeddings_train = embed_locations(train)

## TEST DATA "LOCATION" PROCESSING
location_embeddings_test = embed_locations(test)

In [1]:
# Print the shapes of the train and test location feature matrices
for location_matrix in (location_embeddings_train, location_embeddings_test):
    print(location_matrix.shape)

In [1]:
# Join the keyword and location vectors column-wise into one non-sequential
# feature vector per tweet
non_sequential_train = np.concatenate(
    [keyword_embeddings_train, location_embeddings_train], axis = 1
)
non_sequential_test = np.concatenate(
    [keyword_embeddings_test, location_embeddings_test], axis = 1
)
for feature_matrix in (non_sequential_train, non_sequential_test):
    print(feature_matrix.shape)

# **Multi-input GRU model with self-attention (both sequential and non-sequential inputs)**

In [1]:
# Hyper-parameters for the combined (sequential + non-sequential input) model
act = 'tanh'      # activation of the hidden Dense layers
opt = 'adam'      # optimizer passed to model.compile
epoch = 20        # number of training epochs
batch_len = 16    # mini-batch size
val_split = 0.2   # fraction of the training data used for validation

In [1]:
# Two-input model: a GRU + self-attention branch encodes the word-vector
# sequence; its pooled output is concatenated with the keyword/location vector
# and passed through a dense head.
keras.backend.clear_session()
seq_input = keras.Input(shape = (22, embedding_length))
# `shape` must be a tuple; `(n)` without the trailing comma is just an int
non_seq_input = keras.Input(shape = (2 * embedding_length,))
x = layers.GRU(64, return_sequences = True)(seq_input)
x = SeqSelfAttention(attention_activation = 'tanh')(x)
x = layers.GlobalMaxPool1D()(x)
x = layers.concatenate([x, non_seq_input])
x = layers.Dense(64, activation = act)(x)
x = layers.Dense(16, activation = act)(x)
# Sigmoid output: the model is trained with 'binary_crossentropy', which (with
# its default settings) expects probabilities rather than raw logits
output = layers.Dense(1, activation = 'sigmoid')(x)
model = keras.Model(inputs = [seq_input, non_seq_input], outputs = output, name = 'complete_model')

In [1]:
# Configure training: binary cross-entropy loss, accuracy as the reported metric
model.compile(
    loss = 'binary_crossentropy',
    optimizer = opt,
    metrics = ['accuracy'],
)

In [1]:
# Train on both inputs: the word-vector sequences and the keyword/location vectors
model.fit(
    [X_train, non_sequential_train],
    y_train,
    batch_size = batch_len,
    epochs = epoch,
    validation_split = val_split,
    verbose = 1,
)

In [1]:
# Draw a diagram of the current `model` with shape display enabled
tf.keras.utils.plot_model(model, show_shapes = True)

# **Get results for test set and generate CSV**

In [1]:
# Score the test set with the two-input model (sequence + keyword/location features)
y_test = model.predict([X_test, non_sequential_test])

# Label a tweet 1 when its model output exceeds 0.5, otherwise 0
y_pred = [1 if score[0] > 0.5 else 0 for score in y_test]

# Attach the labels to the test frame and write the id/target columns to pred.csv
test['target'] = y_pred
final = test[['id', 'target']]
final.to_csv('pred.csv', index = False)
final