# Load data

In [None]:
# Relative paths to the training and test CSV files read by the cells below.
filename_train = '../input/nlp-getting-started/train.csv'
filename_test = '../input/nlp-getting-started/test.csv'

In [None]:
import pandas as pd

# Read the training CSV, using its `id` column as the DataFrame index.
df_train = pd.read_csv(filename_train, index_col='id')
df_train

In [None]:
# Read the test CSV the same way as the training file (indexed by `id`).
df_test = pd.read_csv(filename_test, index_col='id')
df_test

# Preprocessing

## Clean text

In [None]:
import re, string

def clean_text(text):
    """Normalize a raw text string before lemmatization and tokenization.

    Steps, applied in this order: lower-case the text, drop `[...]` spans,
    drop URLs (`http(s)://...` and `www....`), drop `<...>` tags, drop ASCII
    punctuation, drop newline characters, drop every word containing a digit.

    Arguments:
        text: str / Raw text.

    Returns:
        str / Cleaned text. Matches are deleted rather than replaced by a
        space, so runs of spaces may remain and words separated only by a
        newline are joined together.
    """
    text = text.lower()
    # Raw strings keep regex escapes such as `\[`, `\S`, `\w` and `\d` from
    # being parsed as (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Clean the `text` column of both splits; `clean_text` is passed directly
# because it already takes a single string argument.
df_train['text'] = df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()

# Lemmatize every whitespace-separated token and re-join with single spaces,
# for the training frame first and then the test frame.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(
        lambda text: ' '.join(lem.lemmatize(token) for token in text.split())
    )

## Get explanatory variables

In [None]:
from nltk.tokenize import word_tokenize

def create_corpus(texts):
    """Split each text into a list of lower-cased word tokens.

    E.g. `This is a pen` becomes [ `this`, `is`, `a`, `pen` ].

    Arguments:
        texts: iterable(str) / Texts to tokenize.

    Returns:
        list(list(str)) / One token list per input text, in input order.
    """
    return [
        [token.lower() for token in word_tokenize(text)]
        for text in texts
    ]

In [None]:
# Tokenized corpora for the training and test texts, built the same way.
corpus_train, corpus_test = (
    create_corpus(frame['text']) for frame in (df_train, df_test)
)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenize the corpora into integer sequences. The tokenizer is fitted on the
# training corpus only and then applied to both corpora.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus_train)
seq_train, seq_test = (
    tokenizer.texts_to_sequences(corpus) for corpus in (corpus_train, corpus_test)
)

In [None]:
# Mapping exposed by the fitted tokenizer; its length is reported as the
# number of unique words.
word_index = tokenizer.word_index
print('Number of unique words:',len(word_index))

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sequence length passed to `pad_sequences` as `maxlen`; the model's input
# layer is built with the same value.
num_words = 50
X_train, X_test = (
    pad_sequences(seqs, maxlen=num_words) for seqs in (seq_train, seq_test)
)

## Get objective variable

In [None]:
# Use the `target` column of the training frame as the label vector.
y_train = df_train['target']
y_train

## Get GloVe embedding matrix

In [None]:
import numpy as np

# Parse the GloVe text file into a {word: vector} lookup. Each line holds a
# token followed by its whitespace-separated float components.
embedding_dict = {}
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale encoding.
with open('../input/glove6b/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on `values[0]`.
            continue
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

In [None]:
# One embedding row per tokenizer index, plus one extra row because the
# indices in `word_index` are used directly as row numbers.
max_words = len(word_index) + 1
embedding_dims = 100

# Rows start as zeros; words without a pretrained vector keep the zero row.
embedding_matrix = np.zeros((max_words, embedding_dims))
for word, i in word_index.items():
    # Valid rows are 0..max_words-1, so an index equal to `max_words` must be
    # skipped as well (the previous `>` check would have let it through).
    if i >= max_words:
        continue

    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

# Modeling

In [None]:
import tensorflow as tf

# Sequence classifier: frozen pretrained embeddings -> LSTM -> dropout ->
# single sigmoid unit.
# The tensors are named `inputs`/`outputs` so the builtin `input()` is not
# shadowed in the kernel namespace.
inputs = tf.keras.layers.Input(shape=(num_words,))

x = inputs
# The sequence length is already fixed by the Input layer above, so the
# deprecated `input_length` argument is not passed.
x = tf.keras.layers.Embedding(
    max_words, embedding_dims,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)(x)
x = tf.keras.layers.LSTM(64)(x)
x = tf.keras.layers.Dropout(0.2)(x)

outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.models.Model(inputs, outputs)
model.summary()

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported
# under the metric name `acc`.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
epochs = 100
batch_size = 64

# Early stopping with a patience of 5 epochs; the best weights seen during
# training are restored when it triggers.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=5,
    restore_best_weights=True,
)
callbacks = [early_stopping]

# 10% of the training data is held out for validation.
model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=callbacks,
)

# Prediction

In [None]:
# Run the fitted model on the padded test sequences.
y_pred = model.predict(X_test)
y_pred

In [None]:
# Threshold the predictions at 0.5 (strictly greater -> 1, otherwise 0) and
# drop singleton dimensions before wrapping them in a `target` Series.
labels = (y_pred > 0.5).astype(int).squeeze()
y_pred = pd.Series(labels, name='target').astype(int)
y_pred

In [None]:
# Turn the test index into a positionally indexed column so it lines up with
# the predicted labels when concatenated side by side.
test_ids = df_test.index.to_series().reset_index(drop=True)
answer = pd.concat([test_ids, y_pred], axis=1)
answer

In [None]:
# Write the answer frame to CSV; `index=False` leaves the positional row index
# out of the file.
filename_output = './submission.csv'
answer.to_csv(filename_output, index=False)