In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Target Variable Exploration

In [None]:
# Load the training split of the "nlp-getting-started" dataset and preview the first rows.
train_data = pd.read_csv("../input/nlp-getting-started/train.csv")
train_data.head(5)

In [None]:
# Load the test split of the same dataset and preview the first rows.
test_data = pd.read_csv("../input/nlp-getting-started/test.csv")
test_data.head(5)

In [None]:
# Column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test_data.info()

In [None]:
# Class balance of the `target` label.
# The column is passed by keyword: handing seaborn the values positionally was
# deprecated in 0.11, and later releases take `data` as the first positional
# parameter, so the old call no longer means "x = this column".
sns.countplot(x='target', data=train_data);

# Data Preprocessing

## Text Cleaning and Preprocessing

In [None]:
!pip install BeautifulSoup4

In [None]:
from bs4 import BeautifulSoup # Text Cleaning
import re, string # Regular Expressions, String
from nltk.corpus import stopwords # stopwords
from nltk.stem.porter import PorterStemmer # for word stemming
from nltk.stem import WordNetLemmatizer # for word lemmatization
import unicodedata
import html

# Tokens dropped during cleaning: NLTK's English stopwords together with
# every single punctuation character from string.punctuation.
stop = set(stopwords.words('english')) | set(string.punctuation)

def clean_tweets(text):
    """Normalize a raw tweet into a lowercase, space-separated token string.

    Steps, in order: undo common HTML-entity/escaping debris, drop non-ASCII
    characters, strip HTML tags, remove ``[...]`` spans and ``http...`` URLs,
    drop ``@`` and ``#`` markers, collapse every run of digits into the
    placeholder token ``num``, normalize whitespace, delete any remaining
    non-letter characters, remove tokens found in the module-level ``stop``
    set, and lemmatize each word first as a noun and then as a verb.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN coming from a
            pandas column) are treated as empty.

    Returns:
        The cleaned text as a single lowercase string; may be empty.
    """
    if not isinstance(text, str):
        return ""

    # Undo HTML-entity and escaping debris. The pairs are applied strictly in
    # this order, because later replacements can match earlier results.
    replacements = (
        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '),
        ('#36;', '$'), ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"),
        ('\\"', '"'), ('<unk>', 'u_n'), (' @.@ ', '.'), (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    text = text.lower()
    for old, new in replacements:
        text = text.replace(old, new)
    text = re.sub(r'  +', ' ', html.unescape(text))

    # remove non-ascii characters (NFKD first, so accented letters keep their base letter)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # strip html
    text = BeautifulSoup(text, 'html.parser').get_text()

    # remove between square brackets
    text = re.sub(r'\[[^]]*\]', '', text)

    # remove URLs
    text = re.sub(r'http\S+', '', text)

    # remove twitter tags and hashtags (only the marker character is dropped)
    text = text.replace("@", "").replace("#", "")

    # Replace every number with the token "num". This must happen before the
    # non-letter filter below; placed after it, the digits are already gone
    # and the substitution can never match.
    text = re.sub(r'\d+', ' num ', text)

    # Turn newlines/tabs into plain spaces first: the filter below keeps only
    # letters and ' ', so any other whitespace would be deleted and the words
    # on either side of it glued together.
    text = re.sub(r'\s+', ' ', text)

    # remove all non-alphabetic characters
    text = re.sub(r'[^a-zA-Z ]', '', text)

    # remove stopwords (text is already lowercase and split() already strips)
    words = [word for word in text.split() if word not in stop]

    # lemmatize words: noun pass first, then verb pass
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(lemmatizer.lemmatize(word), pos = 'v') for word in words]

    return " ".join(words).lower()

# Clean every training tweet into a new `prep_text` column (the raw `text` column is kept) and preview it.
train_data['prep_text'] = train_data['text'].apply(clean_tweets)
train_data['prep_text'].head(5)

In [None]:
# Clean the test tweets with the same pipeline as the training tweets.
# The cleaned text is still written to `text`, because later cells read it
# from there. The raw tweets are preserved in `raw_text` and are always the
# input, so re-running this cell never cleans already-cleaned text.
if 'raw_text' not in test_data.columns:
    test_data['raw_text'] = test_data['text']
test_data['text'] = test_data['raw_text'].apply(clean_tweets)
test_data['text'].head(5)

# Text Representation

## Text One-Hot Encoding

In [None]:
from keras.preprocessing.text import Tokenizer # Text tokenization

# Word-level tokenizer limited by `vocab_size`, with 'UNK' as the
# out-of-vocabulary token. Its vocabulary is learned from the cleaned
# training and test tweets together.
vocab_size = 1000
tokenizer = Tokenizer(oov_token = 'UNK', num_words = vocab_size)
tokenizer.fit_on_texts(
    list(train_data['prep_text']) + list(test_data['text'])
)

In [None]:
# Bag-of-words matrices in the tokenizer's 'binary' mode (one row per tweet),
# plus the integer label vector taken from the `target` column.
y_train = train_data['target'].to_numpy().astype(int)
X_train_ohe = tokenizer.texts_to_matrix(train_data['prep_text'], mode = 'binary')
X_test_ohe = tokenizer.texts_to_matrix(test_data['text'], mode = 'binary')

print(f"X_train shape: {X_train_ohe.shape}")
print(f"X_test shape: {X_test_ohe.shape}")
print(f"y_train shape: {y_train.shape}")

## Modeling on a simple Neural Network

## Train Validation Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a validation set; the fixed random_state makes the split repeatable.
# NOTE: X_train_ohe / y_train are rebound to the training portion, so re-running this cell
# without first re-running the cell that builds them splits the already-reduced data again.
X_train_ohe, X_val_ohe, y_train, y_val = train_test_split(X_train_ohe, y_train, random_state = 42, test_size = 0.2)

print(f"X_train shape: {X_train_ohe.shape}")
print(f"X_val shape: {X_val_ohe.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")

## Setting up the model

In [None]:
from keras.models import Sequential
from keras import layers, metrics, optimizers, losses

def setup_model(input_dim=None, learning_rate=0.001):
    """Build and compile a single-layer (logistic-regression style) classifier.

    Args:
        input_dim: Width of the input feature vectors. When omitted, the
            module-level ``vocab_size`` is read at call time, which matches
            the bag-of-words matrices built in this notebook. Pass it
            explicitly when ``vocab_size`` may have been rebound since the
            features were created.
        learning_rate: Step size for the RMSprop optimizer.

    Returns:
        A compiled ``Sequential`` model with one sigmoid output unit, using
        binary cross-entropy as the loss and binary accuracy as the metric.
    """
    if input_dim is None:
        input_dim = vocab_size

    model = Sequential()
    model.add(layers.Dense(1, activation='sigmoid', input_shape=(input_dim,)))

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer=optimizers.RMSprop(learning_rate=learning_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[metrics.binary_accuracy])

    return model

# Build a fresh, untrained model and print its architecture summary.
model = setup_model()
model.summary()

In [None]:
# Train on the binary bag-of-words features, evaluating on the validation split after every epoch.
history = model.fit(X_train_ohe, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_ohe, y_val))

In [None]:
# Score the trained model on the validation split and report the accuracy.
# The loss gets a real name instead of `_`: assigning to `_` at the top level
# of a notebook shadows IPython's "previous output" variable.
val_loss, accuracy = model.evaluate(X_val_ohe, y_val)
print(f"Validation accuracy: {accuracy:.4f}")

## Learning Curves

In [None]:
import matplotlib.pyplot as plt

def plot_history(history):
    """Plot the training loss (and validation loss, when recorded) per epoch.

    Args:
        history: Object whose ``history`` attribute is a dict mapping metric
            names to per-epoch value lists, such as the object returned by
            ``model.fit``. It must contain ``'loss'``. ``'val_loss'`` is
            plotted only when present, so histories from fits without
            validation data are supported too.
    """
    loss = history.history['loss']
    val_loss = history.history.get('val_loss')

    epochs = range(1, len(loss) + 1)

    # "bo" is for "blue dot"
    plt.plot(epochs, loss, 'bo', label='Training loss')
    if val_loss is not None:
        # b is for "solid blue line"
        plt.plot(epochs, val_loss, 'b', label='Validation loss')
        plt.title('Training and validation loss')
    else:
        plt.title('Training loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.show()
    
# Loss curves for the model trained on the binary bag-of-words features.
plot_history(history)

## Word-Count Representation

In [None]:
# Bag-of-words matrices in the tokenizer's 'count' mode (one row per tweet),
# plus a fresh, un-split label vector from the `target` column.
y_train = train_data['target'].to_numpy().astype(int)
X_train_wc = tokenizer.texts_to_matrix(train_data['prep_text'], mode = 'count')
X_test_wc = tokenizer.texts_to_matrix(test_data['text'], mode = 'count')

print(f"X_train shape: {X_train_wc.shape}")
print(f"X_test shape: {X_test_wc.shape}")
print(f"y_train shape: {y_train.shape}")


## Train Validation Split

In [None]:
# Hold out 20% of the rows as a validation set; the fixed random_state makes the split repeatable.
# NOTE: X_train_wc / y_train are rebound to the training portion, so re-running this cell
# without first re-running the cell that builds them splits the already-reduced data again.
X_train_wc, X_val_wc, y_train, y_val = train_test_split(X_train_wc, y_train, random_state = 42, test_size = 0.2)

print(f"X_train shape: {X_train_wc.shape}")
print(f"X_val shape: {X_val_wc.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")

## Learning on the Same Architecture

In [None]:
# Build a fresh, untrained model (replacing the previous one) and print its architecture summary.
model = setup_model()
model.summary()

In [None]:
# Train on the word-count features, evaluating on the validation split after every epoch.
history = model.fit(X_train_wc, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_wc, y_val))

In [None]:
# Score the trained model on the validation split and report the accuracy.
# The loss gets a real name instead of `_`: assigning to `_` at the top level
# of a notebook shadows IPython's "previous output" variable.
val_loss, accuracy = model.evaluate(X_val_wc, y_val)
print(f"Validation accuracy: {accuracy:.4f}")

## Learning Curves

In [None]:
# Loss curves for the model trained on the word-count features.
plot_history(history)

## Term Frequency Representation

In [None]:
# Bag-of-words matrices in the tokenizer's 'freq' mode (one row per tweet),
# plus a fresh, un-split label vector from the `target` column.
y_train = train_data['target'].to_numpy().astype(int)
X_train_freq = tokenizer.texts_to_matrix(train_data['prep_text'], mode = 'freq')
X_test_freq = tokenizer.texts_to_matrix(test_data['text'], mode = 'freq')

print(f"X_train shape: {X_train_freq.shape}")
print(f"X_test shape: {X_test_freq.shape}")
print(f"y_train shape: {y_train.shape}")

In [None]:
# Hold out 20% of the rows as a validation set; the fixed random_state makes the split repeatable.
# NOTE: X_train_freq / y_train are rebound to the training portion, so re-running this cell
# without first re-running the cell that builds them splits the already-reduced data again.
X_train_freq, X_val_freq, y_train, y_val = train_test_split(X_train_freq, y_train, test_size = 0.2, random_state = 42)
print(f"X_train shape: {X_train_freq.shape}")
print(f"X_val shape: {X_val_freq.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")

## Training on the same architecture

In [None]:
# Build a fresh, untrained model (replacing the previous one) and print its architecture summary.
model = setup_model()
model.summary()

In [None]:
# Train on the 'freq'-mode features, evaluating on the validation split after every epoch.
history = model.fit(X_train_freq, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_freq, y_val))

## Learning Curves

In [None]:
# Loss curves for the model trained on the 'freq'-mode features.
plot_history(history)

In [None]:
# Preview the training frame, which now also carries the cleaned `prep_text` column.
train_data.head()

## Using TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer # Term Frequency - Inverse Document Frequency

# Fit the TF-IDF vectorizer (at most `vocab_size` features) on the cleaned
# training and test tweets together.
vectorizer = TfidfVectorizer(max_features = vocab_size).fit(
    list(train_data['prep_text']) + list(test_data['text'])
)

# Transform each set separately and densify the sparse result.
X_train_tfidf = vectorizer.transform(list(train_data['prep_text'])).toarray()
X_test_tfidf = vectorizer.transform(list(test_data['text'])).toarray()

# Fresh, un-split label vector.
y_train = train_data['target'].to_numpy().astype(int)

print(f"X_train shape {X_train_tfidf.shape}")
print(f"X_test shape {X_test_tfidf.shape}")
print(f"y_train shape {y_train.shape}")

## Train Validation Split

In [None]:
# Hold out 20% of the rows as a validation set; the fixed random_state makes the split repeatable.
# NOTE: X_train_tfidf / y_train are rebound to the training portion, so re-running this cell
# without first re-running the cell that builds them splits the already-reduced data again.
X_train_tfidf, X_val_tfidf, y_train, y_val = train_test_split(X_train_tfidf, y_train, test_size = 0.2, random_state = 42)
print(f"X_train shape: {X_train_tfidf.shape}")
print(f"X_val shape: {X_val_tfidf.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")

## Training on the same architecture

In [None]:
# Build a fresh, untrained model (replacing the previous one) and print its architecture summary.
model = setup_model()
model.summary()

In [None]:
# Train on the TF-IDF features, evaluating on the validation split after every epoch.
history = model.fit(X_train_tfidf, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_tfidf, y_val))

## Learning Curves

In [None]:
# Loss curves for the model trained on the TF-IDF features.
plot_history(history)

# Using Twitter GloVe embeddings

## Sequence Length Analysis

In [None]:
# Histogram of cleaned-tweet lengths, measured in whitespace-separated tokens.
tweet_lengths = train_data['prep_text'].str.split().str.len()
plt.hist(tweet_lengths)
plt.xlabel('Tokens per cleaned tweet')
plt.ylabel('Number of tweets')
plt.title('Sequence length distribution (training set)')
plt.show()

In [None]:
# Loading the embedding dictionary from file: {word: 100-d float32 vector}

embedding_dict = {}
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open('../input/glovetwitter27b100dtxt/glove.twitter.27B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Split on single spaces only. A bare split() splits on any whitespace
        # character, which would swallow a token that is itself whitespace.
        values = line.rstrip().split(' ')
        # Expect the word followed by exactly 100 components; skip anything
        # else rather than storing a wrong-sized vector under a wrong key.
        if len(values) != 101:
            continue
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors
# No explicit f.close() needed: the `with` block has already closed the file.

In [None]:
# Sequences creation, truncation and padding

from keras.preprocessing.sequence import pad_sequences

# Setting up the tokenizer
# NOTE: this rebinds the module-level `vocab_size` and `tokenizer` that the
# bag-of-words cells used; `setup_model()` reads `vocab_size` when it is called.
vocab_size = 10000
tokenizer = Tokenizer(num_words = vocab_size, oov_token = 'UNK')
tokenizer.fit_on_texts(list(train_data['prep_text']) + list(test_data['text']))

# Convert each cleaned tweet to a list of token indices.
max_len = 15
X_train_seq = tokenizer.texts_to_sequences(train_data['prep_text'])
X_test_seq = tokenizer.texts_to_sequences(test_data['text'])

# Truncate or pad at the end ('post') so every sequence has exactly `max_len` entries.
X_train_seq = pad_sequences(X_train_seq, maxlen = max_len, truncating = 'post', padding = 'post')
X_test_seq = pad_sequences(X_test_seq, maxlen = max_len, truncating = 'post', padding = 'post')
# Fresh, un-split label vector.
y_train = np.array(train_data['target']).astype(int)

print(f"X_train shape: {X_train_seq.shape}")
print(f"X_test shape: {X_test_seq.shape}")
print(f"y_train shape: {y_train.shape}")

## Train Validation Split

In [None]:
# Hold out 20% of the rows as a validation set; the fixed random_state makes the split repeatable.
# NOTE: X_train_seq / y_train are rebound to the training portion, so re-running this cell
# without first re-running the cell that builds them splits the already-reduced data again.
X_train_seq, X_val_seq, y_train, y_val = train_test_split(X_train_seq, y_train, test_size = 0.2, random_state = 42)
print(f"X_train shape: {X_train_seq.shape}")
print(f"X_val shape: {X_val_seq.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")

In [None]:
# Number of rows the embedding table needs. Token indices start at 1 (0 is
# the padding value), and the tokenizer never emits an index at or above its
# `num_words` cap, so the table needs min(cap, vocabulary size + 1) rows.
# Using len(word_index) alone is off by one for small vocabularies and
# allocates unused rows for large ones.
num_words = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
print(f"Number of unique words: {len(tokenizer.word_index)}")
print(f"Embedding rows used: {num_words}")

In [None]:
# Build the initial embedding weights: row i holds the GloVe vector of the
# word whose tokenizer index is i. Rows stay all-zero for words without a
# GloVe entry; words whose index does not fit in the table are skipped.

embedding_matrix = np.zeros((num_words, 100))

for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    emb_vec = embedding_dict.get(word)
    if emb_vec is None:
        continue
    embedding_matrix[i] = emb_vec

## Setting up a model with the embeddings layer

In [None]:
# Classifier on top of the GloVe-initialised (and still trainable) embedding
# layer: embed the `max_len` token indices, flatten, apply dropout, and
# finish with a single sigmoid unit.

n_latent_factors = 100
model_glove = Sequential([
    layers.Embedding(num_words, n_latent_factors, weights = [embedding_matrix],
                     input_length = max_len, trainable=True),
    layers.Flatten(),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid'),
])
model_glove.summary()

In [None]:
# Compile with RMSprop / binary cross-entropy and train with per-epoch validation.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
model_glove.compile(optimizer = optimizers.RMSprop(learning_rate=0.001),
              loss = losses.binary_crossentropy,
              metrics = [metrics.binary_accuracy])

history = model_glove.fit(X_train_seq,
                    y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(X_val_seq, y_val))

## Learning Curves

In [None]:
# Loss curves for the GloVe embedding model.
plot_history(history)

## Training for Submission

In [None]:
# Rebuild the padded sequences from the complete labelled set (no validation
# hold-out) and train a fresh GloVe model on all of it for the submission.
max_len = 15
X_train_seq = tokenizer.texts_to_sequences(train_data['prep_text'])
X_test_seq = tokenizer.texts_to_sequences(test_data['text'])

X_train_seq = pad_sequences(X_train_seq, maxlen = max_len, truncating = 'post', padding = 'post')
X_test_seq = pad_sequences(X_test_seq, maxlen = max_len, truncating = 'post', padding = 'post')
y_train = np.array(train_data['target']).astype(int)

print(f"X_train shape: {X_train_seq.shape}")
print(f"X_test shape: {X_test_seq.shape}")
print(f"y_train shape: {y_train.shape}\n")

# Setting up the model

n_latent_factors = 100
model_glove = Sequential()
model_glove.add(layers.Embedding(num_words, n_latent_factors, weights = [embedding_matrix],
                           input_length = max_len, trainable=True))
model_glove.add(layers.Flatten())
model_glove.add(layers.Dropout(0.5))
model_glove.add(layers.Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in an
# f-string printed a stray "None". Call it directly, then emit the blank line.
model_glove.summary()
print()

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
model_glove.compile(optimizer = optimizers.RMSprop(learning_rate=0.001),
              loss = losses.binary_crossentropy,
              metrics = [metrics.binary_accuracy])

# No validation_data here: every labelled row is used for training.
history = model_glove.fit(X_train_seq,
                    y_train,
                    epochs=20,
                    batch_size=512)

## The effect of text preprocessing

In [None]:
# Baseline without text cleaning: the same word-count model, fed raw tweets.

# `test_data['text']` holds cleaned text by this point in the notebook, so
# read the raw test tweets again. Otherwise the vocabulary would be fitted on
# a mix of raw (train) and cleaned (test) text.
raw_test_text = pd.read_csv("../input/nlp-getting-started/test.csv")['text']

# Setting up the tokenizer
vocab_size = 1000
tokenizer = Tokenizer(num_words = vocab_size, oov_token = 'UNK')
tokenizer.fit_on_texts(list(train_data['text']) + list(raw_test_text))

# Word count representation
X_train_wc = tokenizer.texts_to_matrix(train_data['text'], mode = 'count')
X_test_wc = tokenizer.texts_to_matrix(raw_test_text, mode = 'count')
y_train = np.array(train_data['target']).astype(int)

print(f"X_train shape: {X_train_wc.shape}")
print(f"X_test shape: {X_test_wc.shape}")
print(f"y_train shape: {y_train.shape}")

# Train Validation Split
X_train_wc, X_val_wc, y_train, y_val = train_test_split(X_train_wc, y_train, test_size = 0.2, random_state = 42)

print(f"X_train shape: {X_train_wc.shape}")
print(f"X_val shape: {X_val_wc.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}\n")

# Setting up the model
model = setup_model()

# Fitting the model on un-preprocessed text
history = model.fit(X_train_wc, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_wc, y_val))

It turns out that the model overfits on the un-preprocessed text because of noise such as stopwords, punctuation, and un-lemmatized word forms.

# Final Submission

In [None]:
# Predict the test set with the GloVe model trained on all labelled data and
# write the submission file.
submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
test_pred = model_glove.predict(X_test_seq)
# The model has a single output unit, so predictions come back as a 2-D
# (n_samples, 1) array. Round the probabilities to 0/1 and flatten to 1-D so
# a plain label vector is assigned to the column.
test_pred_int = test_pred.round().astype('int').ravel()
assert len(test_pred_int) == len(submission), "prediction count does not match the submission template"
submission['target'] = test_pred_int
submission.to_csv('submission.csv', index=False)