# Import Modules

In [None]:
import numpy as np
import pandas as pd
# Show full cell contents. None means "no truncation"; the old -1 sentinel
# was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option("display.max_colwidth", None)
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import nltk
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize

import tensorflow as tf
import keras
from keras import preprocessing

from sklearn.model_selection import KFold

import string
import time
import os ,re
import functools
from collections import Counter

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load Dataset

In [None]:
train_tweet = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_tweet = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# I prefer indexing train and test with dictionary
# rather than define train and tweet one by one
df_map = {"train": train_tweet, "test": test_tweet}

# Exploratory Data Analysis (EDA)

### Get Dataset Information

In [None]:
def get_df_info(df_map):
    """Print the ``DataFrame.info()`` summary of every frame in ``df_map``.

    Parameters
    ----------
    df_map : dict
        Maps a label (e.g. "train") to a pandas DataFrame.
    """
    for key, df in df_map.items():
        print("{} dataframe info: ".format(key))
        print("-" * 30)
        # info() writes its report itself and returns None, so wrapping it in
        # print() would emit a stray "None" line after the report.
        df.info()
        print()
        
        
def check_null_values(df_map):
    """Print the per-column count of missing values for each frame in ``df_map``."""
    divider = "-" * 30
    for label in df_map:
        null_counts = df_map[label].isnull().sum(axis=0)
        print("{} dataframe missing values: ".format(label))
        print(divider)
        print(null_counts, end="\n\n")

In [None]:
get_df_info(df_map)

In [None]:
check_null_values(df_map)

There are no missing values in the text column, so we're good to go.

### Class Distribution

In [None]:
def plot_class_count(train):
    """Draw a bar chart of how many rows of ``train`` fall in each ``target`` class."""
    figure, axis = plt.subplots(figsize=(5, 5))
    figure.suptitle("Class count")
    sns.countplot(data=train, x="target", ax=axis)
        
    
plot_class_count(train_tweet)

### Common Words

In [None]:
def word_list(sentences):
    """Tokenize every sentence and return all tokens, lowercased, as one flat list."""
    tokens = []
    for sentence in sentences:
        tokens.extend(token.lower() for token in word_tokenize(sentence))
    return tokens

def create_corpus(train_tweet):
    """Split the tweets by ``target`` class and flatten each class into a token list.

    Returns a ``(class_0_corpus, class_1_corpus)`` tuple, each built by
    ``word_list`` from the ``text`` column.
    """
    frame = train_tweet.copy()

    # pandas Series of tweets per class => flat list of words
    class_0_text = frame.query("target == 0")["text"]
    class_1_text = frame.query("target == 1")["text"]

    return word_list(class_0_text), word_list(class_1_text)

In [None]:
# Define corpus, punc_list and stopwords
class_0_corpus, class_1_corpus = create_corpus(train_tweet)
punc_list = list(string.punctuation)
# Keep the stopwords as a set: they are only used for membership tests
# (`word in stop`), which are constant-time on a set but a linear scan on a list.
stop = set(stop)

In order to plot the common words, I will use a **decorator**. A **decorator** extends the behavior of a function we have defined without explicitly modifying it. Bear with me: it may look complicated, but the concept is simple.

Note: (Skip to the plotting section if you already understand Decorator)

### Decorator Mini Guide
Let's say you want to measure how long your function takes to run. You would do the following...

In [None]:
def sum_all():
    """Add up the integers 0..99, then print the elapsed time and the total."""
    started_at = time.time()

    total = sum(range(100))

    finished_at = time.time()
    print("Elapsed time: {:4f} sec".format(finished_at - started_at))
    print("The sum is {}".format(total))


sum_all()

Quite simple right? What if, you want to measure the execution time on another function?

In [None]:
def mul_all():
    """Multiply the integers 1..99 together, then print the elapsed time and the product."""
    start = time.time()

    # A product has to start from 1 (the multiplicative identity) and skip 0:
    # starting at 0, or including 0 in the range, makes the result always 0.
    result = 1
    for num in range(1, 100):
        result *= num

    end = time.time()
    print("Elapsed time: {:4f} sec".format(end - start))
    print("The product is {}".format(result))
    
mul_all()

You must copy-paste the start and end timing code into the new function. Imagine doing that for another 10 functions. A waste of time, isn't it? This is where a **decorator** shines. Here is how you create a simple decorator.

Step 1. Build Wrapper Function

In [None]:
# Create the wrapper function first,
# :parameter: func here is our base function like: sum_all and mul_all
def simple_decorator(func):
    """Wrap ``func`` so that each call prints its name, elapsed time and result.

    Positional and keyword arguments are forwarded to ``func`` unchanged, and
    the wrapper returns whatever ``func`` returns.
    """

    # 2. Our function will go inside here.
    # functools.wraps copies __name__ / __doc__ from func onto the wrapper, so
    # the decorated function still identifies itself as the original.
    @functools.wraps(func)
    def decorated(*args, **kwargs):
        start = time.time()             # 3. Timer start
        result = func(*args, **kwargs)  # 4. Function executed
        end = time.time()               # 5. Timer end

        print("{} function".format(func.__name__))
        print("Elapsed time: {:4f} sec".format(end - start))
        print("The result is {}".format(result))
        print()

        return result  # 6. (Optional) Return the function result here

    # 1. This function will be called first, then
    return decorated



Step 2. Add @wrapper_function on top of Base Function

In [None]:
@simple_decorator
def sum_all():
    """Return the sum of the integers 0..99."""
    return sum(range(100))

@simple_decorator
def mul_all():
    """Return the product of the integers 1..99."""
    # Start from 1 and skip 0; with a 0 start (or a 0 factor) the product
    # would always be 0.
    result = 1
    for x in range(1, 100):
        result *= x
    return result

Step 3. Execute

In [None]:
mul_all()
sum_all()

### End of Decorator Guide

Let's do it for plotting common words

In [None]:
def plot_common_words(func):
    """Decorator: bar-plot the 10 most frequent words selected by ``func``.

    ``func(corpus)`` must return a list of words. The decorated function is
    called as ``decorated(corpus, name="dataset")``; it draws the plot, titled
    with the name of ``func`` and ``name``, and returns ``None``.
    """

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def decorated(corpus, name="dataset"):
        # Get the filtered words from the decorated function
        words = func(corpus)

        # Count the words and keep the 10 most frequent
        top_10_words = Counter(words).most_common(10)

        # Nothing passed the filter: unpacking zip(*[]) below would raise,
        # so report it and skip the plot instead.
        if not top_10_words:
            print("{} in {}: no words to plot".format(func.__name__, name))
            return

        # Plot the result
        plt.figure(figsize=(8, 5))
        plt.suptitle("{} in {}".format(func.__name__, name))

        labels, counts = (list(column) for column in zip(*top_10_words))

        bplot = sns.barplot(x=labels, y=counts)
        bplot.set_xticklabels(labels=labels, rotation=30)

        # Return nothing here, because we just want the plot
        return

    return decorated


@plot_common_words
def common_words(corpus):
    """Keep the purely alphabetic tokens that are not stopwords."""
    def is_content_word(token):
        return token not in stop and token.isalpha()

    return list(filter(is_content_word, corpus))

@plot_common_words
def common_puncts(corpus):
    """Keep the tokens made up entirely of ASCII punctuation characters."""
    # re.escape is required: string.punctuation contains `\`, `]`, `^` and `-`,
    # which are special inside a character class. Unescaped, the `\` escapes
    # the `]` that follows it and the backslash itself is never matched.
    punc_regex = re.compile(r"[{}]+".format(re.escape(string.punctuation)))
    word_list = [word for word in corpus if punc_regex.fullmatch(word)]
    return word_list

@plot_common_words
def common_nonalpha(corpus):
    """Keep the mixed tokens: not pure punctuation, not alphabetic, not digits."""
    # re.escape keeps `\`, `]`, `^` and `-` literal inside the character class;
    # without it backslash-only tokens are not recognised as punctuation.
    punc_regex = re.compile(r"[{}]+".format(re.escape(string.punctuation)))
    word_list = [word for word in corpus
                      if not punc_regex.fullmatch(word) and
                         not word.isalpha() and
                         not word.isdigit()]
    return word_list

@plot_common_words
def common_stops(corpus):
    """Keep only the tokens that are stopwords."""
    stopword_tokens = []
    for token in corpus:
        if token in stop:
            stopword_tokens.append(token)
    return stopword_tokens

In [None]:
common_words(class_0_corpus, name="Class 0 Train")
common_words(class_1_corpus, name="Class 1 Train")

In [None]:
common_puncts(class_0_corpus, name="Class 0 Train")
common_puncts(class_1_corpus, name="Class 1 Train")

In [None]:
common_nonalpha(class_0_corpus, name="Class 0 Train")
common_nonalpha(class_1_corpus, name="Class 1 Train")

In [None]:
common_stops(class_0_corpus, name="Class 0 Train")
common_stops(class_1_corpus, name="Class 1 Train")

# Noise Removal

### Common Noises
Remove common noises found on the above EDA

In [None]:
# Regular expressions for the kinds of noise to look for in (and strip from)
# the tweets. The keys are only used as labels in the noise_check report.
noise_patterns = { 
        # "http"/"https" followed by "://..." or any non-space run, or a "www." prefix
        "url"            : r"https?(://\S+|\S+)|www\.\S+",
        # shortest "<...>" span (non-greedy)
        "html_tag"       : r"<.*?>",          
        # any run of characters outside the 7-bit ASCII range
        "non_ascii"      : r"[^\x00-\x7f]+",
        "RT word"        : r"\b[Rr][Tt]\b",  # RT commonly appears in retweeted tweets
        # NOTE(review): "amp" is presumably what remains of the HTML entity
        # "&amp;" once "&" and ";" are split off as punctuation - confirm.
        "amp word"       : r"\bamp\b",
}


def noise_check(df_map, patterns):
    """Report, per DataFrame, how many ``text`` rows match each noise pattern."""
    row_template = "There were {:5} rows with {}"

    for key, df in df_map.items():
        print("{} DataFrame noises: ".format(key))
        print("-" * 30)

        for indicator in patterns:
            matches = df["text"].str.contains(patterns[indicator])
            print(row_template.format(matches.sum(), indicator))

        print()
    

noise_check(df_map, noise_patterns)

In [None]:
# Wrap a pattern in one pair of parentheses
def add_paren(text):
    """Return ``text`` surrounded by "(" and ")"."""
    wrapped = "({})".format(text)
    return wrapped

# Every noise pattern will be inserted into parentheses, then
# join all noise patterns with '|' 
noises_all = "|".join(add_paren(pattern) for pattern in noise_patterns.values())

# Result is in format: (pattern1)|(pattern2)|(pattern3)|(pattern4)
print(noises_all)

In [None]:
# Clear all noises simultaneously
def clean_noise(text):
    """Replace every match of the combined noise pattern with a single space."""
    combined_pattern = re.compile(noises_all)
    return combined_pattern.sub(r" ", text)

train_tweet["text"] = train_tweet["text"].apply(clean_noise)
test_tweet["text"] = test_tweet["text"].apply(clean_noise)

noise_check(df_map, noise_patterns)

# Unique Pattern Investigation
To handle these unique patterns, we first need to identify what they are. This section is still experimental and can be skipped.

In [None]:
def random_sampling(df_map, n=5):
    """Print ``n`` randomly drawn ``text`` values from each frame in ``df_map``."""
    divider = "-" * 30
    for key in df_map:
        print("{} sample from {} dataset".format(n, key))
        print(divider)
        print(df_map[key]["text"].sample(n))
        print()
        
random_sampling(df_map)

# Text Normalization

### Lowercase all word
By using lower(), we have normalized the text to lowercase so that the distinction between The and the is ignored.

reference: https://www.nltk.org/book/ch03.html

In [None]:
word1 = "The"
word2 = "the"

print(word1.lower() == word2)

### Unravel Apostrophe Words

reference: https://stackoverflow.com/questions/43018030/replace-apostrophe-short-words-in-python

In [None]:
def decontracted(phrase):
    """Expand English contractions in ``phrase`` (e.g. "I'm" -> "I am").

    The substitutions are applied in the listed order: the two irregular
    forms first, then the generic suffix rules.
    """
    substitutions = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

# testing
print( decontracted("I'm Lord of Darkness") ) 
print( decontracted("Who've made this burger?") )

### Remove Punctuations
I didn't place this section in noise removal because I need to unravel the apostrophe words before removing the punctuation. In this function I remove commas (replace **","** with an empty string) and replace the rest of the punctuation with whitespace.

In [None]:
def punct2espace(text): # comma to empty string
    """Delete every comma from ``text``."""
    return text.replace(",", "")

def punct2wspace(text): # punc to white space
    """Replace each run of ASCII punctuation in ``text`` with one space."""
    # re.escape keeps `\`, `]`, `^` and `-` literal inside the character class.
    # Without it the `\` is consumed as an escape for the following `]`, so
    # backslashes in the text are never replaced.
    punct_run = r"[{}]+".format(re.escape(string.punctuation))
    return re.sub(punct_run, " ", text)

def residual_punc(text): # remove remaining backward slash
    """Delete every ASCII punctuation character from ``text``."""
    return "".join(char for char in text if char not in string.punctuation)

def normalize_wspace(text): # normalize multiple whitespace
    """Collapse every run of whitespace in ``text`` into a single space."""
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)

def replace_punctuations(text):
    """Run the punctuation clean-up steps in order and trim the result."""
    steps = (punct2espace, punct2wspace, residual_punc, normalize_wspace)
    for step in steps:
        text = step(text)
    return text.strip()


replace_punctuations(r"@@rakka@@ alhazimi@@hai typhoon--devastation \\\\\conclusively")

### Correct Misspellings

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker


spell = SpellChecker()
def correct_spellings(text):
    """Return ``text`` with each unknown word replaced by its suggested correction.

    The text is split on whitespace and re-joined with single spaces. A word
    is left untouched when the spell checker does not report it as unknown or
    has no correction to offer.
    """
    words = text.split()  # split once and reuse for both passes
    misspelled_words = spell.unknown(words)

    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() may return None when no candidate exists (newer
            # pyspellchecker releases); fall back to the original word so the
            # join below never receives a None.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
text = "speling correctin"
correct_spellings(text)

### Lemmatization
Text lemmatization is the process of eliminating redundant prefix or suffix of a word and extract the base word (lemma).

reference:

https://medium.com/text-classification-algorithms/text-classification-algorithms-a-survey-a215b7ab7e2d
https://www.kaggle.com/sudalairajkumar/getting-started-with-text-preprocessing#Removal-of-Frequent-words

In [None]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(text):
    """Lemmatize every whitespace-separated word of ``text`` using its POS tag."""
    lemmas = []
    for word, pos in nltk.pos_tag(text.split()):
        # The first letter of the tag selects the WordNet category; noun is the default
        wordnet_pos = wordnet_map.get(pos[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return " ".join(lemmas)

lemmatize_words("rakka alhazimi are cool")

### Wrap all function into one.

In [None]:
def normalize_text(text):
    """Apply the full normalization pipeline to one tweet and return the result."""
    lowered = text.lower()                          # lowercase
    expanded = decontracted(lowered)                # unravel apostrophe words
    unpunctuated = replace_punctuations(expanded)   # remove punctuations
    return lemmatize_words(unpunctuated)            # lemmatize word

normalize_text("Hello andrew, I'm from kuvukiland nuce to meet you")

In [None]:
!pip install pandarallel # Use pandarallel for fast apply

In [None]:
from pandarallel import pandarallel

pandarallel.initialize()

start = time.time()
train_tweet["text"] = train_tweet["text"].parallel_apply(normalize_text)
test_tweet["text"] = test_tweet["text"].parallel_apply(normalize_text)

elapsed = time.time() - start
print("Elapsed time: {:.4f} min".format(elapsed / 60))

### Save to CSV

In [None]:
train_tweet.to_csv("train_tweet.csv")
test_tweet.to_csv("test_tweet.csv")

In [None]:
train_tweet[["text", "target"]].head(10)

# How many Unknown Words?

In [None]:
english_vocab = nltk.corpus.words.words("en")
english_vocab = set(english_vocab)

corpus_0, corpus_1 = create_corpus(train_tweet)
corpus = corpus_0 + corpus_1

tweet_words = set(corpus)

In [None]:
unknown_words = list(tweet_words.difference(english_vocab))
print("There were {} unknown words. ".format(len(unknown_words)))

# Text Representation

### Convert text into array of integer

In [None]:
MAX_LEN = 50  # max sentence length

def tokenize(df_map):
    """Turn the tweets of every frame in ``df_map`` into padded integer sequences.

    The frames are processed in the dict's iteration order. The LAST frame is
    treated as the test set and everything before it as the training set.
    Tokens consisting only of digits are dropped before indexing, and every
    sequence is padded (at the end) to ``MAX_LEN``.

    Returns
    -------
    tuple
        ``(input_tensor_train, input_tensor_test, tokenizer)``
    """
    sentence_list = []
    frame_lengths = []
    for df in df_map.values():
        # From sentence -> filter(word) -> sentence; what we filter is: digits
        sentences = [" ".join(word for word in sen.split() if not word.isdigit())
                     for sen in df["text"]]
        frame_lengths.append(len(sentences))
        sentence_list += sentences

    # Index all train and test tweets words
    tokenizer = preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(sentence_list)

    # Convert text into sequences of integer
    tensor = tokenizer.texts_to_sequences(sentence_list)
    tensor = preprocessing.sequence.pad_sequences(tensor, padding="post", maxlen=MAX_LEN)

    # Split train and test tweets at an explicit, non-negative offset.
    # Slicing with a negated length (tensor[:-length]) breaks when the last
    # frame is empty, because -0 == 0 selects nothing for train.
    n_test = frame_lengths[-1] if frame_lengths else 0
    n_train = len(sentence_list) - n_test
    input_tensor_train, input_tensor_test = tensor[:n_train], tensor[n_train:]

    return input_tensor_train, input_tensor_test, tokenizer


# Input tensor train/test and tokenizer
X_train, X_test, tokenizer = tokenize(df_map)

# Target tensor train
y_train = train_tweet["target"].values
y_train = y_train.reshape(-1, 1)

# Word Embedding

### Fasttext

In [None]:
# Fasttext
with open("../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec", "r") as vector:
    fasttext = vector.readlines()
    
len(fasttext)

### GloVe

In [None]:
# GloVe
with open("../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt", "r") as vector:
    glove = vector.readlines()
    
len(glove)

### Transform Embedding into Dict

In [None]:
def embed2dict(tokenizer, embedding):
    """Build ``{word: vector}`` for the embedding lines whose word the tokenizer knows.

    Parameters
    ----------
    tokenizer : object
        Must expose ``word_index``, a mapping from word to integer index.
    embedding : list of str
        Lines of a text embedding file, each ``word v1 v2 ...``. A leading
        ``<count> <dim>`` header line (two integers) is skipped if present.

    Returns
    -------
    dict
        Maps each known word to the list of its vector components, kept as
        the strings read from the file.
    """
    word_index = tokenizer.word_index

    # Skip the first line only when it really is a "<count> <dim>" header.
    # Skipping it unconditionally drops the first word of header-less files.
    start = 0
    if embedding:
        first_fields = embedding[0].split()
        if len(first_fields) == 2 and all(field.isdigit() for field in first_fields):
            start = 1

    embedding_dict = {}
    for line in embedding[start:]:
        fields = line.split()  # split once per line instead of three times
        if fields and word_index.get(fields[0]):
            embedding_dict[fields[0]] = fields[1:]
    return embedding_dict
    
fasttext_dict = embed2dict(tokenizer, fasttext)
glove_dict = embed2dict(tokenizer, glove)

### Similar Words between Two Words Embedding
The vocabularies of fastText and GloVe are not identical. We need to find the words they have in common by intersecting the two key sets with Python's set() data structure.

In [None]:
fasttext_word = set(fasttext_dict.keys())
glove_word = set(glove_dict.keys())

intersection_word = fasttext_word.intersection(glove_word)

print("Similar embedding words:", len(intersection_word))

Let's update the dictionaries

In [None]:
# Keep only the words shared by both embeddings
def update_embed_dict(embed_dict):
    """Return a new dict with the entries of ``embed_dict`` whose word is in ``intersection_word``."""
    shared = {}
    for word in embed_dict:
        if word in intersection_word:
            shared[word] = embed_dict[word]
    return shared

fasttext_dict = update_embed_dict(fasttext_dict)
glove_dict = update_embed_dict(glove_dict)

### Change Embedding Dict into Vector

In [None]:
fasttext_dim = 300
glove_dim = 200

num_words = len(tokenizer.word_index) + 1 # word_index starts at 1

def dict2vector(tokenizer, embed_dict, embed_dim):
    """Build the embedding matrix for the tokenizer's vocabulary.

    Row ``i`` receives the vector of the word whose tokenizer index is ``i``;
    rows of words missing from ``embed_dict`` stay all-zero.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_words, embed_dim)``.
    """
    vector = np.zeros(shape=(num_words, embed_dim))

    for word, index in tokenizer.word_index.items():
        # Valid rows are 0 .. num_words - 1, so the guard has to be >=;
        # with > an index equal to num_words would raise IndexError below.
        if index >= num_words:
            continue

        word_vector = embed_dict.get(word)  # look the word up only once
        if word_vector:
            vector[index] = word_vector

    return vector
    
fasttext_vector = dict2vector(tokenizer, fasttext_dict, fasttext_dim)
glove_vector = dict2vector(tokenizer, glove_dict, glove_dim)

### Concatenate Two Embedding Vectors into One

In [None]:
def concat_vector(vectors):
    """Stack the given embedding matrices horizontally into one matrix."""
    # for the two matrices used here: (num_words, fasttext_dim + glove_dim)
    return np.hstack(vectors)

embedding_vector = concat_vector([fasttext_vector, glove_vector])

In [None]:
embedding_vector.shape

# Define Parameter

In [None]:
BATCH_SIZE = 32
EPOCHS = 8

embedding_dim = embedding_vector.shape[1]                                               

# Build NN Model

The model architecture is Bidirectional LSTM with MaxPooling2D.
1. Embeddings
2. BLSTM
3. Reshape into Image dim like
4. Conv2D
5. GlobalMaxPooling2D
6. Dropout
7. Dense

reference: https://arxiv.org/abs/1611.06639

In [None]:
def build_model(emb_init=None, emb_train=True):
    """Build and compile the Embedding -> BiLSTM -> Conv2D binary classifier.

    Parameters
    ----------
    emb_init : array-like or None
        Initial weights of the embedding layer. When None, the module-level
        ``embedding_vector`` is used.
    emb_train : bool
        Whether the embedding weights are trainable.

    Returns
    -------
    A compiled ``keras.Sequential`` model (Adam optimizer, binary
    cross-entropy loss, "acc" metric).

    Reads the module-level ``num_words``, ``embedding_dim`` and
    ``embedding_vector``.
    """
    
    embed_params = {"input_dim": num_words, 
                    "output_dim": embedding_dim,
                    "embeddings_initializer": keras.initializers.Constant(
                        embedding_vector if emb_init is None else emb_init),
                    "trainable": emb_train,
                    "mask_zero": True, # ignore "zero" paddings
                   }
    
    model = keras.Sequential([
                keras.layers.Embedding(**embed_params),

                keras.layers.Bidirectional(keras.layers.LSTM(128, 
                                                             dropout=0.3,
                                                             recurrent_dropout=0.3,
                                                             return_sequences=True
                                                            )),
                # Reshape size (x, y, hidden) where x * y = MAX_LEN
                # NOTE(review): (5, 10, 256) is hard-coded; 5 * 10 presumably
                # has to equal MAX_LEN and 256 the BiLSTM output width
                # (2 x 128) - update it if either changes.
                keras.layers.Reshape((5, 10, 256)),
                keras.layers.Conv2D(32, 2),
                keras.layers.GlobalMaxPooling2D(),
                keras.layers.Dropout(0.3),
                # single sigmoid unit: one probability per input sequence
                keras.layers.Dense(1, activation="sigmoid")])

    optimizer = keras.optimizers.Adam(1e-4, clipvalue=0.5)

    # The metric is registered under the name "acc"; code that reads the
    # training history or monitors "val_acc" depends on this exact name.
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["acc"])

    return model

# Train OOV Words Embedding

In [None]:
test_model = build_model(emb_train=True)
test_model.summary()

In [None]:
test_model.fit(X_train, y_train,
              epochs=10,
              batch_size=BATCH_SIZE,
              validation_split=0.2,)

In [None]:
oov_embedding = test_model.get_weights()[0]
oov_embedding

# Insert OOV vector into Original Embeddings

In [None]:
def merge_embeddings(oov_embed, embed_vector):
    """Fill the all-zero rows of ``embed_vector`` with the matching rows of ``oov_embed``.

    Parameters
    ----------
    oov_embed : numpy.ndarray
        Replacement vectors, indexed by the same rows as ``embed_vector``.
    embed_vector : numpy.ndarray
        2-D embedding matrix in which all-zero rows mark words with no
        pretrained vector.

    Returns
    -------
    numpy.ndarray
        A new matrix; neither input is modified.
    """
    new_vector = embed_vector.copy()

    # A row is "missing" only when every component is zero. Testing
    # row.sum() == 0 would also overwrite real vectors whose components
    # happen to cancel out (e.g. [1, -1]).
    missing_rows = ~embed_vector.any(axis=1)
    new_vector[missing_rows] = np.asarray(oov_embed)[missing_rows]

    return new_vector

new_embedding = merge_embeddings(oov_embedding, embedding_vector)

# Kfold 5

In [None]:
# 5-fold cross-validation: train one model per fold, keep each fold's best
# checkpoint on disk, and average the checkpoints' test predictions.
n_splits = 5
splits = list(KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X_train,y_train))
model_record = {}  # checkpoint file name -> best validation accuracy of that fold
predictions = np.zeros((X_test.shape[0], 1))  # running sum of the per-fold test predictions

for n, fold in enumerate(splits):
    
    # Current Fold Status
    print()
    print("Fold {}".format(n + 1))
    
    # Define Model Name
    model_fold = "lstm_fold0{}.h5".format(n + 1)
    
    # Callback List
    # Only the epoch with the highest val_acc is kept in the checkpoint file
    model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=model_fold,
                                                       monitor="val_acc",
                                                       save_best_only=True,)
    callbacks_list = [model_checkpoint]
    
    # Split data into train and val using fold index
    # fold[0] holds the training row indices, fold[1] the validation row indices
    X_train_fold, y_train_fold = X_train[fold[0]], y_train[fold[0]]
    X_val_fold, y_val_fold = X_train[fold[1]], y_train[fold[1]]
    
    # Build model
    model = build_model(emb_init=new_embedding, emb_train=False) # Embedding is not trainable
    
    history = model.fit(X_train_fold, y_train_fold,
                          batch_size=BATCH_SIZE,
                          epochs=EPOCHS,
                          validation_data=(X_val_fold, y_val_fold),
                          callbacks=callbacks_list,)

    # Record model with best val_accuracy    
    model_record[model_fold] = max(history.history["val_acc"])
    
    # Use the best model
    # (reload the checkpoint rather than keep the last-epoch weights)
    model = keras.models.load_model(model_fold)
    
    predictions += model.predict(X_test)

# Turn the running sum into the mean prediction over all folds
predictions /= n_splits

# Submission

In [None]:
model_record

In [None]:
top_acc = max(model_record.keys(), key=model_record.get)
top_acc

In [None]:
# Alternative (ensemble of all folds, using the averaged `predictions`):
# class_prediction = np.where(predictions > 0.5, 1, 0)
best_model = keras.models.load_model(top_acc)
# Sequential.predict_classes() was removed from recent Keras/TensorFlow
# releases; for a single sigmoid output, thresholding predict() at 0.5
# gives the same class labels.
class_prediction = (best_model.predict(X_test) > 0.5).astype("int32")
submission = pd.DataFrame({"id": test_tweet["id"],
                           "target": class_prediction.flatten()})

submission.to_csv("submission.csv", index=False)

In [None]:
submission