In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Import relevant libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from textblob import TextBlob, Word
import collections
import re
import string
import emoji
import time

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

import tensorflow as tf
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# Preview the first rows of the training frame.
train_df.head()

# 2. Exploratory Data Analysis

In [None]:
# (rows, columns) of the training frame.
train_df.shape

## 2.1 Class distribution

In [None]:
# Absolute number of tweets per value of the `target` column.
train_df["target"].value_counts()

In [None]:
# Same breakdown, expressed as a fraction of all rows.
train_df["target"].value_counts(normalize=True)

In [None]:
# Bar chart with one bar per value of `target`.
class_count_kwargs = {"x": "target", "data": train_df, "palette": "Set1"}
countplot = sns.countplot(**class_count_kwargs)
countplot.set_title(label="Real disaster tweets count")

In [None]:
# Pie chart of the class balance.
# value_counts() orders rows by frequency, not by class value, so labels and colours are
# looked up per class after sorting by index instead of being matched positionally.
class_names = {0: "Non-Disaster", 1: "Disaster"}
class_colors = {0: "Blue", 1: "Red"}
target_counts = train_df['target'].value_counts().sort_index()
my_labels = [class_names[label] for label in target_counts.index]
plt.pie(
    target_counts,
    labels=my_labels,
    colors=[class_colors[label] for label in target_counts.index],
)
plt.legend()
plt.show()

In [None]:
# Drop the columns that are not used further down.
# drop() returns a new frame (no inplace mutation) and errors="ignore" keeps the cell
# idempotent: re-running it no longer raises KeyError for already-removed columns.
train_df = train_df.drop(columns=['id', 'keyword', 'location'], errors='ignore')
test_df = test_df.drop(columns=['keyword', 'location'], errors='ignore')

## 2.2 Wordclouds

### 2.2.1 Wordcloud for real disaster tweets

In [None]:
# Word cloud over the raw text of rows with target == 1.
# NOTE: `ax` actually holds the Figure returned by plt.figure().
ax = plt.figure(figsize=(20, 20))
raw_disaster_text = " ".join(train_df.loc[train_df["target"] == 1, "text"])
wordcloud = WordCloud(stopwords=STOPWORDS, max_words=500, width=1000, height=500)
wordcloud = wordcloud.generate(raw_disaster_text)
plt.imshow(wordcloud, interpolation='bilinear')

### 2.2.2 Wordcloud for non-disaster tweets

In [None]:
# Word cloud over the raw text of rows with target == 0.
# NOTE: `ax` actually holds the Figure returned by plt.figure().
ax = plt.figure(figsize=(20, 20))
raw_non_disaster_text = " ".join(train_df.loc[train_df["target"] == 0, "text"])
wordcloud = WordCloud(stopwords=STOPWORDS, max_words=500, width=1000, height=500)
wordcloud = wordcloud.generate(raw_non_disaster_text)
plt.imshow(wordcloud, interpolation='bilinear')

As we can see, there are many noisy words that should not be indicative of a disaster. Let's clean this up.

# 3. Preprocessing

## 3.1 Defining variables

In [None]:
# NLTK's English stop-word list, stored as a set.
stop_words = set(stopwords.words('english'))
# Maps an English contraction to its expanded form; consumed by expand_abbr().
# Both "I..." and "i..." spellings are listed for first-person forms.
contraction_map = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",  # was "he he will have" (duplicated word)
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have",
}

# Tokens treated as punctuation by remove_punctuation(): every ASCII punctuation
# character plus a few extra symbols (some of which string.punctuation already covers).
all_punctuation = set(string.punctuation) | {"...", '’', '-', '“', '[', ']', ' '}

Let's create some functions so it'll be easier to implement them down the line

## 3.2 Helper functions

### 3.2.1 Lower-casing text

In [None]:
def uncapitalize(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

### 3.2.2 Remove emojis

In [None]:
def removeEmojis(text):
    """Drop every whitespace-separated token that contains an emoji character.

    The remaining tokens are re-joined with single spaces, so runs of whitespace
    collapse as a side effect.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` without the emoji-bearing tokens.
    """
    # emoji >= 2.0 removed the UNICODE_EMOJI table; prefer is_emoji() and only fall back
    # to the legacy table on old releases that lack it.
    if hasattr(emoji, "is_emoji"):
        is_emoji_char = emoji.is_emoji
    else:
        legacy_table = emoji.UNICODE_EMOJI["en"]

        def is_emoji_char(ch):
            return ch in legacy_table

    kept_tokens = [
        token for token in text.split()
        if not any(is_emoji_char(ch) for ch in token)
    ]
    return ' '.join(kept_tokens)

### 3.2.3 Expand contractions

In [None]:
def expand_abbr(article):
    """Expand every contraction listed in ``contraction_map`` that occurs in ``article``.

    Parameters
    ----------
    article : str
        Text to expand. Matching is by plain substring and is case-sensitive.

    Returns
    -------
    str
        The text with all known contractions replaced by their expansions.
    """
    new_article = article
    # Longest keys first so compound forms ("can't've") are handled before the shorter
    # contraction they start with ("can't").
    for item in sorted(contraction_map, key=len, reverse=True):
        if item in new_article:
            # Replace on the running result; replacing on the untouched input would
            # discard every expansion except the last one.
            new_article = new_article.replace(item, contraction_map[item])
    return new_article

### 3.2.4 Remove website urls

In [None]:
# Compiled once at definition time. Raw string avoids invalid-escape warnings for \w, \d, \+.
# Inside the character class, `\+-=` is a range ('+' through '='), so digits and
# , - . / : ; < are accepted as URL characters as well.
_LINK_REGEX = re.compile(
    r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)',
    re.DOTALL,
)


def strip_links(text):
    """Replace every http(s) URL in ``text`` with ``', '``.

    Substitution is done per regex match. Replacing each matched string with
    ``str.replace`` would also hit longer URLs that merely start with a shorter
    matched URL and leave their tails behind.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with each URL replaced by ``', '``.
    """
    return _LINK_REGEX.sub(', ', text)

### 3.2.5 Stripping all entities

In [None]:
def strip_all_entities(text):
    """Remove @mentions / #hashtags and blank out all other punctuation.

    Every ``string.punctuation`` character except ``@`` and ``#`` becomes a space;
    afterwards any whitespace-separated word that starts with ``@`` or ``#`` is
    dropped and the rest is joined with single spaces.
    """
    entity_prefixes = ('@', '#')
    to_space = {ord(ch): ' ' for ch in string.punctuation if ch not in entity_prefixes}
    spaced_text = text.translate(to_space)
    kept_words = [
        token for token in spaced_text.split()
        if not token.startswith(entity_prefixes)
    ]
    return ' '.join(kept_words)

### 3.2.6 Lemmatization

In [None]:
def lemmatize_with_postag(text):
    """Lemmatize each word of ``text`` using a part-of-speech hint.

    The first letter of every tag from ``TextBlob(text).tags`` is mapped to
    'a' / 'n' / 'v' / 'r' (anything else falls back to 'n') and passed to the
    word's ``lemmatize``; the lemmas are returned joined by single spaces.
    """
    pos_lookup = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
    lemmas = []
    for word, pos_tag in TextBlob(text).tags:
        lemma_pos = pos_lookup.get(pos_tag[0], 'n')
        lemmas.append(word.lemmatize(lemma_pos))
    return " ".join(lemmas)

### 3.2.7 Remove stopwords

In [None]:
def remove_stopwords(text):
    """Tokenize ``text`` and return the tokens that are not English stop words.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    list of str
        Tokens from ``word_tokenize(text)`` that are absent from ``stop_words``.
    """
    # Uses the module-level `stop_words` set built in section 3.1 rather than reloading
    # the NLTK stop-word corpus on every call (this function runs once per tweet).
    return [token for token in word_tokenize(text) if token not in stop_words]

### 3.2.8 Remove punctuation

In [None]:
def remove_punctuation(token_list):
    """Join the tokens that are neither punctuation nor plain integers.

    Parameters
    ----------
    token_list : iterable of str
        Tokens, e.g. the output of ``remove_stopwords``.

    Returns
    -------
    str
        Surviving tokens joined by single spaces. Tokens found in
        ``all_punctuation``, empty tokens, and tokens that are all digits
        (optionally preceded by one '-') are discarded.
    """
    kept = []
    for tok in token_list:
        # Empty tokens carry no content; skipping them also avoids indexing into ''.
        if not tok or tok in all_punctuation:
            continue
        if tok.isdigit() or (tok.startswith('-') and tok[1:].isdigit()):
            continue
        kept.append(tok)
    return " ".join(kept)

## 3.3 Cleaning text

In [None]:
def clean_text(text):
    """Run one tweet through the full cleaning pipeline and return the cleaned string.

    The string-to-string helpers are applied in order (lower-casing, emoji removal,
    contraction expansion, link stripping, entity stripping, lemmatization); the
    result is then tokenized without stop words and re-joined without punctuation.
    """
    string_stages = (
        uncapitalize,
        removeEmojis,
        expand_abbr,
        strip_links,
        strip_all_entities,
        lemmatize_with_postag,
    )
    for stage in string_stages:
        text = stage(text)
    return remove_punctuation(remove_stopwords(text))
# Cleaned copy of the training frame; train_df itself is left untouched.
processed_train_df = train_df.assign(text=train_df["text"].apply(clean_text))

In [None]:
# Spot-check the cleaned text.
processed_train_df.head()

Looks good! Now let's look at a few more visualizations to understand our data more easily.

# 4. Further visualizations

## 4.1 Wordclouds 

### 4.1.1 Wordcloud for real disaster tweets

In [None]:
# Word cloud over the cleaned text of rows with target == 1.
# NOTE: `ax` actually holds the Figure returned by plt.figure().
ax = plt.figure(figsize=(20, 20))
clean_disaster_text = " ".join(processed_train_df.loc[processed_train_df["target"] == 1, "text"])
wordcloud = WordCloud(max_words=500, width=1000, height=500)
wordcloud = wordcloud.generate(clean_disaster_text)
plt.imshow(wordcloud, interpolation='bilinear')

### 4.1.2 Wordcloud for non-disaster tweets

In [None]:
# Word cloud over the cleaned text of rows with target == 0.
# NOTE: `ax` actually holds the Figure returned by plt.figure().
ax = plt.figure(figsize=(20, 20))
clean_non_disaster_text = " ".join(processed_train_df.loc[processed_train_df["target"] == 0, "text"])
wordcloud = WordCloud(max_words=500, width=1000, height=500)
wordcloud = wordcloud.generate(clean_non_disaster_text)
plt.imshow(wordcloud, interpolation='bilinear')

## 4.2 n-grams of real disaster tweets

### 4.2.1 Preprocessing

In [None]:
def extract_ngrams(text, num):
    """Return all ``num``-grams of ``text`` as space-joined strings, in order of occurrence."""
    tokens = nltk.word_tokenize(text)
    return list(map(' '.join, ngrams(tokens, num)))

In [None]:
# All cleaned disaster tweets (target == 1) concatenated into one string.
is_disaster = processed_train_df["target"] == 1
disaster_text = " ".join(processed_train_df.loc[is_disaster, "text"])

### 4.2.2 Uni-grams (most common words)

In [None]:
# Frequency table of single words in disaster tweets; show the 15 most common.
real_one_gram = extract_ngrams(text=disaster_text, num=1)
real_one_gram_freq = collections.Counter()
real_one_gram_freq.update(real_one_gram)
real_one_gram_freq.most_common(n=15)

In [None]:
# Lollipop chart of the 15 most frequent unigrams in disaster tweets.
freq_list = real_one_gram_freq.most_common(15)
x = [term for term, _ in freq_list]
y = [count for _, count in freq_list]

# Create a single figure and pass figsize to it directly; a separate plt.figure() call
# would only produce a second, empty figure and leave the drawn axes at default size.
fig, ax = plt.subplots(figsize=(10, 10))
ax.vlines(x, ymin=0, ymax=y, color="green")  # stems rise from the zero baseline
ax.plot(x, y, "o", color="maroon")
# tick_params rotates the existing labels without overriding the tick formatter.
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("count")
ax.set_title("unigram of real disaster tweets")

### 4.2.3 Bi-grams

In [None]:
# Frequency table of word pairs in disaster tweets; show the 15 most common.
real_bigram = extract_ngrams(text=disaster_text, num=2)
real_bigram_freq = collections.Counter()
real_bigram_freq.update(real_bigram)
real_bigram_freq.most_common(n=15)

In [None]:
# Lollipop chart of the 15 most frequent bigrams in disaster tweets.
freq_list = real_bigram_freq.most_common(15)
x = [term for term, _ in freq_list]
y = [count for _, count in freq_list]

# Create a single figure and pass figsize to it directly; a separate plt.figure() call
# would only produce a second, empty figure and leave the drawn axes at default size.
fig, ax = plt.subplots(figsize=(10, 10))
ax.vlines(x, ymin=0, ymax=y, color="green")  # stems rise from the zero baseline
ax.plot(x, y, "o", color="maroon")
# tick_params rotates the existing labels without overriding the tick formatter.
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("count")
ax.set_title("bigram of real disaster tweets")

### 4.2.4 Tri-grams

In [None]:
# Frequency table of word triples in disaster tweets; show the 15 most common.
real_trigram = extract_ngrams(text=disaster_text, num=3)
real_trigram_freq = collections.Counter()
real_trigram_freq.update(real_trigram)
real_trigram_freq.most_common(n=15)

In [None]:
# Lollipop chart of the 15 most frequent trigrams in disaster tweets.
freq_list = real_trigram_freq.most_common(15)
x = [term for term, _ in freq_list]
y = [count for _, count in freq_list]

# Create a single figure and pass figsize to it directly; a separate plt.figure() call
# would only produce a second, empty figure and leave the drawn axes at default size.
fig, ax = plt.subplots(figsize=(10, 10))
ax.vlines(x, ymin=0, ymax=y, color="green")  # stems rise from the zero baseline
ax.plot(x, y, "o", color="maroon")
# tick_params rotates the existing labels without overriding the tick formatter.
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("count")
ax.set_title("trigram of real disaster tweets")

## 4.3 n-grams of non-disaster tweets

### 4.3.1 Preprocessing

In [None]:
# All cleaned non-disaster tweets (target == 0) concatenated into one string.
is_non_disaster = processed_train_df["target"] == 0
non_disaster_text = " ".join(processed_train_df.loc[is_non_disaster, "text"])

### 4.3.2 Uni-grams (most common words)

In [None]:
# Frequency table of single words in non-disaster tweets; show the 15 most common.
non_disaster_one_gram = extract_ngrams(text=non_disaster_text, num=1)
non_disaster_one_gram_freq = collections.Counter()
non_disaster_one_gram_freq.update(non_disaster_one_gram)
non_disaster_one_gram_freq.most_common(n=15)

In [None]:
# Lollipop chart of the 15 most frequent unigrams in non-disaster tweets.
freq_list = non_disaster_one_gram_freq.most_common(15)
x = [term for term, _ in freq_list]
y = [count for _, count in freq_list]

# Create a single figure and pass figsize to it directly; a separate plt.figure() call
# would only produce a second, empty figure and leave the drawn axes at default size.
fig, ax = plt.subplots(figsize=(10, 10))
ax.vlines(x, ymin=0, ymax=y, color="green")  # stems rise from the zero baseline
ax.plot(x, y, "o", color="maroon")
# tick_params rotates the existing labels without overriding the tick formatter.
ax.tick_params(axis="x", labelrotation=45)
ax.set_ylabel("count")
ax.set_title("unigram of non-disaster tweets")

### 4.3.3 Bi-grams

In [None]:
# Frequency table of word pairs in non-disaster tweets; show the 15 most common.
non_disaster_bigram = extract_ngrams(text=non_disaster_text, num=2)
non_disaster_bigram_freq = collections.Counter()
non_disaster_bigram_freq.update(non_disaster_bigram)
non_disaster_bigram_freq.most_common(n=15)

In [None]:
# Lollipop chart of the 15 most frequent bigrams in non-disaster tweets.
freq_list = non_disaster_bigram_freq.most_common(15)
x = [term for term, _ in freq_list]
y = [count for _, count in freq_list]

# Create a single figure and pass figsize to it directly; a separate plt.figure() call
# would only produce a second, empty figure and leave the drawn axes at default size.
# Size matches the other n-gram charts (the old 18x1 size only ever applied to the
# empty figure and would squash the plot if applied here).
fig, ax = plt.subplots(figsize=(10, 10))
ax.vlines(x, ymin=0, ymax=y, color="green")  # stems rise from the zero baseline
ax.plot(x, y, "o", color="maroon")
# tick_params rotates the existing labels without overriding the tick formatter.
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("count")
ax.set_title("bigram of non-disaster tweets")

### 4.3.4 Tri-grams

In [None]:
# Frequency table of word triples in non-disaster tweets; show the 15 most common.
non_disaster_trigram = extract_ngrams(text=non_disaster_text, num=3)
non_disaster_trigram_freq = collections.Counter()
non_disaster_trigram_freq.update(non_disaster_trigram)
non_disaster_trigram_freq.most_common(n=15)

In [None]:
# Lollipop chart of the 15 most frequent trigrams in non-disaster tweets.
freq_list = non_disaster_trigram_freq.most_common(15)
x = [term for term, _ in freq_list]
y = [count for _, count in freq_list]

# Create a single figure and pass figsize to it directly; a separate plt.figure() call
# would only produce a second, empty figure and leave the drawn axes at default size.
fig, ax = plt.subplots(figsize=(10, 10))
ax.vlines(x, ymin=0, ymax=y, color="green")  # stems rise from the zero baseline
ax.plot(x, y, "o", color="maroon")
# tick_params rotates the existing labels without overriding the tick formatter.
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("count")
ax.set_title("trigram of non-disaster tweets")

# 5. Importing word embeddings (GloVe)

In [None]:
# Separate copy so the vectorization below does not overwrite the cleaned text frame.
embedding_df = processed_train_df.copy(deep=True)

In [None]:
# Parse the GloVe text file into {word: float32 vector}.
# Each line holds a word followed by its whitespace-separated coefficients.
word_embeddings = {}
# `with` guarantees the file handle is closed once parsing finishes (or fails).
with open('/kaggle/input/glove6b/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

# 6. Converting sentences into vectors

In [None]:
def get_sentence_vectors(text):
    """Average the embeddings of the whitespace-separated tokens in ``text``.

    Parameters
    ----------
    text : str
        Cleaned sentence.

    Returns
    -------
    numpy.ndarray
        Vector of shape (100,). Tokens missing from ``word_embeddings`` contribute
        nothing to the sum but still count in the divisor. Empty or whitespace-only
        text yields the zero vector.
    """
    sentence_vector = np.zeros((100,))
    tokens = text.split()
    if not tokens:
        # Covers "" and whitespace-only input, which would otherwise divide by zero.
        return sentence_vector
    for token in tokens:
        # Explicit lookup instead of a bare `except`, so only "word not in vocabulary"
        # is ignored and real errors (e.g. a shape mismatch) still surface.
        token_vector = word_embeddings.get(token)
        if token_vector is not None:
            sentence_vector += token_vector
    return sentence_vector / len(tokens)

In [None]:
# Vectorize from the cleaned source frame rather than from embedding_df itself, so
# re-running this cell does not try to embed already-embedded vectors.
embedding_df["text"] = processed_train_df["text"].apply(get_sentence_vectors)

In [None]:
# Preview: the `text` column now holds the vector computed by get_sentence_vectors.
embedding_df.head()

# 7. Machine learning models

## 7.1 Train test split

In [None]:
# 80/20 hold-out split. A fixed random_state makes the metrics below reproducible across
# runs, and stratify keeps the class ratio identical in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    embedding_df["text"],
    embedding_df["target"],
    test_size=0.2,
    random_state=42,
    stratify=embedding_df["target"],
)

## 7.2 Naive Bayes

In [None]:
# Gaussian Naive Bayes baseline: fit, predict on the hold-out set, report metrics and wall time.
print("**********Naive Bayes**********")
model = GaussianNB()
start = time.time()
model.fit(x_train.to_list(), y_train)
y_pred = model.predict(x_test.to_list())
f1score, accuracyscore, precisionscore, recallscore = (
    metric(y_test, y_pred)
    for metric in (f1_score, accuracy_score, precision_score, recall_score)
)
print(f"f1_score: {f1score}\nAccuracy: {accuracyscore}\nPrecision: {precisionscore}\nRecall: {recallscore}")
print(f"Time Taken :{round(start - time.time(), 2) * -1}")
print("\n")

## 7.3 Logistic Regression

In [None]:
# Logistic regression: fit, predict on the hold-out set, report metrics and wall time.
print("**********Logistic Regression**********")
model = LogisticRegression()
start = time.time()
model.fit(x_train.to_list(), y_train)
y_pred = model.predict(x_test.to_list())
f1score, accuracyscore, precisionscore, recallscore = (
    metric(y_test, y_pred)
    for metric in (f1_score, accuracy_score, precision_score, recall_score)
)
print(f"f1_score: {f1score}\nAccuracy: {accuracyscore}\nPrecision: {precisionscore}\nRecall: {recallscore}")
print(f"Time Taken :{round(start - time.time(), 2) * -1}")
print("\n")

## 7.4 Random Forest

In [None]:
# Random forest: fit, predict on the hold-out set, report metrics and wall time.
print("**********Random Forest**********")
# Seeded so the bootstrap samples / feature draws, and therefore the scores, are repeatable.
model = RandomForestClassifier(random_state=42)
start = time.time()
model.fit(x_train.to_list(), y_train)
y_pred = model.predict(x_test.to_list())
f1score = f1_score(y_test, y_pred)
accuracyscore = accuracy_score(y_test, y_pred)
precisionscore = precision_score(y_test, y_pred)
recallscore = recall_score(y_test, y_pred)
print(f"f1_score: {f1score}")
print(f"Accuracy: {accuracyscore}")
print(f"Precision: {precisionscore}")
print(f"Recall: {recallscore}")
# Elapsed seconds for fit + predict + scoring.
print(f"Time Taken :{round(time.time() - start, 2)}")
print("\n")

## 7.5 AdaBoost

In [None]:
# AdaBoost: fit, predict on the hold-out set, report metrics and wall time.
print("**********AdaBoost**********")
# Seeded so repeated runs give the same ensemble and scores.
model = AdaBoostClassifier(random_state=42)
start = time.time()
model.fit(x_train.to_list(), y_train)
y_pred = model.predict(x_test.to_list())
f1score = f1_score(y_test, y_pred)
accuracyscore = accuracy_score(y_test, y_pred)
precisionscore = precision_score(y_test, y_pred)
recallscore = recall_score(y_test, y_pred)
print(f"f1_score: {f1score}")
print(f"Accuracy: {accuracyscore}")
print(f"Precision: {precisionscore}")
print(f"Recall: {recallscore}")
# Elapsed seconds for fit + predict + scoring.
print(f"Time Taken :{round(time.time() - start, 2)}")
print("\n")

## 7.6 XGBoost

In [None]:
# XGBoost: fit, predict on the hold-out set, report metrics and wall time.
print("**********XGBoost**********")
# Seeded so repeated runs give the same booster and scores.
model = XGBClassifier(random_state=42)
start = time.time()
# The Series of per-row vectors is stacked into a 2-D array before being passed in.
model.fit(np.asarray(x_train.to_list()), y_train)
y_pred = model.predict(np.asarray(x_test.to_list()))
f1score = f1_score(y_test, y_pred)
accuracyscore = accuracy_score(y_test, y_pred)
precisionscore = precision_score(y_test, y_pred)
recallscore = recall_score(y_test, y_pred)
print(f"f1_score: {f1score}")
print(f"Accuracy: {accuracyscore}")
print(f"Precision: {precisionscore}")
print(f"Recall: {recallscore}")
# Elapsed seconds for fit + predict + scoring.
print(f"Time Taken :{round(time.time() - start, 2)}")
print("\n")

## 7.7 Support Vector Classifier

In [None]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# Support vector classifier: fit, predict on the hold-out set, report metrics and wall time.
# LinearSVC is imported here for the stacking cells further down.
print("**********SVC**********")
model = SVC()
start = time.time()
model.fit(x_train.to_list(), y_train)
y_pred = model.predict(x_test.to_list())
f1score, accuracyscore, precisionscore, recallscore = (
    metric(y_test, y_pred)
    for metric in (f1_score, accuracy_score, precision_score, recall_score)
)
print(f"f1_score: {f1score}\nAccuracy: {accuracyscore}\nPrecision: {precisionscore}\nRecall: {recallscore}")
print(f"Time Taken :{round(start - time.time(), 2) * -1}")
print("\n")

## 7.8 Stacking

In [None]:
# Stacked ensemble: XGBoost + SVC + random forest, combined by a LinearSVC meta-learner.
print("**********Stacking**********")
start = time.time()
# Every estimator that exposes randomness is seeded so the scores are repeatable.
estimators = [
    ("xgb", XGBClassifier(random_state=42)),
    ("SVC", SVC()),
    ("rfe", RandomForestClassifier(random_state=42)),
]
final_estimator = LinearSVC(random_state=42)
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
stacking_clf.fit(np.asarray(x_train.to_list()), y_train)
y_pred = stacking_clf.predict(np.asarray(x_test.to_list()))
f1score = f1_score(y_test, y_pred)
accuracyscore = accuracy_score(y_test, y_pred)
precisionscore = precision_score(y_test, y_pred)
recallscore = recall_score(y_test, y_pred)
print(f"f1_score: {f1score}")
print(f"Accuracy: {accuracyscore}")
print(f"Precision: {precisionscore}")
print(f"Recall: {recallscore}")
# Elapsed seconds for construction + fit + predict + scoring.
print(f"Time Taken :{round(time.time() - start, 2)}")
print("\n")

# 8. Artificial Neural Networks (ANNs)

## 8.1 Stop function

In [None]:
class myCallback(tf.keras.callbacks.Callback):
    """Keras callback that stops training once training accuracy exceeds 90%."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's logged 'accuracy' and request a stop above 0.90.

        ``logs`` defaults to None rather than a shared mutable dict; a missing
        'accuracy' entry is ignored instead of raising on ``None > 0.90``.
        """
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > 0.90:
            print("\nReached 90% accuracy so cancelling training")
            self.model.stop_training = True

## 8.2 Creating model

In [None]:
callbacks = myCallback()

# Fully connected network ending in a single sigmoid unit.
ann_layers = [
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
]
ann_model = tf.keras.models.Sequential(ann_layers)
ann_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])

# Stack the per-row vectors into 2-D arrays; the callback ends training early.
ann_train_features = np.asarray(x_train.to_list())
ann_val_features = np.asarray(x_test.to_list())
ann_model.fit(
    ann_train_features,
    y_train,
    epochs=1000,
    validation_data=(ann_val_features, y_test),
    callbacks=[callbacks],
)


## 8.3 Evaluate ANN

In [None]:
# Loss and accuracy of the trained network on the hold-out split.
ann_model.evaluate(np.asarray(x_test.to_list()),y_test)

# 9. Preparation for LSTM

## 9.1 Create word frequency

In [None]:
# First ten cleaned tweets, as a reminder of what the tokenizer will see.
processed_train_df.head(10)

In [None]:
def counter_word(text):
    """Count whitespace-separated words across every entry of ``text.values``.

    Returns a ``Counter`` mapping each word to its total number of occurrences.
    """
    return Counter(
        word
        for sentence in text.values
        for word in sentence.split()
    )

In [None]:
# Word frequencies over the whole cleaned training text.
counter = counter_word(processed_train_df.text)

In [None]:
# Vocabulary size used for both the Tokenizer and the Embedding layer.
# Keras word indices start at 1 (0 is reserved for padding) and the Embedding layer needs
# input_dim = largest index + 1, hence the +1 on top of the number of distinct words.
num_words = len(counter) + 1
#Max number of words in a sequence
max_length = 30

## 9.2 Create train test split

In [None]:
# 70/30 hold-out split of the cleaned text. A fixed random_state makes the LSTM results
# reproducible across runs, and stratify keeps the class ratio identical in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    processed_train_df["text"],
    processed_train_df["target"],
    test_size=0.3,
    random_state=42,
    stratify=processed_train_df["target"],
)

## 9.3 Word index

In [None]:
# Build the word -> index vocabulary from the training split only.
# NOTE(review): no oov_token is set, so words unseen here are presumably dropped when
# the test split is converted to sequences — confirm against the Keras Tokenizer docs.
tokenizer = Tokenizer(num_words = num_words)
tokenizer.fit_on_texts(x_train)

In [None]:
# Word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

## 9.4 Create word sequence

In [None]:
# Encode each training tweet as a list of word indices; preview the first five.
train_sequences = tokenizer.texts_to_sequences(x_train)
train_sequences[:5]

## 9.5 Text padding

In [None]:
# Force every training sequence to max_length: zeros are appended to short ones and
# long ones are cut at the end.
train_padded = pad_sequences(
    train_sequences, maxlen= max_length, padding='post', truncating = 'post'
)

In [None]:
# Encode the hold-out tweets with the tokenizer fitted on the training split.
test_sequences = tokenizer.texts_to_sequences(x_test)
test_sequences[:5]

In [None]:
# Same padding / truncation settings as for the training sequences.
test_padded = pad_sequences(
    test_sequences, maxlen= max_length, padding='post', truncating = 'post'
)

In [None]:
# Show one training tweet next to its integer encoding.
print(x_train.head(1))
print(train_sequences[0])

# 10. Building LSTM model

## 10.1 Stop function

In [None]:
class myCallback(tf.keras.callbacks.Callback):
    """Keras callback that stops training once training accuracy reaches 85%.

    NOTE: re-declares (and shadows) the 90% callback from section 8.1.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's logged 'accuracy' and request a stop at or above 0.85.

        ``logs`` defaults to None rather than a shared mutable dict; a missing
        'accuracy' entry is ignored instead of raising on ``None >= 0.85``.
        """
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy >= 0.85:
            print("\nReached 85% accuracy so cancelling training")
            self.model.stop_training = True


callbacks = myCallback()

## 10.2 Creating model

In [None]:
# Embedding -> LSTM -> dense head ending in a single sigmoid unit.
lstm_layers = [
    tf.keras.layers.Embedding(num_words, 32, input_length=max_length),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.LSTM(200),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
]
lstm_model = tf.keras.models.Sequential(lstm_layers)

lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for up to 30 epochs, validating on the hold-out split each epoch;
# the callback may end training earlier.
history = lstm_model.fit(
    train_padded,
    y_train,
    epochs=30,
    verbose=1,
    validation_data=(test_padded, y_test),
    callbacks=[callbacks]
)

## 10.3 Evaluate LSTM

In [None]:
# Loss and accuracy of the trained LSTM on the hold-out split.
lstm_model.evaluate(test_padded,y_test)

# 11. Output

## 11.1 Preprocess test dataset

In [None]:
# NOTE(review): this re-declares clean_text with the same body as the definition in
# section 3.3; the earlier definition is still in scope, so this copy is redundant.
def clean_text(text):
    """Run one tweet through the cleaning helpers in order and return the cleaned string."""
    text = uncapitalize(text)
    text = removeEmojis(text)
    text = expand_abbr(text)
    text = strip_links(text)
    text = strip_all_entities(text)
    text = lemmatize_with_postag(text)
    # remove_stopwords returns a token list; remove_punctuation joins it back into a string.
    cleaned_tokens = remove_stopwords(text)
    final_text = remove_punctuation(cleaned_tokens)
    return final_text

# Clean the test tweets with the same pipeline used for training.
processed_test_df = test_df.copy(deep=True)
processed_test_df["text"] = processed_test_df.text.apply(clean_text)

# The cleaned training text already exists as processed_train_df (section 3.3) and has
# only been read since, so copy it instead of paying for a second full cleaning pass.
final_train_df = processed_train_df.copy(deep=True)

## 11.2 Convert text to vectors

In [None]:
# Replace the cleaned text with its sentence vector, for the test set and the full training set.
test_embedding_df = processed_test_df.assign(
    text=processed_test_df["text"].apply(get_sentence_vectors)
)

final_train_embedding = final_train_df.assign(
    text=final_train_df["text"].apply(get_sentence_vectors)
)

In [None]:
# Preview the vectorized training frame that the final model is fitted on.
final_train_embedding.head()

## 11.3 Output predictions

In [None]:
# Refit the stacked ensemble on the full training set and predict the test set.
# Every estimator that exposes randomness is seeded so the submission is reproducible.
estimators = [
    ("xgb", XGBClassifier(random_state=42)),
    ("SVC", SVC()),
    ("rfe", RandomForestClassifier(random_state=42)),
]
final_estimator = LinearSVC(random_state=42)
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
stacking_clf.fit(
    np.asarray(final_train_embedding["text"].to_list()),
    final_train_embedding["target"],
)
predictions = stacking_clf.predict(np.asarray(test_embedding_df["text"].to_list()))

In [None]:
# Build the submission frame: the test frame's columns without `text`, plus an integer `target`.
predictions_df = pd.Series(np.asarray(predictions).ravel()).to_frame()
result_df = (
    pd.concat([test_df, predictions_df], axis=1)
    .drop(columns=['text'])
    .rename(columns={0: "target"})
)
result_df['target'] = result_df['target'].map(int)

In [None]:
# Sanity-check the submission frame before writing it out.
result_df.head()

In [None]:
# Write the submission file to the working directory, without the index column.
result_df.to_csv('result.csv',index=False)