# Natural Language Processing with Disaster Tweets
- Hola amigos, this is my Kaggle Notebook for the Kaggle competition Natural Language Processing with Disaster Tweets, which can be found [here](https://www.kaggle.com/c/nlp-getting-started/overview)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Installing & Importing Packages

In [None]:
import re
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from scipy.sparse import vstack
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Importing the Dataset

In [None]:
# Read the labelled training tweets and the sample submission file, then
# print a summary of the training frame.
df_train = pd.read_csv("../input/nlp-getting-started/train.csv")
df_sub = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
df_train.info()

In [None]:
# Discard repeated (text, target) pairs, keeping the first occurrence of each.
df_train = df_train.drop_duplicates(subset=["text", "target"], keep="first")
# Number of training rows left; used later as the train/test split position.
sep = len(df_train)
df_train.info()

In [None]:
# Detach the label column from the feature frame in a single step:
# pop() returns the column and removes it from df_train.
Y = df_train.pop("target")
print(df_train.shape, Y.shape)

In [None]:
# Read the test tweets and print a summary of the frame.
df_test = pd.read_csv('../input/nlp-getting-started/test.csv')
df_test.info()

In [None]:
# Stack train and test rows so both go through the same pre-processing.
# ignore_index=True gives the combined frame unique 0..N-1 row labels. Without
# it the test rows reuse labels that already exist among the train rows (and
# the train labels have gaps after de-duplication), so any later label-based
# alignment such as reindex() pairs rows with the wrong data.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)
# The location column is not used by this notebook.
df.drop(["location"], axis=1, inplace=True)
df.info()

In [None]:
df.head()

# Pre-processing the Dataset
- First, we perform the **decontraction** of contracted words such as "won't" and "can't".
- Then we remove every word that contains a digit, replace all special (non-letter) characters with spaces, and drop the stopwords, lower-casing the remaining words.
- Finally, we collect the pre-processed sentences in a list (`pre_text`), which is used for featurization in place of the raw text.

In [None]:
# Decontraction
# Ordered (pattern, replacement) rules. The two irregular forms come first so
# the generic "n't" rule cannot turn them into "wo not" / "ca not". Matching is
# case-insensitive so that "Won't", "CAN'T", "DON'T", ... are expanded as well.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        # Specific
        (r"won't", "will not"),
        (r"can't", "can not"),
        # General
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
)


def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The rules are applied in a fixed order (irregular forms first, then the
    generic suffixes) and match regardless of letter case. The inserted
    replacement text is always lower-case.

    Args:
        phrase: The text to expand.

    Returns:
        A new string with every matched contraction replaced, e.g.
        ``"Won't"`` -> ``"will not"`` and ``"they're"`` -> ``"they are"``.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase

In [None]:
# Stop-word list adapted from https://gist.github.com/sebleier/554280
# 'no', 'nor' and 'not' are deliberately left out of the list: negations
# generally carry a lot of information about the meaning of a sentence.

stopwords = {
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
    'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
    'doing', 'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
    'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
    'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't',
    'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd',
    'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
    "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
    "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
    'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
    "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
    'wouldn', "wouldn't",
}

In [None]:
# Combining all the above steps
# Both patterns are compiled once, outside the loop. Raw strings keep "\S" and
# "\d" as regex escapes instead of invalid Python string escapes.
# Remove words with numbers python: https://stackoverflow.com/a/18082370/4084039
_WORDS_WITH_DIGITS = re.compile(r"\S*\d\S*")
# Remove special characters: https://stackoverflow.com/a/5843547/4084039
_NON_LETTERS = re.compile(r"[^A-Za-z]+")

pre_text = []

# tqdm is for printing the status bar
for sen in tqdm(df['text'].values):
    # Decontraction
    sen = decontracted(sen)
    # Drop every whitespace-delimited token that contains a digit
    sen = _WORDS_WITH_DIGITS.sub("", sen).strip()
    # Collapse each run of non-letter characters into a single space
    sen = _NON_LETTERS.sub(" ", sen)
    # Lower-case the words and drop the stopwords
    sen = ' '.join(e.lower() for e in sen.split() if e.lower() not in stopwords)
    # Collect the cleaned sentence, in the same order as the rows of df
    pre_text.append(sen.strip())

In [None]:
# Spot-check a few pre-processed sentences.
for sample_pos in (10, 20, 30):
    print(pre_text[sample_pos])

# Featurizing the Dataset
- We first tried Binary Bag of Words (BoW) as the featurization technique. In that attempt, the feature corresponding to each keyword was set to 2, to differentiate the keyword from the rest of the words.
- This technique, along with a Logistic Regression model, gave worse results than the sample submission, so the approach was dropped (its code is kept commented out below) and TF-IDF features are used instead.

## Binary Bag of Words (BoW)

In [None]:
# count_vect = CountVectorizer(max_features=500, binary=False)

# # Featurizing the Train Dataset (fit_transform)
# fea_train = count_vect.fit_transform(pre_text[:sep])
# print(fea_train.shape)

# # Featuring the Test Dataset (transform)
# fea_test = count_vect.transform(pre_text[sep:])
# print(fea_test.shape)

# fea_text = vstack([fea_train, fea_test])
# fea_text.todense()
# print(type(fea_text), fea_text.shape)

In [None]:
# fea_names = count_vect.get_feature_names()
# fea_to_ind = {}
# for ind, fea in enumerate(fea_names):
#     fea_to_ind[fea] = ind
    
# for i in range(df.shape[0]):
#     key_words = df["keyword"].iloc[i]
#     if(not isinstance(key_words, float)):
#         key_words = key_words.split('%20')
#         indices = []
#         for word in key_words:
#             if word in fea_names:
#                 indices.append(fea_to_ind[word])
#         for ind in indices:
#             fea_text[i, ind] = 2

In [None]:
# TF-IDF over unigrams and bigrams, capped at 60000 features.
tfidf_vect = TfidfVectorizer(
    max_features=60000, ngram_range=(1,2), min_df=1, norm='l2', sublinear_tf=True
)

# Featurizing the Train Dataset (fit_transform): the first `sep` sentences
fea_train = tfidf_vect.fit_transform(pre_text[:sep])
print(fea_train.shape)

# Featurizing the Test Dataset (transform only, reusing the fitted vectorizer)
fea_test = tfidf_vect.transform(pre_text[sep:])
print(fea_test.shape)

# Stack train on top of test and keep the result sparse. The previous
# `fea_text.todense()` call threw its result away, so it only allocated a
# full dense rows x 60000 matrix for nothing.
fea_text = vstack([fea_train, fea_test])
print(type(fea_text), fea_text.shape)

# Preparing the dataset for Training

In [None]:
# Drop the raw text columns now that they have been featurized.
df.drop(["keyword", "text"], axis=1, inplace=True)
fea_text = pd.DataFrame.sparse.from_spmatrix(fea_text)
# Row i of fea_text belongs to row i of df *by position*. df's row labels are
# not guaranteed to be 0..N-1 (they come from concatenating two frames), so a
# label-based `fea_text.reindex(df.index)` would hand rows the features of a
# different tweet. Give df fresh positional labels instead, so the
# column-wise concat lines the rows up one-to-one.
df_fea = pd.concat([df.reset_index(drop=True), fea_text], axis=1)
print(df.shape, fea_text.shape, df_fea.shape)

In [None]:
df_fea.head()

In [None]:
# The first `sep` rows are the training tweets, the remaining rows the test tweets.
df_train = df_fea.iloc[:sep, :]
df_test = df_fea.iloc[sep:, :]

print(df_train.columns)

# Keep the ids aside and remove them from the model inputs. drop() without
# inplace=True returns a new frame, which avoids mutating a slice of df_fea
# (the source of pandas' SettingWithCopyWarning).
id_train = df_train['id']
df_train = df_train.drop(columns=['id'])
id_test = df_test['id']
df_test = df_test.drop(columns=['id'])

print(df_train.shape, Y.shape, id_train.shape, df_test.shape, id_test.shape)

# Training the Model

In [None]:
# 4-fold stratified cross-validation. Each fold trains its own Logistic
# Regression model, reports train/validation metrics, and stores that model's
# test-set predictions in df_sub under the column named after the fold
# number ("1" .. "4").
n = 1
skf = StratifiedKFold(n_splits = 4)

for train_index, val_index in skf.split(df_train, Y):
    X_train, X_val = df_train.iloc[train_index], df_train.iloc[val_index]
    y_train, y_val = Y.iloc[train_index], Y.iloc[val_index]
    model = LogisticRegression(max_iter = 1000, C = 3)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    y_train_pred = model.predict(X_train)
    # The "Accuracy" lines previously printed f1_score as well; report the
    # real accuracy next to the F1 score.
    print("Accuracy (Train) = ", accuracy_score(y_train, y_train_pred))
    print("F1 Score (Train) = ", f1_score(y_train, y_train_pred))
    print("Accuracy (Val) = ", accuracy_score(y_val, y_val_pred))
    print("F1 Score (Val) = ", f1_score(y_val, y_val_pred))

    # Adding a Set of Predictions based off this model
    df_sub[str(n)] = model.predict(df_test)
    n += 1

In [None]:
# Majority vote across the four per-fold prediction columns. mode(axis=1)
# can return more than one column when there is a tie; column 0 is always
# populated and is the one used as the final label.
fre = df_sub[['1', '2', '3', '4']].mode(axis=1)
# Build the final two-column frame in one step. Assigning into the result of
# df_sub[['id', 'target']] would write to a column-subset copy and trigger
# SettingWithCopyWarning. astype(int) replaces the per-element int() lambda.
df_sub = df_sub[['id']].assign(target=fre[0].astype(int))

In [None]:
df_sub.head()

In [None]:
# Write the submission table to 'submission.csv' (relative path), without the
# DataFrame index column.
df_sub.to_csv('submission.csv', index=False)