# Natural Language Processing with Disaster Tweets 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import spaCy and load the language library. Remember to use a larger model!

import spacy
nlp =spacy.load('en_core_web_lg')

In [None]:


import numpy as np
import pandas as pd

import nltk
import re
import string

In [None]:
# Load the labelled training split.
# - The file is decoded with pandas' default UTF-8 codec. `encoding='unicode_escape'`
#   would treat backslashes inside tweets as escape sequences and mis-decode
#   non-ASCII characters.
# - `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
#   `on_bad_lines="skip"` is its replacement and keeps the "skip malformed rows" intent.
train_disaster = pd.read_csv(
    "/kaggle/input/nlp-getting-started/train.csv",
    on_bad_lines="skip",
)
train_disaster

In [None]:
train_disaster['target'].value_counts()

In [None]:
train_disaster['target'].hist()

The histogram shows that the classes are not severely imbalanced (class 1 is roughly half the size of class 0), so we will not apply any class balancing.

# 1. Data cleaning on text data

In [None]:
# check for null values
train_disaster.isna().sum()

In [None]:
# define a function to clean the text data
# use re. sub() function which is used to replace occurrences of a particular sub-string with another sub-string.

def text_cleaning(text):
    """Normalise one raw tweet string before tokenization.

    Steps, in order:
      1. lower-case the text;
      2. remove spans enclosed in square brackets, e.g. ``[link]``;
      3. turn newlines into spaces (so words on adjacent lines are not glued together);
      4. remove every word that contains a digit;
      5. remove the characters ``!``, ``@``, ``#`` and ``$``;
      6. trim leading and trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text of a single tweet.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    # Non-greedy so that each [...] span is removed on its own.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # This also covers the leading "$" that the original tried to lstrip.
    text = re.sub(r'[!@#$]', '', text)
    # str.strip returns a new string; the result must be kept.
    return text.strip()
    

In [None]:
# Replace each raw tweet with its cleaned form, row by row.
cleaned_text = train_disaster['text'].apply(text_cleaning)
train_disaster['text'] = cleaned_text

In [None]:
train_disaster

# Tokenization

In [None]:
def tokenization(text):
    """Split ``text`` into word tokens.

    The text is split on runs of non-word characters (regex ``\\W+``). Empty
    strings, which ``re.split`` yields when the text starts or ends with a
    separator, are dropped.

    Parameters
    ----------
    text : str
        Cleaned text of a single tweet.

    Returns
    -------
    list of str
        The tokens in their original order; ``[]`` for empty input.
    """
    # The pattern must be r'\W+': a plain 'W+' matches the literal letter "W"
    # and would leave every tweet as one single token.
    return [token for token in re.split(r'\W+', text) if token]

# Store the token list of every cleaned tweet in a new column.
train_disaster['tokenized_text'] = train_disaster['text'].apply(tokenization)

# Stemming

In [None]:
from nltk.stem.porter import PorterStemmer
porter =PorterStemmer()

In [None]:
def stemming(text):
    """Return the Porter stem of every token in ``text`` (an iterable of strings), in order."""
    return list(map(porter.stem, text))


train_disaster['stemmed_text'] = train_disaster['tokenized_text'].apply(stemming)

# Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
lemma =WordNetLemmatizer()

In [None]:
def lemmatization(text):
    """Return the WordNet lemma of every token in ``text`` (an iterable of strings), in order."""
    lemmas = []
    for token in text:
        lemmas.append(lemma.lemmatize(token))
    return lemmas


train_disaster['lemmatized_text'] = train_disaster['tokenized_text'].apply(lemmatization)

In [None]:
train_disaster

# Stop words removal

In [None]:
stopwords =(nlp.Defaults.stop_words)

In [None]:
# Rebinds `stopwords` to NLTK's English stop-word collection, replacing the value
# assigned in the previous cell. The next cell overwrites `stopwords[0:300]` by slice
# assignment, so it relies on this object supporting that.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available — confirm.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
stopwords[0:300]=['whereupon', 'n‘t', 'whoever', 'ca', 'serious', 'seemed', 'been', 'few', 'which', 'there', 'myself', 'part', 'seeming', 'indeed', 'call', 'another', 'namely', 'show', 'used', 'for', 'sometime', 'wherever', 'bottom', 'ever', 'fifteen', 'ten', 'top', 'done', 'noone', 'not', 'yourself', 'beyond', 'afterwards', 'move', 'more', 'most', 'therein', 'back', "'ve", 'my', 'himself', '‘ll', 'any', 'perhaps', 'something', 'last', 'until', 'anyhow', 'nobody', 'our', 'hereby', 're', 'hers', 'does', 'put', 'every', 'into', 'such', 'they', 'everywhere', 'one', 'always', 'has', 'full', 'anyway', 'third', 'us', 'it', 'towards', 'almost', 'on', 'out', 'her', 'as', 'might', 'same', 'your', 'me', 'hundred', 'together', 'the', 'already', 'an', 'eight', 'mostly', 'have', 'further', 'only', 'using', 'what', 'whereas', 'though', 'name', 'being', 'became', 'regarding', 'side', 'moreover', 'under', 'did', 'whether', 'amongst', 'that', 'whence', 'when', 'we', 'empty', 'well', 'herself', 'eleven', 'whither', 'say', 'him', 'even', 'off', 'against', 'give', 'below', 'beforehand', 'really', "'ll", 'itself', 'made', 'thus', 'toward', 'his', '‘d', 'you', 'get', 'whole', 'a', 'would', 'ours', 'becomes', 'nevertheless', 'many', 'unless', 'throughout', 'either', 'over', 'these', 'and', 'so', 'them', '’ll', 'those', 'since', 'somehow', '’re', 'alone', 'neither', 'without', 'forty', 'cannot', 'make', 'he', 'twelve', 'front', 'in', 'none', 'down', 'after', 'was', 'thereupon', 'keep', 'around', 'go', 'however', 'no', 'becoming', 'yourselves', 'else', 'just', 'between', 'yet', 'whereby', '’m', 'others', 'who', 'former', 'had', 'amount', 'among', 'everyone', 'herein', 'two', 'nor', 'other', 'could', 'thereafter', 'still', 'thereby', 'anyone', 'because', 'before', 'rather', 'will', 'hereafter', 'latterly', '‘m', 'how', 'may', 'three', 'across', 'do', "'m", 'become', 'whom', 'up', 'along', 'each', 'due', 'sometimes', 'anything', 'within', 'is', 'several', 'should', 'latter', 'themselves', 
'are', 'by', 'whereafter', 'she', 'someone', 'nothing', 'nowhere', 'behind', 'or', 'too', 'twenty', 'wherein', 'be', 'except', 'once', 'enough', 'besides', 'first', 'am', "'s", 'quite', 'anywhere', 'from', 'can', 'about', 'onto', '’s', 'this', 'then', 'than', 'all', 'ourselves', 'at', 'while', 'also', '‘re', 'if', 'five', 'upon', 'yours', 'least', 'very', 'although', 'where', 'less', 'above', 'nine', 'much', '’d', 'hence', 'of', '‘ve', 'whose', '’ve', 'meanwhile', 'see', 'doing', 'per', 'elsewhere', 'their', 'mine', 'whatever', 'via', 'to', 'were', 'some', 'thence', 'various', '‘s', 'here', 'why', 'please', 'thru', 'through', 'seems', 'take', 'again', 'during', 'seem', 'six', "n't", 'formerly', 'sixty', "'re", 'four', 'n’t', 'but', 'everything', 'whenever', "'d", 'often', 'never', 'with', 'next', 'hereupon', 'otherwise', 'i', 'somewhere', 'both', 'beside', 'fifty', 'therefore', 'its', 'now', 'own', 'must']

In [None]:
def stopword_removal(text):
    """Return the tokens of ``text`` that are not in the global ``stopwords`` collection, in order."""
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept


train_disaster['final_cleaned_text'] = train_disaster['lemmatized_text'].apply(stopword_removal)

In [None]:
train_disaster

So our final cleaned text is train_disaster['final_cleaned_text']

# 2. MODELLING

# Using different machine learning algorithms, we will try to predict the targets

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Features: one whitespace-joined string per tweet, the input form TfidfVectorizer expects.
# The strings are built here explicitly: a Series taken from the frame is not guaranteed
# to reflect a later `train_disaster['final_cleaned_text'] = ...` assignment (this depends
# on the pandas version), so `x` must not rely on a later cell converting the column.
# Rows that are already strings are passed through unchanged.
x = train_disaster['final_cleaned_text'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)
# Labels to predict.
y = train_disaster['target']

In [None]:
# train_disaster['final_cleaned_text'] holds a list of tokens per row, but the
# vectorizer needs one string per row, so each token list is joined with single spaces.
# Rows that are already strings are left untouched, so re-running this cell does not
# break them up into space-separated characters.

train_disaster['final_cleaned_text'] = [
    tokens if isinstance(tokens, str) else " ".join(tokens)
    for tokens in train_disaster['final_cleaned_text'].values
]

In [None]:
# Split the labelled data further into a training part and a validation part.
# test_size=0.25 holds out a quarter of the rows for validation; random_state is
# fixed so the same split is produced on every run.

x_train ,x_val ,y_train,y_val =train_test_split(x,y,test_size=0.25 ,random_state=40 )

In [None]:
x_train

In [None]:
x_val

# 2.a)  Use linear support vector machine along with pipeline

The TfidfVectorizer tokenizes documents, learns the vocabulary and the inverse document frequency (IDF) weightings, and lets you encode new documents. IDF is a weight indicating how commonly a word is used: the more frequently a word appears across documents, the lower its score.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,classification_report

In [None]:
# Tf-idf features followed by a linear support vector classifier with default settings.
pipeline1 = Pipeline(
    steps=[
        ('Tfidf', TfidfVectorizer()),
        ('Svm', LinearSVC()),
    ]
)

pipeline1

In [None]:
pipeline1.fit(x_train,y_train)

In [None]:
predict1 =pipeline1.predict(x_val)


In [None]:
print(accuracy_score(y_val,predict1))

In [None]:
accuracy2 =accuracy_score(y_val,predict1)

In [None]:
print(classification_report(y_val,predict1))

# 2.b) Using Logistic regression and vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [None]:
# Tf-idf features followed by L2-penalised logistic regression fitted with the SAGA solver.
pipeline2 = Pipeline(
    steps=[
        ('Tfidf', TfidfVectorizer()),
        ('logisticregression', LogisticRegression(solver='saga', penalty='l2')),
    ]
)

pipeline2

In [None]:
pipeline2.fit(x_train,y_train)

In [None]:
predict2 =pipeline2.predict(x_val)
predict2

In [None]:
print(accuracy_score(y_val,predict2))

In [None]:
accuracy3 =accuracy_score(y_val,predict2)

In [None]:
print(classification_report(y_val,predict2))

# 2.c) Using Naive Bayes and vectorization

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Tf-idf features followed by multinomial Naive Bayes (alpha=1.0, class priors learned from the data).
pipeline3 = Pipeline(
    steps=[
        ('Tfidf', TfidfVectorizer()),
        ('naivebayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)),
    ]
)

pipeline3

In [None]:
pipeline3.fit(x_train,y_train)

In [None]:
predict3 =pipeline3.predict(x_val)
predict3

In [None]:
print(accuracy_score(y_val,predict3))

In [None]:
accuracy4= accuracy_score(y_val,predict3)

In [None]:
print(classification_report(y_val,predict3))

# 2.d) Using Stochastic Gradient Descent and vectorization

In [None]:
from sklearn.linear_model import SGDClassifier


In [None]:
# Tf-idf features followed by an SGD-trained linear classifier (hinge loss, L2 penalty, fixed seed).
pipeline4 = Pipeline(
    steps=[
        ('Tfidf', TfidfVectorizer()),
        ('SGD', SGDClassifier(random_state=0, penalty='l2', loss='hinge')),
    ]
)

pipeline4

In [None]:
pipeline4.fit(x_train,y_train)

In [None]:
predict4 =pipeline4.predict(x_val)
predict4

In [None]:
print(accuracy_score(y_val,predict4))

In [None]:
accuracy5= accuracy_score(y_val,predict4)

In [None]:
print(classification_report(y_val,predict4))

# 2.e) Using Random Forest classifier and vectorization

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Tf-idf features followed by a 100-tree random forest.
# random_state is pinned (matching the SGD pipeline's seed) so that repeated runs give
# the same validation accuracy; without it the forest draws a different random seed on
# every fit, which makes the model comparison further down unstable.
pipeline5 = Pipeline([
    ('Tfidf', TfidfVectorizer()),
    ('RFC', RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_depth=None,
        min_samples_split=4,
        random_state=0,
    )),
])

pipeline5

In [None]:
fit =pipeline5.fit(x_train,y_train)

In [None]:
predict5 =pipeline5.predict(x_val)
predict5

In [None]:
print(accuracy_score(y_val,predict5))

In [None]:
accuracy6= accuracy_score(y_val,predict5)

In [None]:
print(classification_report(y_val,predict5))

# Compare performances of all models

In [None]:
# Validation accuracy of each pipeline, listed in the order the models were trained above.
models = ['LinearSVM', 'Logistic Regression', 'Naive Bayes', 'SGD classifier', 'RandomForestClassifier']
all_accuracies = [accuracy2, accuracy3, accuracy4, accuracy5, accuracy6]

# One row per model: its name and its accuracy.
df = pd.DataFrame(list(zip(models, all_accuracies)), columns=['Model', 'Accuracy'])
df

# The Naive Bayes model gave the best accuracy in most runs of all the models, hence we will proceed with that model

# Predict on the given test data with the Naive Bayes model

In [None]:
# Load the unlabelled test split.
# - The file is decoded with pandas' default UTF-8 codec. `encoding='unicode_escape'`
#   would treat backslashes inside tweets as escape sequences and mis-decode
#   non-ASCII characters.
# - `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
#   `on_bad_lines="skip"` is its replacement and keeps the "skip malformed rows" intent.
test_disaster = pd.read_csv(
    "/kaggle/input/nlp-getting-started/test.csv",
    on_bad_lines="skip",
)
test_disaster

In [None]:
# The classifiers were fitted on text that had been cleaned, tokenized, lemmatized,
# stop-word-filtered and re-joined into one string per tweet. The test tweets go through
# exactly the same steps here, so the fitted tf-idf vocabulary sees input in the same form
# it was trained on. test_disaster['text'] itself is left unmodified.
xtest = (
    test_disaster['text']
    .apply(text_cleaning)
    .apply(tokenization)
    .apply(lemmatization)
    .apply(stopword_removal)
    .apply(" ".join)
)
xtest

In [None]:
# pipeline3 is the model for Naive Bayes

predict_test =pipeline3.predict(xtest)  

Attach the predicted values to the test data set

In [None]:
test_disaster['target'] =predict_test

test_disaster

In [None]:
final_test_disaster =test_disaster[['id','target']]

In [None]:
final_test_disaster

In [None]:
final_test_disaster.to_csv('sample_submission.csv' ,index= False)