In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


# Importing and Exploring Data 
* ## Exploration is done on train.csv, but preprocessing based on training exploration is done on both training and testing data simultaneously

In [2]:
unclean_train=pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
unclean_test=pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
unclean_train.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [3]:
# No. of rows in the dataset
len(unclean_train)

7613

In [4]:
#Some NULL objects present in 'keyword' and 'location', but not in 'id', 'text', and 'target'
unclean_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
#Number of values for each of class 0 and calss 1
unclean_train['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [6]:
unclean_train['keyword'].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [7]:
unclean_train['keyword'].isnull().sum()

61

In [8]:
#Dropping features not deemed important 
unclean_train.drop(['keyword','location'],axis=1,inplace=True)
unclean_test.drop(['keyword','location'],axis=1,inplace=True)

unclean_train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
#Duplicates are removes from training dataset to increase efficiency and reduce redundancy
unclean_train.drop_duplicates('text',keep='first',inplace=True)

#This is NOT to be done on test dataset.

unclean_train['target'].value_counts()

target
0    4305
1    3198
Name: count, dtype: int64

In [10]:
unclean_train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [11]:
#id column is not needed for training, but required in submission.csv, so we store is safely away
ids_train=unclean_train['id']
ids_test=unclean_test['id']
ids_train

0           1
1           4
2           5
3           6
4           7
        ...  
7604    10863
7605    10864
7606    10866
7608    10869
7612    10873
Name: id, Length: 7503, dtype: int64

In [12]:
inter_train=unclean_train.drop('id',axis=1)
inter_test=unclean_test.drop('id',axis=1)
inter_train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


# Data Cleaning
* ## Powerful tools of regex and nltk are used to clean text

In [13]:
import re   #regex
import string
def clean_text_round1(text): #Sq. brackets, punctiuations, quotation marks etc, which don't add meaning to core of text is removed
    
    text = str(text).lower() #All UPPERCASE alphabets reduces to lowercase
#   text = re.sub('\[.*?\]', '', text) #removes text in sq. brackets, incl. the brackets; not needed here
    text = re.sub(r'\[|\]', '', text) #removes only square brackets
    text = re.sub('<.*?>+', '', text) #removes HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) #removes punctuation
    text = re.sub('\d', '', text) #removes digits
    text = re.sub('[“”…]', '', text) #removes double quotation marks; single quotations not removed to facilitate contraction of words such as can't to cannot, don't to do not, etc...
    text = re.sub('\n', '', text) #removes newlines
    
    text = re.sub(r'https?\S+|www\.\S+', '', text) #removes links
    text = re.sub('\s+', ' ', text) #removes extra white spaces
    text = re.sub('[^A-Za-z0-9\s]', '', text) #removes misc. symbols
#     print(text)
    return text
    

In [14]:
# facilitate contraction of words such as can't to cannot, don't to do not, etc...
!pip install contractions
from contractions import fix 
def contract(text):
    text=fix(str(text))
    return text

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [15]:
### LEMMATISATION VIA NLTK IS NOT WORKING FOR SOME REASON... PLEASE FEEL FREE TO DEBUG AND LET ME KNOW TOO! ###


# import nltk
# from nltk.corpus import wordnet

# # Download WordNet (if not already downloaded)
# nltk.download('wordnet')
# # nltk.download('corpora/wordnet')

# def lemma(text):
#     # Tokenize the text
#     # text = "This is a sentence with words to lemmatize."
#     tokens = nltk.word_tokenize(text)

#     # Initialize the WordNetLemmatizer
#     lemmatizer = wordnet.WordNetLemmatizer()

#     # Lemmatize each token
#     lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

#     # Join the lemmatized tokens back into a string
#     lemmatized_text = " ".join(lemmatized_tokens)

#     return lemmatized_text  # Output: "This is a sentence with words to lemma"


In [16]:
### LEMMATISATION ###

import spacy

# Load the spaCy English model (if not already installed)
nlp = spacy.load("en_core_web_sm")

def lemm(text):
    # Process the text
    doc = nlp(text)

    # Get the lemmas of each token
    lemmatized_tokens = [token.lemma_ for token in doc]

    # Join the lemmatized tokens back into a string
    lemmatized_text = " ".join(lemmatized_tokens)

    return lemmatized_text


In [17]:
## STOPWORDS do not add much meaning to text, hence are removed
from nltk.corpus import stopwords
def clean_text_round2(text):
    return ' '.join([word for word in text.split() if word not in stopwords.words('english')])

In [18]:
### I TRIED TO CONSOLIDATE ALL CLEANING FUNCTIONS INTO A SINGLE FUNCTION, BUT THIS SNIPPET IS NOT WORKING. FEEL FREE TO DEBUG AND LET ME KNOW TOO!! ###

# def cleaning(text):
#     clean_text_round1(text)
#     contract(text)
#     lemm(text)
#     clean_text_round2(text)
#     text = re.sub('[‘’]', '', text)
#     return text

* ## The above functions are applied consecutively to clean the 'text' columns of both training and testing datasets simultaneously

In [19]:
inter_train['text']=inter_train['text'].apply(clean_text_round1)
inter_test['text']=inter_test['text'].apply(clean_text_round1)

inter_train.head()

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,people receive wildfires evacuation orders in...,1
4,just got sent this photo from ruby alaska as s...,1


In [20]:
inter_train['text']=inter_train['text'].apply(contract)
inter_test['text']=inter_test['text'].apply(contract)

inter_train.head()

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,people receive wildfires evacuation orders in...,1
4,just got sent this photo from ruby alaska as s...,1


In [21]:
inter_train['text']=inter_train['text'].apply(lemm)
inter_test['text']=inter_test['text'].apply(lemm)

inter_train.head()

Unnamed: 0,text,target
0,our deed be the reason of this earthquake may ...,1
1,forest fire near la ronge sask canada,1
2,all resident ask to shelter in place be be not...,1
3,people receive wildfire evacuation order in ...,1
4,just get send this photo from ruby alaska as s...,1


In [22]:
inter_train['text']=inter_train['text'].apply(clean_text_round2)
inter_test['text']=inter_test['text'].apply(clean_text_round2)

inter_train.head()

Unnamed: 0,text,target
0,deed reason earthquake may allah forgive,1
1,forest fire near la ronge sask canada,1
2,resident ask shelter place notify officer evac...,1
3,people receive wildfire evacuation order calif...,1
4,get send photo ruby alaska smoke wildfires pou...,1


In [23]:
inter_train['text']=inter_train['text'].apply(lambda x: re.sub('[‘’]', '', x))
inter_test['text']=inter_test['text'].apply(lambda x: re.sub('[‘’]', '', x))

In [24]:
inter_train.head()

Unnamed: 0,text,target
0,deed reason earthquake may allah forgive,1
1,forest fire near la ronge sask canada,1
2,resident ask shelter place notify officer evac...,1
3,people receive wildfire evacuation order calif...,1
4,get send photo ruby alaska smoke wildfires pou...,1


In [25]:
inter_test.head()

Unnamed: 0,text
0,happen terrible car crash
1,hear earthquake different city stay safe everyone
2,forest fire spot pond geese flee across street...
3,apocalypse light spokane wildfire
4,typhoon soudelor kill china taiwan


# Model Training and Evaluations
* ## Logistic Regression, XGB, RF, Ensembling, Decision Trees, Gradient Boosting are used...

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [27]:
X_train,X_test,Y_train,Y_test=train_test_split(inter_train['text'],inter_train['target'],train_size=0.7)

## LOGISTIC REGRESSION

In [28]:
from sklearn.linear_model import LogisticRegression
pipe1=Pipeline([('tfidfv',TfidfVectorizer()),
               ('lgr',LogisticRegression(solver='liblinear'))])

# Various hyperparameters were used in hit and trial, and the above gave the best result

model1=pipe1.fit(X_train,Y_train)
print(f1_score(Y_test,model1.predict(X_test)))

0.7278950370792926


## GRADIENT BOOST

In [29]:
from sklearn.ensemble import GradientBoostingClassifier
pipe2 = Pipeline([('tfidft', TfidfVectorizer()),
                ('gb', GradientBoostingClassifier(max_depth=100))])
model2=pipe2.fit(X_train,Y_train)
print(f1_score(Y_test,model2.predict(X_test)))

0.6745098039215686


## ENSEMBLING of RF, LR, XGB

In [30]:
!pip install xgboost
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

model_1 = LogisticRegression(solver = 'liblinear')
model_2 = XGBClassifier(max_depth = 100)
model_3 = RandomForestClassifier(max_depth = 100)


pipe3 = Pipeline([('tfidfv', TfidfVectorizer()),

                ('model', VotingClassifier(estimators = [('lr', model_1), ('XGBoost',model_2) , ('rf', model_3)], voting='hard'))])

# Fitting the model

model3 = pipe3.fit(X_train, Y_train)
print(f1_score(Y_test,model3.predict(X_test)))


0.7140340575455079


## DECISION TREE CLASSIFIER

In [31]:
from sklearn.tree import DecisionTreeClassifier

pipe4 = Pipeline([('tfidfv', TfidfVectorizer()),
                ('model', DecisionTreeClassifier(max_depth = 75))])

# Fitting the model

model4 = pipe4.fit(X_train, Y_train)
print(f1_score(Y_test,model4.predict(X_test)))

0.6604759141033082


### SINCE LOGISTIC REGRESSION HAS HIGHEST SCORE, WE SUBMIT THIS MODEL FINALLY

In [32]:
pd.DataFrame({'id':'ids_test','target':model1.predict(inter_test['text'])}).to_csv('submission.csv',index=False)