#Load library and Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

import nltk

from nltk.corpus import stopwords

In [2]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [4]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


#Data Preprocessing on text data

In [8]:
!pip install contractions
!pip install unidecode
from contractions import fix
from unidecode import unidecode

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24
Collecting unidecode
  Downloading Unidecode-1.3

In [9]:
from string import punctuation
from nltk.stem import WordNetLemmatizer, LancasterStemmer

In [10]:
# remove newlines , spaces
def remove_blank(data):
    text=data.replace("\\n"," ").replace("\t"," ")
    return text

In [11]:
# Contractions mapping
def contract_text(data):
    text=fix(data)
    return text

# handling accented character
def handling_accented_chr(data):
    text=unidecode(data)
    return text

In [12]:
# remove stopwords
stopwords_list=stopwords.words('english')
stopwords_list.remove('no')
stopwords_list.remove('nor')
stopwords_list.remove('not')

# clean the text
def clean_text(data):
    # tokenization
    tokens=nltk.word_tokenize(data)
    clean_data=[i.lower() for i in tokens if (i.lower() not in punctuation) and (i.lower() not in stopwords_list) and (len(i)>2) and (i.isalpha())]
    return clean_data

def lemmatization(data):
    final_text=[]
    lemmatizing=WordNetLemmatizer()
    for i in data:
        lemma=lemmatizing.lemmatize(i)
        final_text.append(lemma)

    return " ".join(final_text)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
x_train,x_test,y_train,y_test=train_test_split(train_df.text, train_df.target,test_size=0.15)

In [15]:
print('Shape (x , y) for train and test data \n',x_train.shape,'\n',x_test.shape,'\n', y_train.shape,'\n', y_test.shape)

Shape (x , y) for train and test data 
 (6471,) 
 (1142,) 
 (6471,) 
 (1142,)


In [16]:
clean_train=x_train.apply(remove_blank)
clean_test=x_test.apply(remove_blank)

clean_train=clean_train.apply(contract_text)
clean_test=clean_test.apply(contract_text)

clean_train=clean_train.apply(handling_accented_chr)
clean_test=clean_test.apply(handling_accented_chr)

clean_train=clean_train.apply(clean_text)
clean_test=clean_test.apply(clean_text)

clean_train=clean_train.apply(lemmatization)
clean_test=clean_test.apply(lemmatization)

In [17]:
## For Test Data
test_data=test_df.text.apply(remove_blank)

test_data=test_data.apply(contract_text)

test_data=test_data.apply(handling_accented_chr)

test_data=test_data.apply(clean_text)

test_data=test_data.apply(lemmatization)

In [18]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [19]:
count = CountVectorizer()
count_train = count.fit_transform(clean_train)
count_test = count.transform(clean_test)

count_test_data = count.transform(test_data)

In [20]:
count_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [21]:
d = pd.DataFrame(count_train.toarray(), columns=count.get_feature_names_out())
d

Unnamed: 0,aaaa,aaaaaaallll,aaarrrgghhh,aan,aannnnd,aaronthefm,ab,aba,abandon,abandoned,...,zone,zoom,zotar,zouma,zourryart,zrnf,zumiez,zurich,zxathetis,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6466,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6467,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6468,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6469,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#Creating Model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report,confusion_matrix

In [23]:
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', MultinomialNB()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier())
]
for name, model in models:
    model.fit(count_train.toarray(), y_train)
    y_pred = model.predict(count_test.toarray())
    print("---------------Model ", model, " evaluation on Test data-----------------------")
    print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred))
    print()
    print("Classification Report:\n",classification_report(y_test,y_pred))
    print()

    acc_test=accuracy_score(y_test,y_pred)*100
    print('Accuracy for Test Data is:',acc_test)
    print("********************************************************")

    y_pred_train=model.predict(count_train.toarray())
    #y_pred_train
    print("---------------Model ",model," evaluation on Train data-----------------------")
    print("Confusion Matrix:\n",confusion_matrix(y_train,y_pred_train))
    print()
    print("Classification Report:\n",classification_report(y_train,y_pred_train))
    print()

    acc_train=accuracy_score(y_train,y_pred_train)*100
    print('Accuracy for Train Data is:',acc_train)

---------------Model  LogisticRegression()  evaluation on Test data-----------------------
Confusion Matrix:
 [[568  81]
 [153 340]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.88      0.83       649
           1       0.81      0.69      0.74       493

    accuracy                           0.80      1142
   macro avg       0.80      0.78      0.79      1142
weighted avg       0.80      0.80      0.79      1142


Accuracy for Test Data is: 79.50963222416813
********************************************************
---------------Model  LogisticRegression()  evaluation on Train data-----------------------
Confusion Matrix:
 [[3623   70]
 [ 251 2527]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96      3693
           1       0.97      0.91      0.94      2778

    accuracy                           0.95      6471
   macro avg       0.95      0.

#Predict

In [24]:
rf = RandomForestClassifier()

In [25]:
rf.fit(count_train.toarray(),y_train)

In [26]:
y_pred = rf.predict(count_test.toarray())

In [27]:
y_pred[:15]    # Predicted Label

array([1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1])

In [28]:
y_test[:15]    # Actual Label

4259    1
6694    0
802     0
2383    0
1198    1
2047    0
1541    0
797     0
2106    0
2259    0
1101    1
3919    0
4158    0
7381    0
591     0
Name: target, dtype: int64

In [29]:
# Predictions On Unseen Data
predictions = rf.predict(count_test_data.toarray())

In [30]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [31]:
result = pd.DataFrame()
result['id'] = test_df['id']
result['target'] = predictions

In [32]:
result.to_csv('submission.csv',index_label=False)

In [33]:
df= pd.read_csv('submission.csv')
df

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
