In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


# IMPORTING REQUIRED LIBRARIES

In [2]:
#import the Libraries
import time
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import GaussianNB as NB 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LR 
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the training split of the "nlp-getting-started" dataset from the Kaggle input directory.
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
# Preview the first rows to sanity-check the parsed columns.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Understanding Data

In [4]:
# Report the dimensions of the training frame, then its column names.
print(f"Shape of the data {train_df.shape}")
print(train_df.columns.tolist())

Shape of the data (7613, 5)
['id', 'keyword', 'location', 'text', 'target']


In [5]:
# Summarise the training frame: per-column dtype, non-null count and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
# Keep only the tweet text and its label; 'id', 'keyword' and 'location' are not used
# by the model below. Selecting the wanted columns (instead of dropping the unwanted
# ones) makes this cell safe to re-run: a second drop() would raise KeyError because
# the columns are already gone.
train_df = train_df[['text', 'target']]
train_df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


# Data Cleaning

In [7]:
# Set to True to print a short preview of the cleaned corpus. A plain flag replaces
# the former input() prompt, which blocks (or fails) when the notebook is executed
# non-interactively, e.g. via "Save & Run All".
SHOW_CORPUS = False
CORPUS_PREVIEW_SIZE = 5

# Build the stemmer and the stop-word set once: they do not change between tweets,
# and a set gives constant-time membership tests inside the token filter.
ps = PorterStemmer()
updated_stopwords = set(stopwords.words('english'))
# Keep 'not' as a real token (presumably because negation changes a tweet's meaning).
updated_stopwords.discard('not')

print("Creating the corpus")
corpus = []
for tweet in train_df['text']:
    # Replace every non-alphabetic character with a space, lower-case, then tokenise.
    review = re.sub('[^a-zA-Z]', " ", tweet).lower().split()
    # Drop stop words and stem the remaining tokens.
    review = [ps.stem(word) for word in review if word not in updated_stopwords]
    corpus.append(' '.join(review))
print("Done!")

if SHOW_CORPUS:
    # Show only the first few documents; printing the whole corpus floods the output.
    print(corpus[:CORPUS_PREVIEW_SIZE])

Creating the corpus
Done!


Want to view the corpus(1:YES/0:NO)? 0


# Creating a Bag of Words

In [8]:
# Fit the vectorizer on the cleaned training corpus and encode every document as a
# row of term counts; the result is converted to a dense array for the classifiers.
cv = CountVectorizer()
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()
# Labels: the last column of the training frame.
y = train_df.iloc[:, -1].values

# Dividing the training data into train and test data

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training Naive Bayes, Logistic Regression and KNN classifiers on the training split

In [10]:
# Each classifier is fitted on the training split and then asked to label the
# held-out split; fit() returns the estimator itself, so construction and fitting
# are chained.

# Gaussian Naive Bayes
# NOTE(review): GaussianNB models continuous features; for word counts a multinomial
# variant is the more common choice - worth comparing.
classifier_nb = NB().fit(x_train, y_train)
y_pred_nb = classifier_nb.predict(x_test)

# Logistic Regression (seeded)
classifier_lr = LR(random_state=0).fit(x_train, y_train)
y_pred_lr = classifier_lr.predict(x_test)

# K-Nearest Neighbours: 5 neighbours, Minkowski metric with p=2
classifier_knn = KNN(n_neighbors=5, metric='minkowski', p=2).fit(x_train, y_train)
y_pred_knn = classifier_knn.predict(x_test)


# Reports on how well our different classifiers have performed

1. NAIVE BAYES ANALYSIS

In [11]:
# Evaluate Naive Bayes on the held-out split: confusion matrix, per-class
# precision/recall/F1, and overall accuracy as a whole-number percentage.
# (The former one-second sleep only delayed the output and has been removed.)
print("NAIVE BAYES ANALYSIS")
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Accuracy is", round(nb_accuracy * 100), '%')


NAIVE BAYES ANALYSIS
[[387 487]
 [124 525]]
              precision    recall  f1-score   support

           0       0.76      0.44      0.56       874
           1       0.52      0.81      0.63       649

    accuracy                           0.60      1523
   macro avg       0.64      0.63      0.60      1523
weighted avg       0.66      0.60      0.59      1523

Accuracy is 60 %


2. LOGISTIC REGRESSION ANALYSIS

In [12]:
# Evaluate Logistic Regression on the held-out split: confusion matrix, per-class
# precision/recall/F1, and overall accuracy as a whole-number percentage.
# (The former one-second sleep only delayed the output and has been removed.)
print("LOGISTIC REGRESSION ANALYSIS")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Accuracy is", round(lr_accuracy * 100), '%')


LOGISTIC REGRESSION ANALYSIS
[[751 123]
 [199 450]]
              precision    recall  f1-score   support

           0       0.79      0.86      0.82       874
           1       0.79      0.69      0.74       649

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523

Accuracy is 79 %


3. KNN ANALYSIS

In [13]:
# Evaluate KNN on the held-out split: confusion matrix, per-class
# precision/recall/F1, and overall accuracy as a whole-number percentage.
# (The former one-second sleep only delayed the output and has been removed.)
print("KNN ANALYSIS")
print(confusion_matrix(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("Accuracy is", round(knn_accuracy * 100), '%')

KNN ANALYSIS
[[862  12]
 [466 183]]
              precision    recall  f1-score   support

           0       0.65      0.99      0.78       874
           1       0.94      0.28      0.43       649

    accuracy                           0.69      1523
   macro avg       0.79      0.63      0.61      1523
weighted avg       0.77      0.69      0.63      1523

Accuracy is 69 %


The reports above show how each classifier performed on the held-out split. Logistic regression performed best, with 79% accuracy, compared with 69% for KNN and 60% for Naive Bayes.

# Test data input

In [14]:
# Load the test split of the "nlp-getting-started" dataset from the Kaggle input directory.
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')
# Preview the first rows to sanity-check the parsed columns.
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Understanding test data

In [15]:
# Report the dimensions of the test frame, then its column names.
print(f"Shape of test data {test_df.shape}")
print(test_df.columns.tolist())

Shape of test data (3263, 4)
['id', 'keyword', 'location', 'text']


In [16]:
# Summarise the test frame: per-column dtype, non-null count and memory usage.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [17]:
# Keep an independent copy of the full test frame (including 'id') before columns
# are removed. Plain assignment would only bind a second name to the same object,
# so any later in-place change to test_df would also show up in the "copy".
test_df_copy = test_df.copy()

In [18]:
# Preview the first rows of test_df_copy, which holds the full test frame including
# the id, keyword and location columns.
test_df_copy.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [19]:
# Keep only the tweet text; 'id', 'keyword' and 'location' are not fed to the model.
# Selecting the wanted column (instead of dropping the unwanted ones) makes this cell
# safe to re-run: a second drop() would raise KeyError because the columns are gone.
# The double brackets keep the result a DataFrame rather than a Series.
test_df = test_df[['text']]
test_df.head()

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan


# Bag of Words for test data

In [32]:
# Clean the test tweets with the same steps used for the training corpus.
# The stemmer and stop-word set are built once, outside the loop, and the loop runs
# over the column itself rather than a hard-coded row count, so it keeps working if
# the test file changes size.
psi = PorterStemmer()
u_s = set(stopwords.words('english'))
# Keep 'not' as a real token (presumably because negation changes a tweet's meaning).
u_s.discard('not')

c = []
for tweet in test_df['text']:
    # Replace every non-alphabetic character with a space, lower-case, then tokenise.
    r = re.sub('[^a-zA-Z]', " ", tweet).lower().split()
    # Drop stop words and stem the remaining tokens.
    r = [psi.stem(word) for word in r if word not in u_s]
    c.append(' '.join(r))

In [33]:
# Encode the cleaned test tweets with the vectorizer already fitted on the training
# corpus (transform only, no refit), so the features match what the classifiers saw.
test_X = cv.transform(c)

# Prediction!

In [34]:
# Label the test tweets with the logistic-regression model, then place the
# predictions in the 'target' column of the competition's submission template.
# NOTE(review): assumes the template rows are in the same order as test.csv - confirm.
test_predictions = classifier_lr.predict(test_X)
sample_submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
sample_submission["target"] = test_predictions

# Submission

In [35]:
# Preview the first rows of the submission frame with the predicted target filled in.
sample_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [36]:
# Write the submission to submission.csv in the current working directory,
# omitting the DataFrame index so only the existing columns are saved.
sample_submission.to_csv("submission.csv", index=False)