In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Bag Of Words [NLP METHOD]

![image.png](attachment:image.png)

I feel this is the simplest method in NLP. The basic idea is to convert each text into a vector holding the frequency of each word in the sentence. These vectors are the independent features for our model, and the class of the text (1 or 0 in this dataset) is the dependent variable. The model can be trained using any algorithm (Decision Tree Classifier, Naive Bayes, etc.). However, this method is less effective than more advanced approaches because it ignores the semantics of the text.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('../input/nlp-getting-started/train.csv')

In [None]:
df.head(5)

In [None]:
df.info()

In [None]:
# Plot the boolean null-mask of the frame as a heatmap to see where values are missing.
sns.heatmap(df.isnull())

The columns - keyword and location will be dropped as they are of no use to us.

In [None]:
# Rebind instead of mutating in place, and ignore already-missing columns,
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns=['location', 'keyword'], errors='ignore')

In [None]:
df

In [None]:
# Rows whose target equals 1.
real = df.loc[df['target'].eq(1)]

In [None]:
real

In [None]:
# Rows whose target equals 0.
unreal = df.loc[df['target'].eq(0)]

In [None]:
unreal

In [None]:
# Share of rows in `real` relative to the whole frame, as a percentage.
real_share = len(real) / len(df)
print(f'real disaster message percentage: {real_share * 100}')

In [None]:
# Share of rows in `unreal` relative to the whole frame, as a percentage.
unreal_share = len(unreal) / len(df)
print(f'fake disaster message percentage: {unreal_share * 100}')

Data Visualisation

In [None]:
# Pass the column by keyword: newer seaborn releases no longer treat the first
# positional argument as `x`, so the original call warns or plots the wrong thing.
sns.countplot(x='target', data=df);

We have quite balanced data!

In [None]:
import string


In [None]:
string.punctuation

In [None]:
from nltk.corpus import stopwords

In [None]:
stopwords.words('english');

In [None]:
_ENGLISH_STOPWORDS = None  # filled on first use so the NLTK corpus is read only once


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def message_cleaning(message, stop_words=None):
    """Strip punctuation from ``message`` and return its non-stop-word tokens.

    Parameters
    ----------
    message : str
        Raw text of a single message.
    stop_words : collection of str, optional
        Words to discard. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of str
        Tokens obtained by splitting the punctuation-free text on single
        spaces, in their original order. Matching against the stop words is
        case-sensitive, and consecutive spaces yield empty-string tokens.
    """
    if stop_words is None:
        # Previously the stop-word list was re-read for every single word;
        # it is loop-invariant, so fetch it once and use set membership.
        stop_words = _english_stopwords()
    punctuation_free = ''.join(char for char in message if char not in string.punctuation)
    return [word for word in punctuation_free.split(' ') if word not in stop_words]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer(analyzer=message_cleaning)

We are going to vectorize the text, improving its readability along the way by removing punctuation and stop words!

In [None]:
disaster_tweet_vectorizer = vectorizer.fit_transform(df['text'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back to the old name only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

In [None]:
print(disaster_tweet_vectorizer.toarray())

This matrix shows the count of each unique word (as shown in the previous cell output) in each sentence.

In [None]:
disaster_tweet_vectorizer.shape

So there are 7613 disaster tweets and 21637 unique words.

In [None]:
label = df['target']

In [None]:
label.shape

**Training the Model**

In [None]:
X = disaster_tweet_vectorizer

In [None]:
# Build the dense array from the sparse matrix itself rather than from `X`,
# so re-running this cell does not fail once `X` is already an ndarray.
X = disaster_tweet_vectorizer.toarray()

In [None]:
X

In [None]:
y = label

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


In [None]:
# Raise the iteration cap above the default of 100 so the solver has room to
# converge on the high-dimensional count features.
LR = LogisticRegression(max_iter=1000)
# Seed the randomised tree models (same seed as the train/test split) so that
# the reported accuracies are reproducible on a fresh run.
DTC = DecisionTreeClassifier(random_state=101)
RFC = RandomForestClassifier(random_state=101)
NB = GaussianNB()

In [None]:
# Fit every classifier on the same training split, in the original order.
for estimator in (RFC, DTC, NB):
    estimator.fit(X_train, y_train)
# Kept as the final expression so the fitted estimator is still displayed by the cell.
LR.fit(X_train, y_train)

Making Prediction

In [None]:
# Predict on the held-out split with each fitted model:
# predict1 = random forest, predict2 = decision tree,
# predict3 = Gaussian naive Bayes, predict4 = logistic regression.
predict1, predict2, predict3, predict4 = (
    estimator.predict(X_test) for estimator in (RFC, DTC, NB, LR)
)

**Checking the Performance**

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# The original referenced `prediction`, which is never defined (NameError).
# Report on the held-out predictions of each trained model instead.
for model_name, predicted in (
    ('Random Forest', predict1),
    ('Decision Tree', predict2),
    ('Gaussian Naive Bayes', predict3),
    ('Logistic Regression', predict4),
):
    print(model_name)
    print(classification_report(y_test, predicted))

In [None]:
# Accuracy of each model on the held-out split, in the order
# random forest, decision tree, naive Bayes, logistic regression,
# with a blank separator printed between consecutive scores.
for position, predicted in enumerate((predict1, predict2, predict3, predict4)):
    if position:
        print('\n')
    print(accuracy_score(y_test, predicted))

Highest accuracy using Logistic Regression.

The accuracy is quite good!

Let's predict for the testing dataset.

In [None]:
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
test_df.head()

In [None]:
# Rebind instead of mutating in place, and ignore already-missing columns,
# so that re-running this cell does not raise a KeyError.
test_df = test_df.drop(columns=['keyword', 'location'], errors='ignore')

In [None]:
test_df.head()

In [None]:
test_vectorizer = vectorizer.transform(test_df['text'])

In [None]:
test_vectorizer.shape

Let's predict!

In [None]:
final_predictions = LR.predict(test_vectorizer)

In [None]:
final_predictions

In [None]:
submission_df = pd.DataFrame()

In [None]:
submission_df['id'] = test_df['id']
submission_df['target'] = final_predictions

In [None]:
submission_df['target'].value_counts()