Disaster Tweet Classification - Logistic Regression and Naive Bayes (score of Logistic Regression: 0.7932)

In [None]:
#Imports:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import math
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from matplotlib import rcParams
from wordcloud import WordCloud

In [None]:
# Notebook display settings: show every column and every row, and never truncate cell text.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None is the supported "no limit" value; the old -1 sentinel was deprecated in
# pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)


In [None]:
# Load the labelled training tweets from the Kaggle input directory into `df`.
train_csv_path = '/kaggle/input/nlp-getting-started/train.csv'
df = pd.read_csv(train_csv_path)


In [None]:
# Preview the first rows of the raw training data.
df.head()

In [None]:
# Number of tweets in each target class.
df['target'].value_counts()

In [None]:
# Add a `length` column holding the number of characters in each tweet.
# The vectorised .str.len() replaces the original row-by-row loop, which depended on
# chained assignment (df['length'][i] = ...) and on the np.NaN alias removed in NumPy 2.0.
df['length'] = df['text'].str.len().astype(int)

In [None]:
# Side-by-side distributions of tweet length: disaster tweets (left), non-disaster tweets (right).
sns.set_style("darkgrid")
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), tight_layout=True)

disaster_lengths = df.loc[df['target'] == 1, "length"]
no_disaster_lengths = df.loc[df['target'] == 0, "length"]

sns.distplot(disaster_lengths, bins=30, ax=ax1)
sns.distplot(no_disaster_lengths, bins=30, ax=ax2)

ax1.set_title('\n Distribution of length of tweet labelled Disaster\n')
ax2.set_title('\nDistribution of length of tweet labelled No Disaster\n ')
# Trailing semicolon keeps the cell from echoing the returned Text object.
ax1.set_ylabel('Frequency');

In [None]:
# Word cloud (at most 50 words) built from the raw text of tweets labelled 1 (disaster).
text = " ".join(df.loc[df['target'] == 1, 'text'])
wordcloud = WordCloud(
    max_font_size=90,
    max_words=50,
    background_color="white",
    colormap="inferno",
).generate(text)

plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title('\nFrequntly occuring words related to Disaster \n\n', fontsize=18)
plt.axis("off")
plt.show()

In [None]:
# Word cloud (at most 50 words) built from the raw text of tweets labelled 0 (no disaster).
text = " ".join(df.loc[df['target'] == 0, 'text'])
wordcloud = WordCloud(
    max_font_size=90,
    max_words=50,
    background_color="white",
    colormap="inferno",
).generate(text)

plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title('\nFrequntly occuring words related to No Disaster \n\n', fontsize=18)
plt.axis("off")
plt.show()

In [None]:
# Baseline accuracy: class proportions of the target. Always predicting the
# majority class would score the larger of the two fractions shown here.
df['target'].value_counts(normalize=True)

### Tokenizing

When we "tokenize" data, we take it and split it up into distinct chunks based on some pattern.

In [None]:
# Import Tokenizer
from nltk.tokenize import RegexpTokenizer

In [None]:
# Instantiate a tokenizer whose tokens are the matches of r'\w+', i.e. runs of
# word characters; punctuation and whitespace are therefore not kept.
tokenizer = RegexpTokenizer(r'\w+') 


In [None]:
# Lower-case the tweet text so that later token and stop-word matching ignores case.
# The vectorised .str accessor replaces the per-row lambda and passes missing
# values through instead of raising a TypeError on them.
df['text'] = df['text'].str.lower()

In [None]:
# Strip hyperlinks, everything from a 'û' character onwards, and digits from the text.
# NOTE(review): '.' does not match a newline, so 'http.*.*' removes everything from
# "http" to the end of that line -- words that follow a URL on the same line are
# dropped as well (the second '.*' is redundant). 'http\S+' would remove only the
# URL itself. The 'û.*.*' pattern behaves the same way.
# The test-set cleaning cell further down uses the same three patterns; keep the
# two cells identical if these are ever changed.
df['text']=df['text'].str.replace('http.*.*', '',regex = True)
df['text']=df['text'].str.replace('û.*.*', '',regex = True)
df['text']=df['text'].str.replace(r'\d+','',regex= True)

In [None]:
# Run the tokenizer on every cleaned tweet; `tokens` holds one list of strings per row.
df['tokens'] = df['text'].map(tokenizer.tokenize)

In [None]:
# Preview the first rows, now including the cleaned `text` and the new `tokens` column.
df.head()

### Removing Stop Words

In [None]:
# Print NLTK's English stop-word list for inspection.
print(stopwords.words("english"))

In [None]:
# Keep the English stop words in `stop` (a list) so custom entries can be added to it.
stop = stopwords.words("english")

In [None]:
# Extra stop word: 'amp' shows up among the most frequent words
# (presumably the remainder of the HTML entity '&amp;' -- TODO confirm).
# The trailing comment lists further candidates that are currently not added.
item=['amp'] #'https','co','http','û','ûò','ûó','û_'

In [None]:
# Append the custom stop words to the list. Re-running this cell appends them
# again, which is harmless for the membership tests done later.
stop.extend(item)

In [None]:
# Remove stop words from every token list.
# `stop` is a list, so `tok not in stop` would rescan it for each token; a set
# gives the same result with constant-time lookups.
stop_set = set(stop)
df['tokens'] = df['tokens'].apply(lambda tokens: [tok for tok in tokens if tok not in stop_set])

### Lemmatizing 
When we "lemmatize" data, we take words and attempt to return their *lemma*, or the base/dictionary form of a word.<br>


In [None]:
# Importing lemmatizer 
# (both names are already imported in the notebook's first cell; repeated here)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Instantiating lemmatizer; the same instance is reused for the test set later
lemmatizer = WordNetLemmatizer()


In [None]:
# Lemmatize every token and rebuild each tweet as one space-separated string.
# Iterating over the column's values (instead of indexing df['tokens'][i][j]) does
# not rely on the index labels being 0..n-1, and str.join avoids building each
# sentence through repeated string concatenation.
# Unlike the original loop, the strings carry no leading space; the words and
# their order are unchanged.
lemmatize_words = [
    ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    for tokens in df['tokens']
]
   

In [None]:
# Store the lemmatized sentences (one string per tweet) in a new `lemmatized` column.
df['lemmatized']=lemmatize_words

In [None]:
# Preview the first rows, now including the `lemmatized` column.
df.head()

## Modelling 
---
 This step creates two models 

>1.Logistic Regression Model<br>
>2.Naive Bayes Model<br>


In [None]:
#imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [None]:
# Features are the lemmatized tweet strings; the label is the `target` column.
X = df['lemmatized']
y = df['target']

In [None]:
# Split the data into train and test sets. stratify=y keeps the class
# proportions the same in both parts; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# Class counts in the training split, to check that the stratified split kept
# the class balance seen in the full data.
y_train.value_counts()

In [None]:
# Number of rows held out for testing.
y_test.shape

### Logistic Regression Model

In [None]:
# The pipeline consists of two stages:
# 1. 'cvec' - a CountVectorizer that turns each string into token counts
# 2. 'lr'   - a LogisticRegression classifier fitted on those counts
# The step names are the prefixes used in the grid-search parameter names.

pipe = Pipeline([
    ('cvec', CountVectorizer()),  
    ('lr', LogisticRegression()) 
])

In [None]:
# Grid of CountVectorizer settings to search over; the 'cvec__' prefix routes
# each entry to the pipeline step named 'cvec'.
tuned_params = {
    'cvec__max_features': [2500, 3000, 3500],
    'cvec__min_df': [2,3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)]
}
gs = GridSearchCV(pipe, param_grid=tuned_params, cv=3) # 3-fold cross-validated grid search; nothing runs until fit()

model_lr=gs.fit(X_train, y_train) # Fitting on the training split only; fit() returns the fitted search object

# best_score_ is the mean cross-validated score (averaged over the cv folds)
# of the best parameter combination found in tuned_params
print(gs.best_score_) 

#displaying the best values of parameters
gs.best_params_

In [None]:
# Training score (scored on the same data the search was fitted on)
gs.score(X_train, y_train)

In [None]:
# Test score (scored on the held-out split)
gs.score(X_test, y_test)

In [None]:
# Predict labels for the held-out split with the fitted grid-search model.
predictions_lr = model_lr.predict(X_test)

In [None]:
# Importing the confusion matrix function
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of true labels against logistic-regression predictions.
confusion_matrix(y_test, predictions_lr)

In [None]:
# Flatten the confusion matrix into its four cells. With labels 0 and 1 the
# row-major order is: true negatives, false positives, false negatives, true positives.
tn, fp, fn, tp = confusion_matrix(y_test, predictions_lr).ravel()

In [None]:
# Print each confusion-matrix count with its corresponding label.
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

### Naive Bayes Model

In [None]:
# Importing model
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Instantiate a multinomial Naive Bayes classifier with default settings.
nb = MultinomialNB()

In [None]:
# Instantiate a CountVectorizer limited to 500 features for the Naive Bayes model.
# This is a separate vectorizer from the one inside the logistic-regression pipeline.
cvec = CountVectorizer(max_features = 500)

In [None]:
# Learn the vocabulary from the training text and build its document-term count matrix.
# .toarray() yields a plain ndarray; .todense() produced an np.matrix, which
# current scikit-learn estimators refuse as input.
# CountVectorizer ignores the labels, so y_train is no longer passed.
X_train_cvec = cvec.fit_transform(X_train).toarray()

In [None]:
# Transform the held-out text with the vocabulary learnt on the training split.
# .toarray() yields a plain ndarray; .todense() produced an np.matrix, which
# current scikit-learn estimators refuse as input.
X_test_cvec = cvec.transform(X_test).toarray()

In [None]:
# Fit Naive Bayes on the training count matrix.
model_nb=nb.fit(X_train_cvec, y_train)

In [None]:
# Predict labels for the held-out split with the Naive Bayes model.
predictions_nb = model_nb.predict(X_test_cvec)

In [None]:
# Training score of the Naive Bayes model (scored on the data it was fitted on).
model_nb.score(X_train_cvec, y_train)

In [None]:
# Test score of the Naive Bayes model (scored on the held-out split).
model_nb.score(X_test_cvec, y_test)

In [None]:
# Confusion matrix of true labels against Naive Bayes predictions.
confusion_matrix(y_test, predictions_nb)

In [None]:
# Flatten the Naive Bayes confusion matrix into its four cells. This overwrites
# the tn/fp/fn/tp values computed earlier for logistic regression.
tn, fp, fn, tp = confusion_matrix(y_test, predictions_nb).ravel()

In [None]:
# Print each Naive Bayes confusion-matrix count with its corresponding label.
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

In [None]:
# Word cloud (at most 50 words) built from the lemmatized text of tweets labelled 1 (disaster).
text = " ".join(df.loc[df['target'] == 1, 'lemmatized'])
wordcloud = WordCloud(
    max_font_size=90,
    max_words=50,
    background_color="white",
    colormap="inferno",
).generate(text)

plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title('\nFrequntly occuring words related to Disaster \n\n', fontsize=18)
plt.axis("off")
plt.show()

In [None]:
# Word cloud (at most 50 words) built from the lemmatized text of tweets labelled 0 (no disaster).
text = " ".join(df.loc[df['target'] == 0, 'lemmatized'])
wordcloud = WordCloud(
    max_font_size=90,
    max_words=50,
    background_color="white",
    colormap="inferno",
).generate(text)

plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title('\nFrequntly occuring words related to No Disaster \n\n', fontsize=18)
plt.axis("off")
plt.show()

# TEST DATA

In [None]:
# Read the Kaggle test set (the tweets to predict for the submission) into `test`.
test=pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Preview the first rows of the raw test data.
test.head()

In [None]:
# Add a `length` column holding the number of characters in each test tweet.
# The vectorised .str.len() replaces the original row-by-row loop, which depended on
# chained assignment (test['length'][i] = ...) and on the np.NaN alias removed in NumPy 2.0.
test['length'] = test['text'].str.len().astype(int)

In [None]:
# Word cloud (at most 50 words) of the raw text in the test dataframe.
# Fix: the text is now taken from `test`; the original joined `df.text`, so the
# plot titled "test dataframe" actually showed the training tweets.
text = " ".join(post for post in test.text)
wordcloud = WordCloud(max_font_size=90, max_words=50, background_color="white", colormap="inferno").generate(text)
plt.figure(figsize=(10,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title('\nFrequntly occuring words in test dataframe \n\n',fontsize=18)
plt.axis("off")
plt.show()

In [None]:
# Instantiate the tokenizer again with the same r'\w+' pattern used for the
# training data, so both sets are tokenized identically.
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Lower-case the test tweet text, mirroring the training-set preprocessing.
# The vectorised .str accessor replaces the per-row lambda and passes missing
# values through instead of raising a TypeError on them.
test['text'] = test['text'].str.lower()

In [None]:
# Strip hyperlinks, everything from a 'û' character onwards, and digits from the
# test text, using the same three patterns as the training-set cleaning cell.
# NOTE(review): '.' does not match a newline, so 'http.*.*' removes everything from
# "http" to the end of that line, including any words after the URL; 'http\S+'
# would remove only the URL. Keep this cell identical to the training-set cell
# if the patterns are ever changed.
test['text']=test['text'].str.replace('http.*.*', '',regex = True)
test['text']=test['text'].str.replace('û.*.*', '',regex = True)
test['text']=test['text'].str.replace(r'\d+','',regex= True)

In [None]:
# Run the tokenizer on every cleaned test tweet; `tokens` holds one list of strings per row.
test['tokens'] = test['text'].map(tokenizer.tokenize)

In [None]:
# Preview the first rows of the test data, now including the `tokens` column.
test.head()

In [None]:
# Remove stop words from every test token list, using the same `stop` list
# (including the custom additions) as for the training data.
# A set gives the same result as `tok not in stop` with constant-time lookups.
stop_set = set(stop)
test['tokens'] = test['tokens'].apply(lambda tokens: [tok for tok in tokens if tok not in stop_set])

In [None]:
# Lemmatize every test token and rebuild each tweet as one space-separated string.
# Iterating over the column's values (instead of indexing test['tokens'][i][j]) does
# not rely on the index labels being 0..n-1, and str.join avoids building each
# sentence through repeated string concatenation.
# Unlike the original loop, the strings carry no leading space; the words and
# their order are unchanged.
lemmatize_words = [
    ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    for tokens in test['tokens']
]
   

In [None]:
# Store the lemmatized sentences (one string per test tweet) in a new `lemmatized` column.
test['lemmatized']=lemmatize_words

In [None]:
# Preview the first rows of the test data, now including the `lemmatized` column.
test.head()

In [None]:
# Word cloud (at most 50 words) of the lemmatized text in the test dataframe.
# Fix: the text is now taken from `test`; the original joined `df.lemmatized`, so
# the plot titled "test dataframe" actually showed the training tweets.
text = " ".join(post for post in test.lemmatized)
wordcloud = WordCloud(max_font_size=90, max_words=50, background_color="white", colormap="inferno").generate(text)
plt.figure(figsize=(10,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title('\nFrequntly occuring words in test dataframe \n\n',fontsize=18)
plt.axis("off")
plt.show()

In [None]:
# Predict the Kaggle test labels with the tuned logistic-regression pipeline.
# The pipeline contains its own CountVectorizer, so it takes the strings directly.
predictions_kaggle = model_lr.predict(test['lemmatized'])

In [None]:
# Transform the Kaggle test text with the Naive Bayes vectorizer fitted on the training split.
# .toarray() yields a plain ndarray; .todense() produced an np.matrix, which
# current scikit-learn estimators refuse as input.
kaggle_cvec = cvec.transform(test['lemmatized']).toarray()

In [None]:
# Predict the Kaggle test labels with the Naive Bayes model.
predictions_kaggle_nb=model_nb.predict(kaggle_cvec)

# Creating the submission .csv file

In [None]:
# Create an empty data frame for the logistic-regression submission.
submission_kaggle = pd.DataFrame()

In [None]:
# Fill the submission frame with the test ids and the logistic-regression predictions.
# NOTE(review): the source column is `id` but the output header is `Id` --
# confirm that the competition accepts the capitalised header.
submission_kaggle['Id'] = test.id
submission_kaggle['target'] = predictions_kaggle

In [None]:
# Preview the first rows of the logistic-regression submission.
submission_kaggle.head()

In [None]:
# Write the logistic-regression submission to final_kaggle.csv, leaving out the index column.
submission_kaggle.to_csv('final_kaggle.csv', index=False)

# NAIVE BAYES PREDICTION

In [None]:
# Create an empty data frame for the Naive Bayes submission.
submission_kaggle_nb = pd.DataFrame()

In [None]:
# Fill submission_kaggle_nb with the test ids and the Naive Bayes predictions.
# NOTE(review): the source column is `id` but the output header is `Id` --
# confirm that the competition accepts the capitalised header.
submission_kaggle_nb['Id'] = test.id
submission_kaggle_nb['target'] = predictions_kaggle_nb

In [None]:
# Preview the first rows of submission_kaggle_nb.
submission_kaggle_nb.head()

In [None]:
# Write the Naive Bayes submission to final_kaggle_nb.csv, leaving out the index column.
submission_kaggle_nb.to_csv('final_kaggle_nb.csv', index=False)