### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import re
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
df = pd.read_csv('E:\Kaggle Playground\Disaster prediction using Tweets\train.csv')

In [4]:
df.shape

(7613, 5)

In [5]:
# almost balanced dataset
df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

### Preprocessing data I (removing useless data): Removing stopwords, tokenization, lemmatization, external web links and numbers using RegEx

In [6]:
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
df.drop(columns='location', inplace=True)

In [8]:
# ACCEPTS STRING => RETURNS STRING
def link_hashtag_remover(text):
  # reference : https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression
  # pattern = 'http:[\/]{2}[a-z].[a-z]*[\/][a-zA-Z0-9]*|#[a-zA-Z0-9]*|@[a-zA-Z0-9]*'

  preprocessed = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())
  #re.sub() is used to replace a given pattern with a replacement pattern
  return preprocessed


# ACCEPTS STRING => RETURNS LIST
def stopwords_punct_remover(text):
  doc = nlp(text)

  preprocessed = []

  for token in doc:
    if(token.is_stop == False):
      if(token.pos_ not in ["PUNCT", "SPACE"]):
        # print(token.text)
        preprocessed.append(token.text)

  return preprocessed
  #return preprocessed

# ACCEPTS LIST => RETURNS STRING
def lemmatizer(text):
  text = " ".join(text)
  str1 = ""
  doc = nlp(text)
  for token in doc:
    str1 += token.lemma_
    str1 += " "
  return str1

### Creating a custom pipeline for doing all the things simultaneously

In [9]:
def pipeline(text):
  text1 = link_hashtag_remover(text)
  text2 = stopwords_punct_remover(text1)
  text3 = lemmatizer(text2)

  return text3

In [10]:
X = pd.DataFrame()
X['text'] = df['text'].map(lambda x: pipeline(x))

In [24]:
X.head()

Unnamed: 0,text
0,deed Reason earthquake ALLAH Forgive
1,forest fire near La Ronge Sask Canada
2,resident ask shelter place notify officer evac...
3,13 000 people receive wildfire evacuation orde...
4,got send photo Ruby Alaska smoke wildfire pour...


### Split training data into train and testing data

In [25]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, df['target'], test_size = 0.2, random_state = 42)

In [28]:
print(f'Training size is: {x_train.shape}')
print(f'Testing size is: {x_test.shape}')

Training size is: (6090, 1)
Testing size is: (1523, 1)


### Preprocessing train data 2 (converting text data into usefull form): using Vectorizers

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [30]:
# We will use X instead of x_train, as there might be some keywords in X which are missing in x_train
vectorizer.fit(X['text'])

In [31]:
train_input_list = x_train['text'].tolist()
test_input_list = x_test['text'].tolist()

In [38]:
print(x_train.shape)

(6090, 1)


In [41]:
print(train_input_list[0])

courageous honest analysis need use Atomic Bomb 1945 Hiroshima70 japanese military refuse surrender 


In [48]:
train_transformed_data = vectorizer.transform(train_input_list)
test_transformed_data = vectorizer.transform(test_input_list)

In [61]:
type(train_transformed_data)

scipy.sparse._csr.csr_matrix

## Model building and training

### Multinomial Naive Bayes

In [68]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [50]:
nb.fit(train_transformed_data, y_train)

In [53]:
y_predicted = nb.predict(test_transformed_data)

In [54]:
y_predicted

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

In [56]:
y_test.to_numpy()

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

In [60]:
misclassified = np.sum(y_test != y_predicted)
print(f"Accuracy of Naive Bayes on Test: {(len(y_predicted) - misclassified)*100/len(y_predicted)}")

Accuracy of Naive Bayes on Test: 79.25147734734078


### SVM

In [75]:
from sklearn.svm import SVC
clf = SVC(kernel = 'poly')
clf.fit(train_transformed_data, y_train)

In [76]:
y_predicted = clf.predict(test_transformed_data)

In [77]:
y_predicted

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

In [71]:
misclassified = np.sum(y_test != y_predicted)
print(f"Accuracy of SVM on Test: {(len(y_predicted) - misclassified)*100/len(y_predicted)}")

Accuracy of SVM on Test: 69.59947472094551


### Random Forest Classifier

In [78]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(train_transformed_data, y_train)

In [79]:
y_predicted = clf.predict(test_transformed_data)

In [80]:
y_predicted

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [81]:
misclassified = np.sum(y_test != y_predicted)
print(f"Accuracy of SVM on Test: {(len(y_predicted) - misclassified)*100/len(y_predicted)}")

Accuracy of SVM on Test: 59.48785292186474


### Gradient Classifier

In [82]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)
clf.fit(train_transformed_data, y_train)

In [83]:
y_predicted = clf.predict(test_transformed_data)

In [84]:
y_predicted

array([1, 0, 0, ..., 1, 1, 0], dtype=int64)

In [85]:
misclassified = np.sum(y_test != y_predicted)
print(f"Accuracy of Gradient Boost on Test: {(len(y_predicted) - misclassified)*100/len(y_predicted)}")

Accuracy of Gradient Boost on Test: 77.28168089297439


Gradient Boosting Classifier has performed better than any other ML classifier.