# Introduction
***
We will make a model that can predict wheter it is real disaster (1) or fake disaster (0) given textual data based on tweets

# Dataset
***
Dataset URL source: https://www.kaggle.com/c/nlp-getting-started/overview <br>
Dataset name: **Natural Language Processing with Disaster Tweets** <br>
Dataset class: **0** for fake disaster || **1** real disaster

# Import Libraries

In [1]:
# for data processing
import pandas as pd
import numpy as np

#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec

# Load Dataset

In [2]:
df_train = pd.read_csv('dataset/train.csv')

## Inspect the train data

### See overall data

In [3]:
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


### Check number of missing values

In [4]:
df_train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

### Check class proportions

In [5]:
df_train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

It seems like unbalanced dataset. Number of **fake disaster** is greater than **real disaster**

# Data Preprocessing
***
This step includes:
1. Remove punctuations and special characters
2. cleaning texts
3. Removing stop word
4. Lemmatization

## Cleaning text
***
1. Remove punctuations
2. Remove special characters
3. Remove URL and hashtags
4. Remove leading, trailing, and extra whitespace, tabs, and enter (if any)
5. Typos
6. Slang (in english) are corrected
7. Abbreviations are written in their long forms

In [6]:
def cleaning_text(text):
    text = text.lower()
    text = text.strip()
    text=re.compile('<.*?>').sub('', text)
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return str(text)

## Stopword removal

In [7]:
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

## Lemmatization

In [8]:
# initialize the lemmatizer
word_lemmatizer = WordNetLemmatizer()

# define a function for mapping NLTK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def lemmatizer(string):
    # Get position tags
    word_pos_tags = nltk.pos_tag(word_tokenize(string))
    # Map the position tag and lemmatize the word/token
    a=[word_lemmatizer.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)]
    return " ".join(a)

## Final Preprocessing

In [9]:
def final_preprocessing(string):
    return lemmatizer(stopword(cleaning_text(string)))

In [10]:
df_train['clean_text'] = df_train['text'].apply(
    lambda x: final_preprocessing(x)
)

In [11]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,get sent photo ruby alaska smoke wildfires pou...


# Convert textual data into numerical data (vectorization)

In [12]:
# first we will split df_train into training (80) and validation (20) set
X_train, X_val, y_train, y_val =  train_test_split(df_train["clean_text"],df_train["target"],test_size=0.2,shuffle=True)

In [13]:
X_train

2055        dangerous ok rest u danger https co yl dkf tb
1526    syringetoanger åç probably even disagreements ...
6924    live update boyd get trouble th http co ugfpwm...
4655    inundate westeros storm sword book tape amp fi...
5463    slosheriff south gate police officer huntingto...
                              ...                        
2706    ignition knock detonation sensor senso standar...
3087                                     let go passenger
2409    epic insanity get derailed outside grimrail depot
2240    tomorrow internet day almost month look forwar...
5641            wowo nigerian refugee repatriate cameroon
Name: clean_text, Length: 6090, dtype: object

In [14]:
y_train

2055    0
1526    0
6924    0
4655    0
5463    0
       ..
2706    0
3087    0
2409    1
2240    0
5641    0
Name: target, Length: 6090, dtype: int64

In [15]:
X_val

7431    national free root beer float day tomorrow amp...
4243    skinny jean hazardous health socialnews http c...
3337    dead miss family evacuate due flood bukidnon h...
7252    navy charge officer weapon violation chattanoo...
4155    standardized test harm child color http co id ...
                              ...                        
1436                      become another casualty society
7005                    listen llegaste tu twister el rey
5117    finnish nuclear plant move ahead financing sec...
3085    need u delete start cry computer electrocute h...
6226    mental twitter note make sure smoke alarm batt...
Name: clean_text, Length: 1523, dtype: object

## Tokenization

In [16]:
X_train_tokenize = [nltk.word_tokenize(i) for i in X_train]
X_train_tokenize

[['dangerous', 'ok', 'rest', 'u', 'danger', 'https', 'co', 'yl', 'dkf', 'tb'],
 ['syringetoanger',
  'åç',
  'probably',
  'even',
  'disagreements',
  'general',
  'ross',
  'catastrophic',
  'occurrence',
  'make',
  'something',
  'clear',
  'åè'],
 ['live',
  'update',
  'boyd',
  'get',
  'trouble',
  'th',
  'http',
  'co',
  'ugfpwmy',
  'x',
  'via',
  'detroitnews'],
 ['inundate',
  'westeros',
  'storm',
  'sword',
  'book',
  'tape',
  'amp',
  'finish',
  'game',
  'throne',
  'know',
  'starks',
  'greyjoys'],
 ['slosheriff',
  'south',
  'gate',
  'police',
  'officer',
  'huntington',
  'park',
  'officer',
  'arrest',
  'child',
  'abuse',
  'investigation',
  'boot',
  'camp'],
 ['sinkhole',
  'selfies',
  'wont',
  'believe',
  'brooklyn',
  'sinkhole',
  'http',
  'co',
  'glyoyf',
  'oc'],
 ['afghan',
  'conflict',
  'see',
  'sharp',
  'rise',
  'female',
  'casualty',
  'http',
  'co',
  'hcywrwn',
  'l',
  'http',
  'co',
  'twxz',
  'vxbx'],
 ['landslide', 'trip

## TF-IDF

In [17]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val)

In [18]:
X_train_vectors_tfidf.shape

(6090, 17362)

In [19]:
X_val_vectors_tfidf.shape

(1523, 17362)

In [20]:
X_train_vectors_tfidf.todense().shape

(6090, 17362)

In [21]:
# make dataframe from sparse matrix into dense matrix
feature_columns_tokenize = tfidf_vectorizer.get_feature_names_out()
df_X_train_tfidf = pd.DataFrame(columns = feature_columns_tokenize, data=X_train_vectors_tfidf.todense())
df_X_val_tfidf = pd.DataFrame(columns = feature_columns_tokenize, data=X_val_vectors_tfidf.todense())

In [22]:
df_X_train_tfidf.head()

Unnamed: 0,aa,aaaa,aaaaaaallll,aace,aag,aal,aan,aar,aatt,aauizggc,...,ûïwe,ûïwhen,ûïyou,ûò,ûòthe,ûòåêcnbc,ûó,ûóher,ûókody,ûûif
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df_X_val_tfidf.head()

Unnamed: 0,aa,aaaa,aaaaaaallll,aace,aag,aal,aan,aar,aatt,aauizggc,...,ûïwe,ûïwhen,ûïyou,ûò,ûòthe,ûòåêcnbc,ûó,ûóher,ûókody,ûûif
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modelling using Benchmark Machine Learning Model

In [24]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(df_X_train_tfidf,y_train)

KNeighborsClassifier()

In [25]:
y_pred = knn.predict(df_X_val_tfidf)

In [26]:
confusion_matrix(y_val, y_pred)

array([[737, 124],
       [227, 435]], dtype=int64)

In [27]:
accuracy_score(y_pred, y_val)

0.7695338148391333

In [28]:
f1_score(y_pred, y_val)

0.7125307125307125