In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import re

from multiprocessing import cpu_count, Pool #for multiprocessing data
cores = cpu_count()

from gensim.models.doc2vec import Doc2Vec, TaggedDocument #for doc2vec modelling
from IPython.display import clear_output

from tqdm import tqdm

import pickle

from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
df = pd.read_pickle('./Cleaned/ejks_cleaned.pkl')[['lemmatized', 'label']]
df.head()

Unnamed: 0,lemmatized,label
0,"[bonding, grief, family, urban, poor, communit...",ejks
1,"[general, santos, city, number, drug, suspect,...",ejks
2,"[brig, gen, debold, sinas, director, national,...",ejks
3,"[politics, eyed, police, say, politics, may, b...",ejks
4,"[general, santos, city, president, rodrigo, du...",ejks


In [3]:
len(df)

371

In [4]:
df1 = pd.read_pickle('./Cleaned/disaster_cleaned.pkl')[['lemmatized', 'label']]
df1.head()

Unnamed: 0,lemmatized,label
0,"[eastern, samar, gone, really, dont, know, sta...",disaster
1,"[market, stall, product, damaged, washed, debr...",disaster
2,"[knew, super, typhoon, strength, really, took,...",disaster
3,"[compared, wind, rain, strong, staed, whistle,...",disaster
4,"[thought, prepared, used, typhoon, know, feel,...",disaster


In [5]:
len(df1)

122

In [6]:
main = df.append(df1, ignore_index=True)

In [7]:
main.label.value_counts()

ejks        371
disaster    122
Name: label, dtype: int64

### Upsampling

In [8]:
from sklearn.utils import resample

In [9]:
df_majority = main[main.label == 'ejks']
df_minority = main[main.label == 'disaster']

In [10]:
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=371,    # to match majority class
                                 random_state=123) # reproducible results

In [11]:
main = pd.concat([df_majority, df_minority_upsampled])

### Doc2Vec

In [13]:
print('Tagging docs...')
docs = [x for x in main['lemmatized'].values]
# docs = [x for x in main_df['content_clean'].values]
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
print('Docs tagged.')
# tqdm.pandas(desc="progress-bar")

print('Training start.')
# epoch_logger = EpochLogger()
epochs = 50
vec_size = 50
alpha = 0.025

print('Initializing Model...')
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=2,
                workers = cores,
                dm = 1, epochs = epochs)
#                 callbacks=[epoch_logger])
                

print('Model Initalized. Training...')
model.build_vocab(documents = documents, progress_per=1)
print('Training complete.')

print('Saving model...')
model.save("models/doc2vec.model")
print("Model Saved")

Tagging docs...
Docs tagged.
Training start.
Initializing Model...
Model Initalized. Training...
Training complete.
Saving model...
Model Saved


In [16]:
vectors = [model.docvecs[i] for i in range(len(main))]
main['vector_d2v'] = vectors

In [17]:
main.head()

Unnamed: 0,lemmatized,label,vector_d2v
0,"[bonding, grief, family, urban, poor, communit...",ejks,"[0.008891293, -0.0074158455, 0.0037703046, 0.0..."
1,"[general, santos, city, number, drug, suspect,...",ejks,"[0.00690979, -0.0047741365, 0.0017735371, -6.6..."
2,"[brig, gen, debold, sinas, director, national,...",ejks,"[0.007832452, -0.002955307, 0.0013261527, -0.0..."
3,"[politics, eyed, police, say, politics, may, b...",ejks,"[0.007552774, -0.0032720529, 0.002190555, 0.00..."
4,"[general, santos, city, president, rodrigo, du...",ejks,"[-0.0059859795, -0.004373776, 0.002142897, -0...."


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [16]:
def get_X(col):
    li = []
    for i in col:
        li.append(i.reshape(1, -1))
    return np.concatenate(li, axis = 0)

In [17]:
X_sub = main['vector_d2v']
X = get_X(X_sub)
y = main['label']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=49)

## 1.) SVM

In [19]:
clf = svm.SVC()
clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [20]:
y_pred_svm = clf.predict(X_test)

In [21]:
print(classification_report(y_pred_svm, y_test))

              precision    recall  f1-score   support

    disaster       0.58      0.53      0.55        80
        ejks       0.50      0.55      0.52        69

    accuracy                           0.54       149
   macro avg       0.54      0.54      0.54       149
weighted avg       0.54      0.54      0.54       149



In [22]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_svm))

Accuracy: 0.5369127516778524


## 2.) Random Forest

In [23]:
rf = RandomForestClassifier(max_depth=50, random_state=0)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=50, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [24]:
y_pred_rf = rf.predict(X_test)

In [25]:
print(classification_report(y_pred_rf, y_test))

              precision    recall  f1-score   support

    disaster       0.59      0.54      0.56        80
        ejks       0.51      0.57      0.54        69

    accuracy                           0.55       149
   macro avg       0.55      0.55      0.55       149
weighted avg       0.55      0.55      0.55       149



In [26]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_rf))

Accuracy: 0.5503355704697986
