### Baseline text classification

We will use methods like the count vectorizer and the TF-IDF vectorizer to build our baseline models.

In [0]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so the dataset under
# "My Drive" can be read through the regular filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

### Data Preparation

In [0]:
# Folder (on the mounted Drive) that holds the workshop data.
data_path = "/content/drive/My Drive/Datahack NLP Workshop/Disaster/"
tweets_csv = data_path + "socialmedia_disaster_tweets.csv"

# The file is read as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
df = pd.read_csv(tweets_csv, encoding='iso-8859-1')
df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,keyword,location,text,tweetid,userid
0,778243823,True,golden,156,,Relevant,1.0,Relevant,,,Just happened a terrible car crash,1.0,
1,778243824,True,golden,152,,Relevant,1.0,Relevant,,,Our Deeds are the Reason of this #earthquake M...,13.0,
2,778243825,True,golden,137,,Relevant,1.0,Relevant,,,"Heard about #earthquake is different cities, s...",14.0,
3,778243826,True,golden,136,,Relevant,0.9603,Relevant,,,"there is a forest fire at spot pond, geese are...",15.0,
4,778243827,True,golden,138,,Relevant,1.0,Relevant,,,Forest fire near La Ronge Sask. Canada,16.0,


In [0]:
# Keep only the annotation and the tweet body, exposing the annotation as "label".
# Renaming *before* selecting makes this cell idempotent: on a re-run the
# "choose_one" column is already gone, `rename` silently skips the missing key,
# and the selection of ["label", "text"] still succeeds (the previous version
# raised KeyError: 'choose_one' when executed twice).
df = df.rename(columns={"choose_one": "label"})[["label", "text"]]
df.head()

Unnamed: 0,label,text
0,Relevant,Just happened a terrible car crash
1,Relevant,Our Deeds are the Reason of this #earthquake M...
2,Relevant,"Heard about #earthquake is different cities, s..."
3,Relevant,"there is a forest fire at spot pond, geese are..."
4,Relevant,Forest fire near La Ronge Sask. Canada


In [0]:
# Class balance of the raw annotations.
label_counts = df["label"].value_counts()
label_counts

Not Relevant    6187
Relevant        4673
Can't Decide      16
Name: label, dtype: int64

In [0]:
# Drop every row whose label is not one of the two target classes, then
# renumber the index so positional and label-based indexing agree.
is_binary_label = df["label"].isin(["Relevant", "Not Relevant"])
df = df.loc[is_binary_label].reset_index(drop=True)
df["label"].value_counts()

Not Relevant    6187
Relevant        4673
Name: label, dtype: int64

### Baseline Model

In [0]:
# Binary target: 1 = "Relevant" (tweet is about a real disaster), 0 = "Not Relevant".
label_map = {"Not Relevant":0, "Relevant":1}
y = df["label"].map(label_map).values

# Bag-of-words features: raw counts of unigrams and bigrams.
# NOTE: despite its name, `tfidf_vec` holds a CountVectorizer in this cell.
# TODO: apply pre-processing, stemming, lemmatisation; try max_df, min_df, max_features.
tfidf_vec = CountVectorizer(ngram_range=(1, 2))
X = tfidf_vec.fit_transform(df["text"])

In [0]:
# 5-fold cross-validation: every row receives a predicted probability of the
# positive class from a model that did not see that row during fitting
# (out-of-fold predictions), collected in `cv_preds`.
kf = KFold(n_splits=5, shuffle=True, random_state=2019)
cv_preds = np.zeros(X.shape[0])
for dev_index, val_index in kf.split(X):
    dev_X, val_X = X[dev_index,:], X[val_index,:]
    dev_y, val_y = y[dev_index], y[val_index]

    # Logistic regression trained with SGD. SGD shuffles the training data, so
    # the solver is seeded to make the reported metrics reproducible run to run.
    # NOTE: scikit-learn >= 1.1 names this loss "log_loss" ("log" was removed
    # in 1.3); update the string when running on a newer version.
    clf = SGDClassifier(loss="log", random_state=2019)
    clf.fit(dev_X, dev_y)
    val_preds = clf.predict_proba(val_X)[:,1]  # column 1 = P(label == 1)
    cv_preds[val_index] = val_preds

In [0]:
from sklearn import metrics

# Area under the ROC curve of the out-of-fold probabilities.
cv_auc = metrics.roc_auc_score(y, cv_preds)
cv_auc

0.8604140392118097

In [0]:
# Threshold the out-of-fold probabilities at 0.5 to get hard class predictions.
cv_is_positive = cv_preds > 0.5
print(metrics.classification_report(y, cv_is_positive))

              precision    recall  f1-score   support

           0       0.79      0.88      0.83      6187
           1       0.81      0.69      0.75      4673

    accuracy                           0.80     10860
   macro avg       0.80      0.79      0.79     10860
weighted avg       0.80      0.80      0.80     10860



In [0]:
from sklearn.naive_bayes import BernoulliNB

# Same 5-fold out-of-fold scheme as before, this time with a Bernoulli
# Naive Bayes model fitted on each development split.
kf = KFold(n_splits=5, shuffle=True, random_state=2019)
n_rows = X.shape[0]
cv_preds = np.zeros(n_rows)
for dev_index, val_index in kf.split(X):
    dev_X, dev_y = X[dev_index,:], y[dev_index]
    val_X, val_y = X[val_index,:], y[val_index]

    clf = BernoulliNB().fit(dev_X, dev_y)
    # Column 1 of predict_proba is the probability of class 1.
    val_preds = clf.predict_proba(val_X)[:,1]
    cv_preds[val_index] = val_preds

In [0]:
from sklearn import metrics

# Area under the ROC curve of the out-of-fold probabilities.
cv_auc = metrics.roc_auc_score(y, cv_preds)
cv_auc

0.855818968491502

In [0]:
# Threshold the out-of-fold probabilities at 0.5 to get hard class predictions.
cv_is_positive = cv_preds > 0.5
print(metrics.classification_report(y, cv_is_positive))

              precision    recall  f1-score   support

           0       0.75      0.95      0.84      6187
           1       0.90      0.58      0.71      4673

    accuracy                           0.79     10860
   macro avg       0.83      0.77      0.77     10860
weighted avg       0.82      0.79      0.78     10860



### DIY - Build Models using TFIDF

In [0]:
# Binary target: 1 = "Relevant", 0 = "Not Relevant".
label_map = {"Not Relevant":0, "Relevant":1}
y = df["label"].map(label_map).values

# TF-IDF features over unigrams and bigrams.
#   min_df=4          -> ignore terms that occur in fewer than 4 tweets
#   max_features=9000 -> cap the vocabulary size
#   binary=True       -> term frequency is clipped to 0/1 before IDF weighting
# TODO: apply pre-processing, stemming, lemmatisation; try max_df.
# NOTE(review): the vectorizer is fitted on every row of df["text"], so any
# cross-validation run on X afterwards uses a vocabulary and IDF weights that
# were computed with the validation rows included.
tfidf_vec = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=4,
    max_features=9000,
    binary=True,
)
X = tfidf_vec.fit_transform(df["text"])

In [0]:
# 5-fold cross-validation: every row receives a predicted probability of the
# positive class from a model that did not see that row during fitting
# (out-of-fold predictions), collected in `cv_preds`.
kf = KFold(n_splits=5, shuffle=True, random_state=2019)
cv_preds = np.zeros(X.shape[0])
for dev_index, val_index in kf.split(X):
    dev_X, val_X = X[dev_index,:], X[val_index,:]
    dev_y, val_y = y[dev_index], y[val_index]

    # Logistic regression trained with SGD. SGD shuffles the training data, so
    # the solver is seeded to make the reported metrics reproducible run to run.
    # NOTE: scikit-learn >= 1.1 names this loss "log_loss" ("log" was removed
    # in 1.3); update the string when running on a newer version.
    clf = SGDClassifier(loss="log", random_state=2019)
    clf.fit(dev_X, dev_y)
    val_preds = clf.predict_proba(val_X)[:,1]  # column 1 = P(label == 1)
    cv_preds[val_index] = val_preds

In [0]:
from sklearn import metrics

# Area under the ROC curve of the out-of-fold probabilities.
cv_auc = metrics.roc_auc_score(y, cv_preds)
cv_auc

0.8671032166013861