In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import confusion_matrix

In [2]:
# Load the labelled text dataset (path is relative to the notebook's working
# directory) and preview the first rows.
path = "./data/fin_df_103019.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,text,hashtags,user_name,date,user_location,label
0,Congrats pre,,MasterAR9012502,2019-10-24 15:05:01+00:00,,0
1,Wassup?,,theprejon,2019-10-21 20:54:33+00:00,,0
2,"Ini mbc korea yang komen orang indo semueh,, b...",,KhoyrR,2019-10-24 14:19:01+00:00,,0
3,Meron pre?,,Zendriccccc,2019-10-26 13:00:29+00:00,,0
4,this is their pre-p3p5 fam love story thanks,,akihikologist,2019-10-21 02:44:47+00:00,,0


In [3]:
# Count rows whose "text" value is missing.
df["text"].isna().sum()

1

In [4]:
# Drop rows with no text. Reassign instead of using inplace=True: the result is
# the same frame contents, but the cell no longer mutates an object that earlier
# cells (or other references) may still be looking at.
df = df.dropna(subset=["text"])

In [5]:
# Class balance of the "label" column.
df["label"].value_counts()

0    1839
1     804
2      84
Name: label, dtype: int64

In [6]:
# Keep every row whose label is not 2, leaving a two-class problem.
# NOTE(review): what label 2 denotes is not documented in this notebook - confirm.
df = df.loc[df["label"].ne(2)]

In [7]:
# Two-step model: bag-of-words counts ("cvec") feeding multinomial Naive Bayes ("nb").
# The step names are the prefixes used to address hyper-parameters in a grid search.
pipe = Pipeline(
    steps=[
        ("cvec", CountVectorizer()),
        ("nb", MultinomialNB()),
    ]
)

In [8]:
# scikit-learn's English stop words plus project-specific noise tokens
# (URL fragments, tracking parameters, and topic words).
# ENGLISH_STOP_WORDS is a frozenset, so list(...) would give an ordering that can
# change between kernel sessions; taking the union and sorting yields a
# deterministic, de-duplicated list containing exactly the same words.
stop_word_list = sorted(ENGLISH_STOP_WORDS.union([
    'fire', 'firefighters',
    'twitter', 'pic', 'www', 'http', 'https', 'massive',
    'wildfire', 'busted', 'burns', 'patch', 'california',
    'utm_campaign', 'com', 'ho', 'bad', 'new', 'round', '3a',
    'trueanthem', 'content', 'content_utm',
]))

In [9]:
# Hyper-parameter grid; the "cvec__" prefix addresses the pipeline step named "cvec".
# 2 n-gram ranges x 2 stop-word choices x 3 vocabulary caps = 12 combinations.
params = {
    "cvec__ngram_range": [(1,1), (1,2)],
    "cvec__stop_words": [stop_word_list, "english"],
    "cvec__max_features": [5000, 7000, 9000]
}

In [10]:
# Features are the raw text column; the target is the label column.
X = df["text"]
y = df["label"]

In [11]:
# Sanity check: number of feature rows.
X.shape

(2643,)

In [12]:
# Sanity check: number of target rows (should match X).
y.shape

(2643,)

In [13]:
# Hold out a test set using train_test_split's default split size.
# The seed is fixed so the split is reproducible; what_score() re-creates the
# split with the same seed (740), so the two must stay in sync.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=740)

In [14]:
# Sanity check: size of the training features.
X_train.shape

(1982,)

In [15]:
# Sanity check: size of the training target (should match X_train).
y_train.shape

(1982,)

In [16]:
# Sanity check: size of the held-out features.
X_test.shape

(661,)

In [17]:
# Sanity check: size of the held-out target (should match X_test).
y_test.shape

(661,)

In [18]:
# 5-fold cross-validated search over every combination in `params`.
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=5)

In [19]:
# Run the grid search on the training split only; the test split stays unseen.
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [20]:
# Score of the fitted search on the training split (no `scoring` was passed to
# GridSearchCV, so this is the estimator's default score).
gs.score(X_train, y_train)

0.9369323915237134

In [21]:
# Same score on the held-out split; compare with the training score to gauge overfitting.
gs.score(X_test, y_test)

0.8502269288956127

In [22]:
def what_score(your_model, your_X, your_y, random_state=740, cv=5):
    """
    Summarise how an already-fitted binary classifier performs.

    The data is re-split here with ``train_test_split``. The reported train and
    test scores are only meaningful when ``your_model`` was fitted on the
    training part of a split made with the same ``random_state``; otherwise
    "test" rows may have been seen during fitting.

    Parameters
    ----------
    your_model : fitted estimator
        Must provide ``score`` and ``predict`` and be accepted by
        ``cross_val_score``.
    your_X : array-like
        Features, passed to ``train_test_split``.
    your_y : array-like
        Target with exactly two classes. The larger label (in sorted order)
        is treated as the positive class.
    random_state : int, default 740
        Seed for the internal train/test split.
    cv : int, default 5
        Number of folds for ``cross_val_score``.

    Returns
    -------
    dict
        Keys ``'train'``, ``'test'``, ``'cv score'``, ``'sensitivity'`` and
        ``'specificity'``. A rate whose denominator is zero (no positives or
        no negatives in the test split) is reported as NaN.

    Raises
    ------
    ValueError
        If ``your_y`` does not contain exactly two distinct classes.
    """
    # Fix the label order up front so the confusion matrix is always 2x2, even
    # if the test split happens to contain only one of the classes.
    labels = np.unique(your_y)
    if len(labels) != 2:
        raise ValueError(
            "what_score needs a binary target; found %d classes" % len(labels)
        )

    X_train, X_test, y_train, y_test = train_test_split(
        your_X, your_y, random_state=random_state
    )
    train_score = your_model.score(X_train, y_train)
    test_score = your_model.score(X_test, y_test)
    # cross_val_score refits a clone of the estimator on each fold; for a
    # GridSearchCV this repeats the whole search per fold, so it can be slow.
    cv_score = cross_val_score(your_model, X_train, y_train, cv=cv).mean()

    preds = your_model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, preds, labels=labels).ravel()

    # Guard the denominators so an empty class gives NaN without a warning.
    sens = tp / (tp + fn) if (tp + fn) else float("nan")
    spec = tn / (tn + fp) if (tn + fp) else float("nan")

    out = {'train': train_score, 'test': test_score,
           "cv score": cv_score, "sensitivity": sens,
           "specificity": spec}

    return out

In [23]:
# Predicted labels for the held-out split.
preds = gs.predict(X_test)

In [24]:
# Rows are true labels, columns are predicted labels (scikit-learn convention).
confusion_matrix(y_test, preds)

array([[387,  77],
       [ 22, 175]])

In [25]:
# Full metric summary. what_score re-splits X, y with the same seed (740) used
# for the split gs was fitted on, so its "test" rows are genuinely unseen.
what_score(gs, X, y)

{'train': 0.9369323915237134,
 'test': 0.8502269288956127,
 'cv score': 0.8279622420680356,
 'sensitivity': 0.8883248730964467,
 'specificity': 0.834051724137931}