In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
import pickle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [3]:
df = pd.read_csv('./datasets/kcbsamfmtraffic.csv')
block = ['block','crash','debris','crawl','jammed','emergency','accident','closed']

def classify(x):
    return bool([word for word in block if(word in x.lower())])

df['class'] = df['tweet'].astype(str).map(classify)
df['class'].value_counts(normalize=True)
df.to_csv('./datasets/kcbsamfmtraffic_labeled.csv',index=False)
df.dropna(inplace=True)
X = df['tweet']
y = df['class']
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y,random_state=123)
pipe = Pipeline([
    ('tvec',TfidfVectorizer(stop_words='english')),
    ('lr',LogisticRegression())
])

params = {
    'tvec__max_features':[500],
    'tvec__min_df':[100,150,200],
    'tvec__max_df':[.9],
    'tvec__ngram_range':[(1,1),(1,2)],
    'tvec__use_idf':[False,True]
}

gs = GridSearchCV(pipe,params,cv=5)
gs.fit(X_train,y_train)
print(f'Train score: {gs.best_score_} ')
print(f'Test score: {gs.best_estimator_.score(X_test,y_test)}')





Train score: 0.9618463180362861 
Test score: 0.9752




In [4]:
filename = 'predictor.sav'
pickle.dump(gs.best_estimator_, open(filename, 'wb'))