In [37]:
import numpy as np
import pandas as pd 
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer , TreebankWordTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score
import pickle

In [13]:
#Do these steps when we r not using spacy for NLP
# df_train = pd.read_pickle('data/data_train.pkl')
# df_train.info()

# #change above X if we r using tokenize and other nlp process
# X = df_train['content']
# y = df_train['label']

# tfidf_vectorizer  = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, ngram_range=(1,2), max_df= 0.85, min_df= 2)
# X_tfidf = tfidf_vectorizer.fit_transform(X)

In [42]:
with open('data/X_train_tfidf.pkl', 'rb') as input:
    X_tfidf = pickle.load(input)
    
with open('data/y_train_spacy.pkl', 'rb') as input:
    y = pickle.load(input)

## Lets do Grid Search for Logistic Regression

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
# define the parameter values that should be searched
penalty = ['l1', 'l2']
C =  [0.1, 1, 10]

lr = LogisticRegression()
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(penalty=penalty, C=C)

In [45]:
# instantiate and fit the grid
grid_lr = GridSearchCV(lr, param_grid, cv=5, scoring='f1', return_train_score=False)
grid_lr.fit(X_tfidf, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1', verbose=0)

In [46]:
# examine the best model
print(grid_lr.best_score_)
print(grid_lr.best_params_)

0.8565290404960633
{'C': 10, 'penalty': 'l1'}


In [47]:
df_lr = pd.DataFrame({'Model_Name': 'LR', 'best_score':grid_lr.best_score_, 'best_params':grid_lr.best_params_})

## Lets do Grid Search for Random Forest

In [48]:
n_estimators = [50, 100, 150]
max_depth = [10, 25, 50]
min_samples_split = [2, 5, 10]

param_grid = dict(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)

In [49]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
grid_rf = GridSearchCV(rf, param_grid, cv=5, scoring='f1', return_train_score=False)
grid_rf.fit(X_tfidf, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50, 100, 150], 'max_depth': [10, 25, 50], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1', verbose=0)

In [50]:
# run from here -->> 
# examine the best model
print(grid_rf.best_score_)
print(grid_rf.best_params_)

0.8365803431020266
{'max_depth': 50, 'min_samples_split': 5, 'n_estimators': 150}


In [51]:
df_rf = pd.DataFrame({'Model_Name': 'RF', 'best_score':grid_rf.best_score_, 'best_params':grid_rf.best_params_})

## Lets do Grid Search for Adaboost

In [52]:
from sklearn.ensemble import AdaBoostClassifier

In [53]:
n_estimators_ada = [50, 100]
algorithm_ada =  ['SAMME', 'SAMME.R']
param_grid_ada = dict(n_estimators = n_estimators_ada, algorithm=algorithm_ada)

In [54]:
# instantiate and fit the grid
ada = AdaBoostClassifier()
grid_ada = GridSearchCV(ada, param_grid_ada, cv=5, scoring='f1', return_train_score=False)
grid_ada.fit(X_tfidf, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50, 100], 'algorithm': ['SAMME', 'SAMME.R']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1', verbose=0)

In [34]:
print(grid_ada.best_score_)
print(grid_ada.best_params_)

0.8743685274658509
{'algorithm': 'SAMME.R', 'n_estimators': 100}


In [32]:
df_ada = pd.DataFrame({'Model_Name': 'ADA', 'best_score':grid_ada.best_score_, 'best_params':grid_ada.best_params_})

### Lets do Grid Search for SVM

In [55]:
#start from here
from sklearn.svm import SVC

In [56]:
C_svm = [0.1, 1, 10]
kernel_svm =  ['rbf', 'linear']
param_grid_svm = dict(kernel = kernel_svm, C=C_svm)

In [57]:
# instantiate and fit the grid
svm = SVC()
grid_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='f1', return_train_score=False)
grid_svm.fit(X_tfidf, y)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'kernel': ['rbf', 'linear'], 'C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1', verbose=0)

In [59]:
print(grid_svm.best_score_)
print(grid_svm.best_params_)

0.8498150642775673
{'C': 1, 'kernel': 'linear'}


In [60]:
df_svm = pd.DataFrame({'Model_Name': 'SVM', 'best_score':grid_svm.best_score_, 'best_params':grid_svm.best_params_})

In [61]:
#start from here
from sklearn.ensemble import GradientBoostingClassifier

In [62]:
n_estimators_gb = [10, 50, 100]
learning_rate_gb =  [0.1, 1, 10]
max_depth_gb = [10, 25, 50]
subsample_gb = [0.1, 0.5]
param_grid_gb = dict(n_estimators = n_estimators_gb, learning_rate=learning_rate_gb, max_depth=max_depth_gb, subsample=subsample_gb)

In [63]:
gb = GradientBoostingClassifier()

In [64]:
grid_gb = GridSearchCV(gb, param_grid_gb, cv=5, scoring='f1', return_train_score=False)
grid_gb.fit(X_tfidf, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100], 'learning_rate': [0.1, 1, 10], 'max_depth': [10, 25, 50], 'subsample': [0.1, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1', verbose=0)

In [67]:
print(grid_gb.best_score_)
print(grid_gb.best_params_)

0.8536186343480274
{'learning_rate': 0.1, 'max_depth': 50, 'n_estimators': 100, 'subsample': 0.5}


In [68]:
df_gb = pd.DataFrame({'Model_Name': 'GD', 'best_score':grid_gb.best_score_, 'best_params':grid_gb.best_params_})

In [69]:
df_all = pd.concat([df_lr, df_rf, df_ada, df_svm, df_gb], axis=0)
df_all.to_csv('gridsearch_result.csv', sep='\t', encoding='utf-8')