In [22]:
import numpy as np
import pandas as pd 
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer , TreebankWordTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score

In [13]:
df_train = pd.read_pickle('data/data_train.pkl')

In [14]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8744 entries, 4075 to 3378
Data columns (total 7 columns):
content       8744 non-null object
id            8744 non-null object
label         8744 non-null int64
media-type    8744 non-null object
published     8744 non-null object
source        8744 non-null object
title         8744 non-null object
dtypes: int64(1), object(6)
memory usage: 546.5+ KB


In [15]:
#change above X if we r using tokenize and other nlp process
X = df_train['content']
y = df_train['label']

In [16]:
tfidf_vectorizer  = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, ngram_range=(1,2), max_df= 0.85, min_df= 2)
X_tfidf = tfidf_vectorizer.fit_transform(X)

## Lets do Grid Search for Logistic Regression

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
# define the parameter values that should be searched
penalty = ['l1', 'l2']
C =  [0.1, 1, 10]

lr = LogisticRegression()
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(penalty=penalty, C=C)

{'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]}


In [20]:
# instantiate and fit the grid
grid_lr = GridSearchCV(lr, param_grid, cv=5, scoring='f1', return_train_score=False)
grid_lr.fit(X_tfidf, y)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1', verbose=0)

In [21]:
# examine the best model
print(grid_lr.best_score_)
print(grid_lr.best_params_)

0.8940456874207461
{'C': 10, 'penalty': 'l1'}


In [44]:
df_lr = pd.DataFrame({'Model_Name': 'LR', 'best_score':grid_lr.best_score_, 'best_params':grid_lr.best_params_})

In [45]:
df_lr

Unnamed: 0,Model_Name,best_params,best_score
C,LR,10,0.960378
penalty,LR,l2,0.960378


## Lets do Grid Search for Random Forest

In [24]:
n_estimators = [50,100, 200]
max_depth = [10, 30, 60]
min_samples_split = [2, 5, 10]

param_grid = dict(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
grid_rf = GridSearchCV(rf, param_grid, cv=5, scoring='f1', return_train_score=False)
grid_rf.fit(X_tfidf, y)

  return f(*args, **kwds)
  from numpy.core.umath_tests import inner1d


In [23]:
# examine the best model
print(grid_rf.best_score_)
print(grid_rf.best_params_)

0.8868332523664839
{'max_depth': 60, 'min_samples_split': 2, 'n_estimators': 100}


In [35]:
df_rf = pd.DataFrame({'Model_Name': 'RF', 'best_score':grid.best_score_, 'best_params':grid_rd.best_params_})

In [53]:
df_new = pd.concat([df_rf, df_lr], axis=0)

In [55]:
df_new.to_csv('gridsearch_result.csv', sep='\t', encoding='utf-8')

## Lets do Grid Search for Adaboost

In [6]:
from sklearn.ensemble import AdaBoostClassifier

In [11]:
n_estimators_ada = [50, 100, 150]
algorithm_ada =  ['SAMME', 'SAMME.R']
param_grid_ada = dict(n_estimators = n_estimators_ada, algorithm=algorithm_ada)

In [12]:
# instantiate and fit the grid
ada = AdaBoostClassifier()
grid_ada = GridSearchCV(ada, param_grid_ada, cv=5, scoring='f1', return_train_score=False)
grid_ada.fit(X_tfidf, y)

In [None]:
df_ada = pd.DataFrame({'Model_Name': 'ADA', 'best_score':grid_ada.best_score_, 'best_params':grid_ada.best_params_})

### Lets do Grid Search for SVM

In [1]:
from sklearn.svm import SVC

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
svm = SVC()

In [None]:
C_svm = [0.1, 1, 10]
kernel_svm =  ['rbf', 'linear']
param_grid_svm = dict(kernel = kernel_svm, C=C_svm)

In [None]:
# instantiate and fit the grid
grid_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='f1', return_train_score=False)