In [14]:
import numpy as np
import pandas as pd 
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer , TreebankWordTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score






In [15]:
df_train = pd.read_csv('data/train_kaggle.csv', sep=',', encoding='utf-8')

In [16]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
id        20800 non-null int64
title     20242 non-null object
author    18843 non-null object
text      20761 non-null object
label     20800 non-null int64
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [17]:
df_train['text'].apply(type).unique()
# explore more why the text type is coming as float

print("Before removing float type: ",df_train.shape)
# only remove rows with text as float type

df_train = df_train.drop(df_train[df_train['text'].apply(type) == float].index)
print("After removing float type: ", df_train.shape)

Before removing float type:  (20800, 5)
After removing float type:  (20761, 5)


In [18]:
#change above X if we r using tokenize and other nlp process
#Dropping the Nan values and info
df_train.dropna(inplace=True)
X = df_train['text']
y = df_train['label']

# do the similar think on headline (author ?? or source)
# name these to test , validation
X_train,  X_test,  y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=12345)

In [19]:
tfidf_vectorizer  = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, ngram_range=(1,2), max_df= 0.85, min_df= 2)

In [None]:
#takes around 2-3 mins
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

## Lets do Grid Search for Logistic Regression

In [10]:
from sklearn.model_selection import GridSearchCV


In [39]:
# define the parameter values that should be searched
penalty = ['l1', 'l2']
C =  [0.1,1,10]

lr = LogisticRegression()
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(penalty=penalty, C=C)
print(param_grid)


{'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]}


In [40]:
# instantiate and fit the grid
grid_lr = GridSearchCV(lr, param_grid, cv=5, scoring='f1', return_train_score=False)

In [41]:
#9.43 -> 2 min run time
grid_lr.fit(X_train_tfidf, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1', verbose=0)

In [42]:
# examine the best model
print(grid_lr.best_score_)
print(grid_lr.best_params_)

0.9603780218159874
{'C': 10, 'penalty': 'l2'}


In [44]:
df_lr = pd.DataFrame({'Model_Name': 'LR', 'best_score':grid_lr.best_score_, 'best_params':grid_lr.best_params_})

In [45]:
df_lr

Unnamed: 0,Model_Name,best_params,best_score
C,LR,10,0.960378
penalty,LR,l2,0.960378


In [None]:
#make a df of Model Name, F1 Score, and dict of parameters

## Lets do Grid Search for Random Forest

In [18]:
n_estimators = [50,100, 200]
max_depth = [10, 30, 60]
min_samples_split = [2, 5, 10]

param_grid = dict(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)
print(param_grid)

{'n_estimators': [50, 100, 200], 'max_depth': [10, 30, 60], 'min_samples_split': [2, 5, 10]}


In [20]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [21]:
# instantiate and fit the grid
grid = GridSearchCV(rf, param_grid, cv=5, scoring='f1', return_train_score=False)

In [22]:
# 9.54 -> 11.54
grid.fit(X_train_tfidf, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50, 100, 200], 'max_depth': [10, 30, 60], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1', verbose=0)

In [23]:
# examine the best model
print(grid.best_score_)
print(grid.best_params_)



0.8868332523664839
{'max_depth': 60, 'min_samples_split': 2, 'n_estimators': 100}


In [31]:
str(grid.best_params_)

"{'max_depth': 60, 'min_samples_split': 2, 'n_estimators': 100}"

In [35]:
df = pd.DataFrame({'Model_Name': 'RF', 'best_score':grid.best_score_, 'best_params':grid.best_params_})

In [53]:
df_new = pd.concat([df, df_lr], axis=0)

In [54]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, max_depth to penalty
Data columns (total 3 columns):
Model_Name     5 non-null object
best_params    5 non-null object
best_score     5 non-null float64
dtypes: float64(1), object(2)
memory usage: 160.0+ bytes


In [55]:
df_new.to_csv('gridsearch_result.csv', sep='\t', encoding='utf-8')

## Lets do Grid Search for Adaboost

In [6]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier()
import pandas as pd


In [11]:
n_estimators_ada = [50, 100, 150]
algorithm_ada =  ['SAMME', 'SAMME.R']
param_grid_ada = dict(n_estimators = n_estimators_ada, algorithm=algorithm_ada)

In [12]:
# instantiate and fit the grid
grid_ada = GridSearchCV(ada, param_grid_ada, cv=5, scoring='f1', return_train_score=False)

In [None]:
#2.40
grid_ada.fit(X_train_tfidf, y_train)

In [None]:
df_ada = pd.DataFrame({'Model_Name': 'ADA', 'best_score':grid_ada.best_score_, 'best_params':grid_ada.best_params_})

### Lets do Grid Search for SVM

In [1]:
from sklearn.svm import SVC

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
svm = SVC()

In [None]:
C_svm = [0.1, 1, 10]
kernel_svm =  ['rbf', 'linear']
param_grid_svm = dict(kernel = kernel_svm, C=C_svm)

In [None]:
# instantiate and fit the grid
grid_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='f1', return_train_score=False)