In [5]:
import pandas as pd
import numpy as np
import pickle

In [6]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Import Dataframe

In [7]:
# Load pickled dataframe

file = 'pickles/features_04_30.pkl'

with open(file, 'rb') as f:
    df_features = pickle.load(f)

In [8]:
df_features.head()

Unnamed: 0,"(., '')","(PRP, VBD)",``,"(,, '')","('', PRP, VBD)","(,, ADP)","(NOUN, VBD)",VBD,"(VBD, .)","(VBD, ,)",...,"('', PRP)","(,, VBD)",source,readability_SMOG,avg_sent_len,Num_Sentences,sd_sent_len,norm_stop_freq,norm_punct_freq,norm_funct_freq
0,0.0,0.0,0.0,0.0,0.0,0.013333,0.013333,0.013158,0.0,0.0,...,0.0,0.0,economist,11.2,20.666667,3,11.841546,0.421875,0.234375,0.390625
1,0.0,0.0,0.0,0.0,0.0,0.013514,0.0,0.013333,0.0,0.0,...,0.0,0.0,economist,10.7,17.0,4,3.391165,0.5,0.132353,0.470588
2,0.0,0.0,0.0,0.0,0.0,0.012346,0.0,0.0,0.0,0.0,...,0.0,0.0,economist,5.7,13.8,5,5.844656,0.422535,0.197183,0.43662
3,0.0,0.009346,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,...,0.0,0.0,economist,11.6,17.8,5,7.62627,0.422222,0.255556,0.411111
4,0.0,0.0,0.013889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,economist,13.6,18.666667,3,7.408704,0.362069,0.327586,0.344828


# A bunch of classifiers

In [9]:
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support,accuracy_score,f1_score,precision_score,recall_score
from xgboost import XGBClassifier



In [11]:
# prepare models
models = []
models.append(('LR', LogisticRegression(multi_class='multinomial',solver='lbfgs')))
#models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
#models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('XGBoost',XGBClassifier( n_estimators=1000)))
#models.append(('SVM', SVC(kernel = 'linear')))

In [12]:
# X -> features, y -> label
X=df_features.loc[:, df_features.columns != 'source']

#X=df_features[['readability_SMOG','avg_sent_len','sd_sent_len','norm_stop_freq','norm_punct_freq','norm_funct_freq']]
y=df_features.source
print(len(X))

52812


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.3, random_state = 0)

val_accuracy_all={}
val_fscore_all={}
val_precision_all={}
val_recall_all={}

train_accuracy_all={}
train_fscore_all={}
train_precision_all={}
train_recall_all={}

for name,model in models:
    
    print('Training ', name)
    clf=model.fit(X_train,y_train)
    
    print('Getting testing error ...')
    y_pred=clf.predict(X_test)
    
    val_precision_all[name]=precision_score(y_test,y_pred,average='macro')
    val_recall_all[name]=recall_score(y_test,y_pred,average='macro')
    val_fscore_all[name]=f1_score(y_test, y_pred, average='macro')
    val_accuracy_all[name]=accuracy_score(y_test, y_pred)

    print('Getting training error ...')
    y_train_pred=clf.predict(X_train)
    
    train_precision_all[name]=precision_score(y_train,y_train_pred,average='macro')
    train_recall_all[name]=recall_score(y_train,y_train_pred,average='macro')
    train_fscore_all[name]=f1_score(y_train,y_train_pred, average='macro')
    train_accuracy_all[name]=accuracy_score(y_train,y_train_pred)
    
    print('\n')

val_classifier_results=pd.DataFrame({'Accuracy':pd.Series(val_accuracy_all),'Precision':pd.Series(val_precision_all),'Recall':pd.Series(val_recall_all),'F1_Score':pd.Series(val_fscore_all)}) 

train_classifier_results=pd.DataFrame({'Accuracy':pd.Series(train_accuracy_all),'Precision':pd.Series(train_precision_all),'Recall':pd.Series(train_recall_all),'F1_Score':pd.Series(train_fscore_all)})  



Training  LR
Getting testing error ...
Getting training error ...


Training  KNN
Getting testing error ...
Getting training error ...


Training  NB
Getting testing error ...
Getting training error ...


Training  XGBoost
Getting testing error ...
Getting training error ...




In [14]:
print(train_classifier_results)

         Accuracy  F1_Score  Precision    Recall
KNN      0.573171  0.566496   0.574451  0.578462
LR       0.513255  0.497557   0.500262  0.501655
NB       0.427586  0.410950   0.494164  0.448846
XGBoost  0.693681  0.687154   0.685473  0.690629


In [15]:
print(val_classifier_results)

         Accuracy  F1_Score  Precision    Recall
KNN      0.382984  0.374119   0.373405  0.385623
LR       0.513065  0.495733   0.497527  0.500285
NB       0.429500  0.412919   0.492705  0.450919
XGBoost  0.629702  0.621602   0.619203  0.625379


# SVM

In [None]:
# Grid search:

# vary kernels
# vary 'C'

# Record testing error AND training error

In [None]:
#SVM with grid search. Needs to be completed

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['accuracy','precision_macro', 'recall_macro','f1_macro']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s' % score)
    clf.fit(X_train, y_train)
    
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)



# Logistic Regression

In [None]:
# Try different regularization techniques

# If it's not overfitting, maybe try adding in some interaction terms?

# XGBOOST

In [19]:
params = {'min_child_weight': [1, 5, 10],
        'gamma': [0,0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.01, 0.025, 0.05, 0.1],
        }

skf = StratifiedKFold(n_splits=2, shuffle = True, random_state = 1001)

xgb = XGBClassifier(n_estimators=300, 
                    objective= 'multi:softmax', 
                    seed=27)

rsearch1 = RandomizedSearchCV(estimator = xgb,
                                    param_distributions = params,
                                    n_iter=50,
                                    scoring='accuracy',
                                    #n_jobs=2,
                                    verbose=3,
                                    cv=skf.split(X_train,y_train),
                                    return_train_score=True)

rsearch1.fit(X_train,y_train)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV] subsample=0.8, min_child_weight=5, max_depth=5, learning_rate=0.1, gamma=2, colsample_bytree=0.6 
[CV]  subsample=0.8, min_child_weight=5, max_depth=5, learning_rate=0.1, gamma=2, colsample_bytree=0.6, score=0.6264538815255613, total=   9.1s
[CV] subsample=0.8, min_child_weight=5, max_depth=5, learning_rate=0.1, gamma=2, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.5s remaining:    0.0s


[CV]  subsample=0.8, min_child_weight=5, max_depth=5, learning_rate=0.1, gamma=2, colsample_bytree=0.6, score=0.6203538386625548, total=  10.5s
[CV] subsample=0.6, min_child_weight=5, max_depth=3, learning_rate=0.05, gamma=5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   20.4s remaining:    0.0s


[CV]  subsample=0.6, min_child_weight=5, max_depth=3, learning_rate=0.05, gamma=5, colsample_bytree=0.6, score=0.6119556397078713, total=   6.8s
[CV] subsample=0.6, min_child_weight=5, max_depth=3, learning_rate=0.05, gamma=5, colsample_bytree=0.6 
[CV]  subsample=0.6, min_child_weight=5, max_depth=3, learning_rate=0.05, gamma=5, colsample_bytree=0.6, score=0.6105069523345777, total=   6.3s
[CV] subsample=1.0, min_child_weight=5, max_depth=4, learning_rate=0.1, gamma=1, colsample_bytree=0.8 
[CV]  subsample=1.0, min_child_weight=5, max_depth=4, learning_rate=0.1, gamma=1, colsample_bytree=0.8, score=0.6245604544225047, total=   7.5s
[CV] subsample=1.0, min_child_weight=5, max_depth=4, learning_rate=0.1, gamma=1, colsample_bytree=0.8 
[CV]  subsample=1.0, min_child_weight=5, max_depth=4, learning_rate=0.1, gamma=1, colsample_bytree=0.8, score=0.618568414218471, total=   8.2s
[CV] subsample=1.0, min_child_weight=10, max_depth=5, learning_rate=0.05, gamma=0, colsample_bytree=0.8 
[CV]  su

[CV]  subsample=1.0, min_child_weight=10, max_depth=3, learning_rate=0.2, gamma=2, colsample_bytree=0.8, score=0.6153221879565006, total=   5.3s
[CV] subsample=0.8, min_child_weight=10, max_depth=3, learning_rate=0.1, gamma=5, colsample_bytree=0.8 
[CV]  subsample=0.8, min_child_weight=10, max_depth=3, learning_rate=0.1, gamma=5, colsample_bytree=0.8, score=0.6186637814444144, total=   6.2s
[CV] subsample=0.8, min_child_weight=10, max_depth=3, learning_rate=0.1, gamma=5, colsample_bytree=0.8 
[CV]  subsample=0.8, min_child_weight=10, max_depth=3, learning_rate=0.1, gamma=5, colsample_bytree=0.8, score=0.6154844992695991, total=   6.3s
[CV] subsample=0.8, min_child_weight=5, max_depth=4, learning_rate=0.2, gamma=0.5, colsample_bytree=1.0 
[CV]  subsample=0.8, min_child_weight=5, max_depth=4, learning_rate=0.2, gamma=0.5, colsample_bytree=1.0, score=0.6256965106843386, total=   9.3s
[CV] subsample=0.8, min_child_weight=5, max_depth=4, learning_rate=0.2, gamma=0.5, colsample_bytree=1.0 
[

[CV]  subsample=0.6, min_child_weight=1, max_depth=3, learning_rate=0.2, gamma=0.5, colsample_bytree=1.0, score=0.6272653502840141, total=   7.6s
[CV] subsample=0.6, min_child_weight=1, max_depth=3, learning_rate=0.2, gamma=0.5, colsample_bytree=1.0 
[CV]  subsample=0.6, min_child_weight=1, max_depth=3, learning_rate=0.2, gamma=0.5, colsample_bytree=1.0, score=0.619596385868095, total=   7.6s
[CV] subsample=1.0, min_child_weight=1, max_depth=5, learning_rate=0.2, gamma=2, colsample_bytree=0.6 
[CV]  subsample=1.0, min_child_weight=1, max_depth=5, learning_rate=0.2, gamma=2, colsample_bytree=0.6, score=0.6251555315120367, total=   7.0s
[CV] subsample=1.0, min_child_weight=1, max_depth=5, learning_rate=0.2, gamma=2, colsample_bytree=0.6 
[CV]  subsample=1.0, min_child_weight=1, max_depth=5, learning_rate=0.2, gamma=2, colsample_bytree=0.6, score=0.6167829897743873, total=   7.1s
[CV] subsample=0.6, min_child_weight=10, max_depth=4, learning_rate=0.2, gamma=0, colsample_bytree=0.6 
[CV]  

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 14.2min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x1a0fecaaf0>,
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=300, nthread=-1,
       objective='multi:softmax', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=50, n_jobs=1,
          param_distributions={'min_child_weight': [1, 5, 10], 'gamma': [0, 0.5, 1, 1.5, 2, 5], 'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'max_depth': [3, 4, 5], 'learning_rate': [0.05, 0.1, 0.2]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='accuracy', verbose=3)

In [22]:
#Observe The best params and substitute please?
rsearch1.best_params_, rsearch1.best_score_

({'subsample': 0.6,
  'min_child_weight': 5,
  'max_depth': 5,
  'learning_rate': 0.05,
  'gamma': 0.5,
  'colsample_bytree': 0.8},
 0.62478359662410732)

## If best param is largest or smallest tried, change range in grid search

In [29]:
params2 = {'n_estimators': [600, 1000, 2000]
        }

xgb2 = XGBClassifier(objective= 'multi:softmax',
              seed=27,
              #Add best params from gsearch1 here:
              subsample= 0.6,
              min_child_weight= 5,
              max_depth= 5,
              learning_rate= 0.05,
              gamma= 0.5,
              colsample_bytree= 0.8
             )

gsearch2 = GridSearchCV(estimator = xgb2, 
                        param_grid = params2, 
                        scoring='accuracy',
                        cv=skf.split(X_train,y_train))

In [None]:
gsearch2.fit(X_train,y_train)

In [None]:
gsearch2.best_params_, gsearch2.best_score_