In [2]:
import pandas as pd
import numpy as np
import pickle

# Import Dataframe

In [3]:
# Load pickled dataframe

file = 'pickles/features_04_29.pkl'

with open(file, 'rb') as f:
    df_features = pickle.load(f)

In [4]:
df_features.head()

Unnamed: 0,source,readability_SMOG,avg_sent_len,Num_Sentences,sd_sent_len,norm_stop_freq,norm_punct_freq,norm_funct_freq
0,economist,11.2,20.666667,3,11.841546,0.421875,0.234375,0.390625
1,economist,10.7,17.0,4,3.391165,0.5,0.132353,0.470588
2,economist,5.7,13.8,5,5.844656,0.422535,0.197183,0.43662
3,economist,11.6,17.8,5,7.62627,0.422222,0.255556,0.411111
4,economist,13.6,18.666667,3,7.408704,0.35,0.316667,0.333333


# A bunch of classifiers

In [27]:
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support,accuracy_score,f1_score,precision_score,recall_score
from xgboost import XGBClassifier

In [28]:
# prepare models
models = []
models.append(('LR', LogisticRegression(multi_class='multinomial',solver='lbfgs')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('XGBoost',XGBClassifier()))
#models.append(('SVM', SVC(kernel = 'linear')))

In [11]:
# X -> features, y -> label
X=df_features[['readability_SMOG','avg_sent_len','sd_sent_len','norm_stop_freq','norm_punct_freq','norm_funct_freq']]
y=df_features.source
print(len(X))

52922


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.3, random_state = 0)

val_accuracy_all={}
val_fscore_all={}
val_precision_all={}
val_recall_all={}

train_accuracy_all={}
train_fscore_all={}
train_precision_all={}
train_recall_all={}

for name,model in models:
    print('Training')
    clf=model.fit(X_train,y_train)
    print('Testing')
    y_pred=clf.predict(X_test)
    print(name)
    val_precision_all[name]=precision_score(y_test,y_pred,average='macro')
    val_recall_all[name]=recall_score(y_test,y_pred,average='macro')
    val_fscore_all[name]=f1_score(y_test, y_pred, average='macro')
    val_accuracy_all[name]=accuracy_score(y_test, y_pred)

    y_train_pred=clf.predict(X_train)
    train_precision_all[name]=precision_score(y_train,y_train_pred,average='macro')
    train_recall_all[name]=recall_score(y_train,y_train_pred,average='macro')
    train_fscore_all[name]=f1_score(y_train,y_train_pred, average='macro')
    train_accuracy_all[name]=accuracy_score(y_train,y_train_pred)

val_classifier_results=pd.DataFrame({'Accuracy':pd.Series(val_accuracy_all),'Precision':pd.Series(val_precision_all),'Recall':pd.Series(val_recall_all),'F1_Score':pd.Series(val_fscore_all)}) 

train_classifier_results=pd.DataFrame({'Accuracy':pd.Series(train_accuracy_all),'Precision':pd.Series(train_precision_all),'Recall':pd.Series(train_recall_all),'F1_Score':pd.Series(train_fscore_all)})  



Training
Testing
LR
Training
Testing
LDA
Training
Testing
KNN
Training
Testing
CART
Training
Testing
NB
Training
Testing
XGBoost


In [30]:
print(train_classifier_results)

         Accuracy  F1_Score  Precision    Recall
CART     0.998380  0.998357   0.998293  0.998425
KNN      0.558753  0.553917   0.562523  0.564764
LDA      0.502470  0.488085   0.497672  0.490090
LR       0.476097  0.463192   0.470995  0.462512
NB       0.456256  0.442735   0.457185  0.461091
XGBoost  0.535025  0.524606   0.529712  0.524329


In [31]:
print(val_classifier_results)

         Accuracy  F1_Score  Precision    Recall
CART     0.407886  0.402823   0.403393  0.402669
KNN      0.359829  0.353328   0.354126  0.363224
LDA      0.505007  0.489856   0.497572  0.491886
LR       0.481262  0.467167   0.475677  0.466879
NB       0.463816  0.451440   0.464418  0.468654
XGBoost  0.526800  0.515957   0.520347  0.515760


# XGBOOST

In [25]:
#Grid Search. Needs to be completed
X_train_XG, X_test_XG, \
y_train_XG, y_test_XG = train_test_split(X, y, test_size=0.30, random_state=42)

# Param grid to optimize across
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.05, 0.1, 0.2],
        'n_estimators': [100,200,300,500,1000,1500,2000]
        }

# stratified k fold object
skf = model_selection.StratifiedKFold(n_splits=2, shuffle = True, random_state = 1001)

# instantiate
xgb = xgboost.XGBClassifier()

# optimized to f1
grid_search = GridSearchCV(xgb,
                           param_grid=params,
                           scoring='accuracy',
                           n_jobs=4,
                           cv=skf.split(X,y),
                           verbose=3,
                           refit=False)

In [26]:
print(grid_search)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x0000021F1C434F10>,
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'min_child_weight': [1, 5, 10], 'gamma': [0.5, 1, 1.5, 2, 5], 'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'max_depth': [3, 4, 5], 'learning_rate': [0.05, 0.1, 0.2], 'n_estimators': [100, 200, 300, 500, 1000, 1500, 2000]},
       pre_dispatch='2*n_jobs', refit=False, return_train_score='warn',
       scoring='accuracy', verbose=3)


# SVM

In [None]:
# Grid search:

# vary kernels
# vary 'C'

# Record testing error AND training error

In [None]:
#SVM with grid search. Needs to be completed
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['accuracy','precision_macro', 'recall_macro','f1_macro']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s' % score)
    clf.fit(X_train, y_train)
    
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)



# Logistic Regression

In [None]:
# Try different regularization techniques

# If it's not overfitting, maybe try adding in some interaction terms?

# Naive Bayes