In [24]:
# Preprocessing & results----------------
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# nlp preprocessing
import spacy

# Models-------------------------
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
import sklearn.gaussian_process.kernels as kls
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier

# for visualizing ---------------
import pydotplus
from sklearn import tree
from sklearn.externals.six import StringIO 
from IPython.display import Image, display
import seaborn as sns

# General purpose
import re
import pandas as pd
import pickle
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Read features data

In [2]:
data = pd.read_pickle('./features/all_data_features_csv_17_20_new.pkl')
data = data.drop(['ref_latest'], axis=1)
print(data.shape)
data.head(5)

(4897, 23)


Unnamed: 0,id,n_authors,len_char_title,len_word_title,len_char_abstract,len_word_abstract,n_keywords,ref_len,num_recent_refs,contains_ml_keyword,...,avg_ref_length,num_sections,contains_appendix,fig_tab_eqn_count,contains_table,contains_figure,contains_equation,iclr_references,label,ref_latest_depth
2017_HJIY0E9ge,2017_HJIY0E9ge,3,70,12,823,121,1,22,12,1,...,250,13,0,25,1,1,1,0,Reject,2
2017_BkdpaH9ll,2017_BkdpaH9ll,5,41,5,1076,149,2,30,24,1,...,48,14,0,10,1,1,1,2,Reject,2
2020_rkecl1rtwB,2020_rkecl1rtwB,2,40,6,918,134,3,16,12,1,...,41,24,1,20,1,1,1,3,Accept,2
2019_HkgYmhR9KX,2019_HkgYmhR9KX,5,75,11,1761,266,4,36,24,1,...,48,16,1,26,1,1,1,0,Accept,1
2020_ryxW804FPH,2020_ryxW804FPH,3,68,8,750,112,4,31,20,1,...,33,10,1,17,1,1,1,2,Reject,2


In [3]:
print(len(data.columns), data.columns)

23 Index(['id', 'n_authors', 'len_char_title', 'len_word_title',
       'len_char_abstract', 'len_word_abstract', 'n_keywords', 'ref_len',
       'num_recent_refs', 'contains_ml_keyword', 'avg_sent_len',
       'abs_glove_avg', 'contains_sota', 'avg_ref_length', 'num_sections',
       'contains_appendix', 'fig_tab_eqn_count', 'contains_table',
       'contains_figure', 'contains_equation', 'iclr_references', 'label',
       'ref_latest_depth'],
      dtype='object')


In [4]:
data = data.sample(n=len(data), random_state=42)

## Models

In [5]:
X = data.drop(columns=['id', 'label'])
y = data['label']

X_normalized = X / X.max(axis=0)

X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.30)

In [33]:
clf_dict = {
    'DecisionTree': {"model": DecisionTreeClassifier(random_state=42), "params": {'max_depth': list(range(2, 16))}},
    'RandomForest': {"model": RandomForestClassifier(random_state=42),
                     "params": {'n_estimators': list(range(5, 100, 5)), 'max_depth': list(range(2, 16))}},
    'LogisticR_L1': {"model": LogisticRegression(random_state=42, max_iter=1000),
                     "params": {'penalty': ['l1'], 'solver': ['liblinear', 'saga']}},
    'LogisticR_L2': {"model": LogisticRegression(random_state=42, max_iter=1000),
                     "params": {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}},
    'LogisticR': {"model": LogisticRegression(random_state=42, max_iter=1000),
                  "params": {'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']}},
    'RidgeClf': {"model": RidgeClassifier(max_iter=1000), "params": {}},
    'SVC_linear': {"model": SVC(random_state=42), "params": {'kernel': ['linear']}},
    'SVC_poly': {"model": SVC(random_state=42),
                 "params": {'kernel': ['poly'], 'degree': [3, 4, 5], 'gamma': ['scale', 'auto']}},
    'SVC_others': {"model": SVC(random_state=42), "params": {'kernel': ['rbf', 'sigmoid'], 'gamma': ['scale', 'auto']}},
    'GussianNB': {"model": GaussianNB(), "params": {}},
    'KNN': {"model": KNeighborsClassifier(), "params": {'n_neighbors': list(range(3, 30))}},
    'GaussianProcessClf': {"model": GaussianProcessClassifier(random_state=42, kernel=kls.RBF()), "params": {}},
    'Bagging_SVC': {"model": BaggingClassifier(random_state=42), "params": {'n_estimators': list(range(5, 100, 5)),
                                                                            'base_estimator': [SVC(kernel='linear'),
                                                                                               SVC(kernel='poly',
                                                                                                   degree=3,
                                                                                                   gamma='scale')]}},
    'BaggingDT': {"model": BaggingClassifier(random_state=42), "params": {'n_estimators': list(range(5, 100, 5)),
                                                                          'base_estimator': [
                                                                              DecisionTreeClassifier(random_state=42,
                                                                                                     max_depth=2),
                                                                              DecisionTreeClassifier(random_state=42,
                                                                                                     max_depth=5),
                                                                              DecisionTreeClassifier(random_state=42,
                                                                                                     max_depth=10)]}},
    'AdaBoost': {"model": AdaBoostClassifier(random_state=42), "params": {'n_estimators': list(range(5, 100, 5)),
                                                                          'base_estimator': [DecisionTreeClassifier(
                                                                                                 random_state=42,
                                                                                                 max_depth=2),
                                                                                             DecisionTreeClassifier(
                                                                                                 random_state=42,
                                                                                                 max_depth=5),
                                                                                             DecisionTreeClassifier(
                                                                                                 random_state=42,
                                                                                                 max_depth=10)]}},
    'ExtraTrees': {"model": ExtraTreesClassifier(random_state=42), "params": {'n_estimators': list(range(5, 105, 5)), 
                                                                              'max_depth': [2,5,10,15]}},
    'MLP_l1': {"model": MLPClassifier(random_state=42), "params": {'hidden_layer_sizes': [(x,) for x in 
                                                                                          range(50, 600, 100)], 
                                                                  'activation': ['logistic', 'tanh', 'relu'],
                                                                  'solver': ['adam', 'sgd'], 'early_stopping': 
                                                                   [True]}},
    'MLP_l2': {"model": MLPClassifier(random_state=42), "params": {'hidden_layer_sizes': [(x, y) for x in 
                                                                                          range(50, 600, 100) 
                                                                                          for y in range(50, 360, 100)], 
                                                                  'activation': ['logistic', 'tanh', 'relu'],
                                                                  'solver': ['adam', 'sgd'], 'early_stopping': 
                                                                                               [True]}},
#     'MLP_l2': {"model": MLPClassifier(random_state=42), "params": {'hidden_layer_sizes': [(x, y, z) for x in 
#                                                                                           range(50, 600, 100) 
#                                                                                           for y in range(50, 600, 100)
#                                                                                           for z in range(50, 360, 100)], 
#                                                                   'activation': ['logistic', 'tanh', 'relu'],
#                                                                   'solver': ['adam', 'sgd'], 'early_stopping': 
#                                                                                                [True]}},
    }

In [31]:
model_results = pd.DataFrame()
model_results['Train_Accuracy'] = None
model_results['Test_Accuracy'] = None
model_results['best_params'] = None


for clf_name, clf in clf_dict.items():
    classifier = GridSearchCV(clf['model'], clf['params'], n_jobs=7)
    classifier.fit(X_train, y_train)
    best_model = classifier.best_estimator_
    #print(clf_name, classifier.best_score_, classifier.best_params_)
    
    y_predicted = classifier.predict(X_test)
    test_acc = accuracy_score(y_test, y_predicted)
    
    model_results.loc[clf_name, ['Train_Accuracy', 'Test_Accuracy', 'best_params']] = [classifier.best_score_, test_acc, classifier.best_params_]
    clsr = classification_report(y_test, y_predicted, target_names=["Reject", "Accept"])

GaussianProcessClf 0.6510067114093959 {}
SVC_linear 0.6498395097753137 {'kernel': 'linear'}
LogisticR_L2 0.6568427195798074 {'solver': 'liblinear', 'penalty': 'l2'}
MLP_l1 0.6548001167201634 {'activation': 'tanh', 'early_stopping': True, 'solver': 'adam', 'hidden_layer_sizes': (550,)}
SVC_others 0.6521739130434783 {'kernel': 'rbf', 'gamma': 'scale'}
MLP_l2 0.6603443244820543 {'activation': 'tanh', 'early_stopping': True, 'solver': 'adam', 'hidden_layer_sizes': (350, 250)}
AdaBoost 0.6717245404143566 {'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best'), 'n_estimators': 5}
LogisticR 0.6527575138605194 {'solver': 'newton-cg', 'pe

In [32]:
model_results

Unnamed: 0,Train_Accuracy,Test_Accuracy,best_params
GaussianProcessClf,0.651007,0.656463,{}
SVC_linear,0.64984,0.644218,{'kernel': 'linear'}
LogisticR_L2,0.656843,0.661224,"{'solver': 'liblinear', 'penalty': 'l2'}"
MLP_l1,0.6548,0.656463,"{'activation': 'tanh', 'early_stopping': True,..."
SVC_others,0.652174,0.644898,"{'kernel': 'rbf', 'gamma': 'scale'}"
MLP_l2,0.660344,0.65034,"{'activation': 'tanh', 'early_stopping': True,..."
AdaBoost,0.671725,0.681633,{'base_estimator': DecisionTreeClassifier(clas...
LogisticR,0.652758,0.661905,"{'solver': 'newton-cg', 'penalty': 'none'}"
ExtraTrees,0.660928,0.655782,"{'n_estimators': 55, 'max_depth': 15}"
RidgeClf,0.655676,0.657823,{}


### Feature selection pipeline