In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import classification_report, cohen_kappa_score as kappa, confusion_matrix, roc_auc_score, roc_curve, f1_score, make_scorer

from helpers import PICKLE_DIR, pickle_object
import pandas as pd

def nb_classifier():
    """Cross-validate Gaussian naive Bayes, then evaluate it on the held-out test set.

    Reads the module-level training data ``X`` (features) and ``y`` (labels);
    both are indexed positionally with ``.iloc``, so they are expected to be
    pandas objects. Prints the accuracy and a classification report for each
    of the 4 stratified folds, then refits on the complete training set and
    prints the same metrics for the pickled test set.

    Returns:
        None. All results are printed.
    """
    skf = StratifiedKFold(4)
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # A fresh estimator per fold keeps the folds independent of each other.
        fold_model = GaussianNB()
        fold_model.fit(X_train, y_train)
        y_pred = fold_model.predict(X_test)
        score = fold_model.score(X_test, y_test)

        print('Score: {}'.format(score))
        print('Classification report for fold {}:'.format(i))
        print(classification_report(y_test, y_pred))
        print('\n---\n')

    # The model used for the final evaluation is trained on ALL training data,
    # not on whatever subset the last cross-validation fold happened to use.
    gnb = GaussianNB()
    gnb.fit(X, y)

    # Use the test pickles that belong to the same feature extraction as the
    # training data loaded in the __main__ block (5000 LSA components); the
    # previously referenced '6000_components' file does not match it.
    X_test_final = pd.read_pickle(PICKLE_DIR / 'testing_X_lsa_10000_feat_5000_comp.gz', 'gzip')
    y_test_final = pd.read_pickle(PICKLE_DIR / 'testing_labels_.gz', 'gzip')
    y_pred_final = gnb.predict(X_test_final)

    score = gnb.score(X_test_final, y_test_final)
    print('Score: {}'.format(score))

    print('Final Classification Report:')
    print(classification_report(y_test_final, y_pred_final))

#############################################################

#######SVM classifier ###########

def svm_clf():
    """Cross-validate a fixed RBF-kernel SVM, then evaluate it on the held-out test set.

    Reads the module-level training data ``X`` (features) and ``y`` (labels);
    both are indexed positionally with ``.iloc``, so they are expected to be
    pandas objects. Prints the accuracy and a classification report for each
    of the 4 stratified folds, then refits on the complete training set and
    prints the same metrics for the pickled test set.

    Returns:
        None. All results are printed.
    """

    def _new_svm():
        # Single place for the hyper-parameters shared by the fold models and
        # the final model.
        return SVC(kernel='rbf', gamma=1e-2, C=0.001)

    skf = StratifiedKFold(4)
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        fold_model = _new_svm()
        fold_model.fit(X_train, y_train)
        y_pred = fold_model.predict(X_test)
        score = fold_model.score(X_test, y_test)

        print('Score: {}'.format(score))
        print('Classification report for fold {}:'.format(i))
        print(classification_report(y_test, y_pred))
        print('\n---\n')

    # The model used for the final evaluation is trained on ALL training data,
    # not on whatever subset the last cross-validation fold happened to use.
    svm = _new_svm()
    svm.fit(X, y)

    # Use the test pickles that belong to the same feature extraction as the
    # training data loaded in the __main__ block (5000 LSA components); the
    # previously referenced '6000_components' file does not match it.
    X_test_final = pd.read_pickle(PICKLE_DIR / 'testing_X_lsa_10000_feat_5000_comp.gz', 'gzip')
    y_test_final = pd.read_pickle(PICKLE_DIR / 'testing_labels_.gz', 'gzip')
    y_pred_final = svm.predict(X_test_final)

    score = svm.score(X_test_final, y_test_final)
    print('Score: {}'.format(score))

    print('Final Classification Report:')
    print(classification_report(y_test_final, y_pred_final))


#############################################################

def svm_clf_2():
    """Grid-search SVM hyper-parameters by F1 score and report test-set results.

    Reads the module-level training data ``X`` and ``y``. Runs a 10-fold
    ``GridSearchCV`` over rbf / sigmoid / linear / poly kernels, prints the
    best parameter set with its mean cross-validated score, and then prints
    the score and classification report of the refitted best estimator on the
    pickled test set.

    Returns:
        None. All results are printed.
    """
    gammas = [1e-2, 1e-3, 1e-4, 1e-5]
    # The original grid listed both 0.10 and 0.1, which are the same value and
    # only caused every such candidate to be fitted twice.
    # NOTE(review): one of them may have been meant as 0.01 or 1.0 -- confirm.
    costs = [0.001, 0.1, 10, 25, 50, 100, 1000]
    tuned_parameters = [
        {'kernel': ['rbf'], 'gamma': gammas, 'C': costs},
        {'kernel': ['sigmoid'], 'gamma': gammas, 'C': costs},
        {'kernel': ['linear'], 'C': costs},
        {'kernel': ['poly'], 'degree': [2, 3, 4, 5, 6, 7, 8]},
    ]

    # f1_score defaults to average='binary' with pos_label=1, which is invalid
    # for string class labels such as 'biorxiv'. Macro averaging works for
    # both binary and multiclass string targets.
    # NOTE(review): 'weighted' is an equally valid choice if class imbalance
    # should be reflected in the selection metric.
    custom_scorer = make_scorer(f1_score, greater_is_better=True, average='macro')
    clf = GridSearchCV(SVC(), tuned_parameters, cv=10, scoring=custom_scorer)
    clf.fit(X, y)
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print('Training F1_score')
    print(clf.best_score_)
    print(clf.best_estimator_)
    print()
    print('****Results on Test Dataset****')
    # Use the test pickles that belong to the same feature extraction as the
    # training data loaded in the __main__ block (5000 LSA components).
    X_test_final = pd.read_pickle(PICKLE_DIR / 'testing_X_lsa_10000_feat_5000_comp.gz', 'gzip')
    y_test_final = pd.read_pickle(PICKLE_DIR / 'testing_labels_.gz', 'gzip')
    svm_pred = clf.predict(X_test_final)
    # clf.score applies the same scorer that drove the search.
    score = clf.score(X_test_final, y_test_final)
    # These values were computed but never shown in the original version.
    print('Score: {}'.format(score))
    print(classification_report(y_test_final, svm_pred))

#############################################################

def RandomForest(X, y, n_estimators, max_depth, random_state, criterion, max_features="auto"):
    """Cross-validate a random forest on standardized features, then score the test set.

    Args:
        X: Training feature matrix (anything ``np.asarray`` can convert to 2-D).
        y: Training labels, one per row of ``X``.
        n_estimators: Number of trees, forwarded to ``RandomForestClassifier``.
        max_depth: Maximum tree depth, forwarded to ``RandomForestClassifier``.
        random_state: Seed forwarded to ``RandomForestClassifier``.
        criterion: Split criterion, forwarded to ``RandomForestClassifier``.
        max_features: Features considered per split. The legacy value
            ``"auto"`` is translated to ``"sqrt"``.

    Returns:
        None. Per-fold and final test-set metrics are printed.
    """
    # "auto" meant sqrt(n_features) for classifiers and is no longer accepted
    # by recent scikit-learn releases; translate it so the default keeps working.
    if max_features == "auto":
        max_features = "sqrt"

    # Plain arrays make the fold indices below unambiguously positional; a
    # pandas Series indexed with ``y[indices]`` would be looked up by label.
    X = np.asarray(X)
    y = np.asarray(y)

    def _new_forest():
        return RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            random_state=random_state,
            criterion=criterion,
        )

    skf = StratifiedKFold(4)
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        # The scaler is fitted on the training part only, so no statistics of
        # the validation part leak into the fold.
        fold_scaler = StandardScaler()
        X_train = fold_scaler.fit_transform(X[train_index])
        X_test = fold_scaler.transform(X[test_index])
        y_train, y_test = y[train_index], y[test_index]

        # Fit on the training part of the fold. The previous code fitted on the
        # whole data set, so every fold was scored on rows it had trained on.
        fold_model = _new_forest()
        fold_model.fit(X_train, y_train)
        y_pred = fold_model.predict(X_test)
        score = fold_model.score(X_test, y_test)

        print('Score: {}'.format(score))
        print('Classification report for fold {}:'.format(i))
        print(classification_report(y_test, y_pred))
        print('\n---\n')

    # Final model: scaler and forest are both fitted on all training rows.
    sc = StandardScaler()
    rf = _new_forest()
    rf.fit(sc.fit_transform(X), y)

    # Use the test pickles that belong to the same feature extraction as the
    # training data loaded in the __main__ block (5000 LSA components).
    X_test_final = pd.read_pickle(PICKLE_DIR / 'testing_X_lsa_10000_feat_5000_comp.gz', 'gzip')
    y_test_final = pd.read_pickle(PICKLE_DIR / 'testing_labels_.gz', 'gzip')
    # The test features must go through the same scaling as the training data.
    X_test_final = sc.transform(np.asarray(X_test_final))
    y_pred_final = rf.predict(X_test_final)

    # Previously this called ``svm.score``, a name that does not exist here.
    score = rf.score(X_test_final, y_test_final)
    print('Score: {}'.format(score))

    print('Final Classification Report:')
    print(classification_report(y_test_final, y_pred_final))

#############################################################
def ensemble_method():
    """Fit a hard-voting ensemble (naive Bayes, random forest, SVM) and print its test accuracy.

    Reads the module-level training data ``X`` and ``y``, fits the
    ``VotingClassifier`` on all of it, and prints the accuracy on the pickled
    test set.

    Returns:
        None. The test-set score is printed.
    """
    model_1 = GaussianNB()
    model_2 = RandomForestClassifier(n_estimators=100, max_depth=25, max_features=20,
                                     random_state=0, criterion='entropy')
    model_3 = SVC(kernel='rbf', gamma=1e-2, C=0.001)

    # Hard voting: the predicted class is the majority vote of the three models.
    model = VotingClassifier(
        estimators=[('nb', model_1), ('rf', model_2), ('svc', model_3)],
        voting='hard',
    )
    model.fit(X, y)

    # The former ``model.score(x_test, y_test)`` call was removed: neither name
    # is defined, so it raised NameError before the test set was evaluated, and
    # its result was discarded anyway.
    X_test_final = pd.read_pickle(PICKLE_DIR / 'testing_X_lsa_10000_feat_5000_comp.gz', 'gzip')
    y_test_final = pd.read_pickle(PICKLE_DIR / 'testing_labels_.gz', 'gzip')
    print(model.score(X_test_final, y_test_final))

#############################################################

if __name__ == '__main__':
    # Only needed by the disabled label-binarization step further down.
    binarizer = LabelBinarizer()

    # Training features and labels; the classifier functions in this cell read
    # these two names as globals.
    X = pd.read_pickle(
        PICKLE_DIR / 'training_X_lsa_10000_feat_5000_comp.gz',
        compression='gzip',
    )
    y = pd.read_pickle(
        PICKLE_DIR / 'training_labels_.gz',
        compression='gzip',
    )
    # y = binarizer.fit_transform(y)

    # Experiments -- enable the one to run.
    #nb_classifier()
    #svm_clf()
    #svm_clf_2()
    #RandomForest(X, y, n_estimators = 100, max_depth=25, max_features=20, random_state=0, criterion = 'entropy')
    #ensemble_method()
    
    

In [2]:
# Load the gzip-compressed pickle holding the test feature matrix.
X_test_final = pd.read_pickle(
    PICKLE_DIR / 'testing_X_lsa_10000_feat_5000_comp.gz',
    compression='gzip',
)

In [3]:
# Load the gzip-compressed pickle holding the test labels.
y_test_final = pd.read_pickle(
    PICKLE_DIR / 'testing_labels_.gz',
    compression='gzip',
)

In [4]:
# Dimensions (rows, columns) of the training features X loaded in the first cell.
X.shape

(28368, 5000)

In [5]:
# Dimensions of the training labels y loaded in the first cell.
y.shape

(28368,)

In [6]:
# Dimensions (rows, columns) of the test features loaded above.
X_test_final.shape

(7093, 5000)

In [20]:
# Wrap the training labels in a DataFrame so they can be filtered by column.
h= pd.DataFrame(data= y)

In [24]:
# Give the single label column an explicit name.
h.columns = ['class']

In [30]:
# Shape of the subset of rows whose label is anything other than 'biorxiv'.
h[h['class'] != 'biorxiv'].shape

(12798, 1)

In [34]:
# Per-column summary statistics of the training features.
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
count,28368.0,28368.0,28368.0,28368.0,28368.0,28368.0,28368.0,28368.0,28368.0,28368.0,...,28368.0,28368.0,28368.0,28368.0,28368.0,28368.0,28368.0,28368.0,28368.0,28368.0
mean,0.175745,-0.005191,-0.004291,0.005357,-0.002912,0.001575,0.001884,0.0002,0.003118,0.000297,...,-1e-05,1.6e-05,5e-06,-2e-06,9e-06,-2e-06,-7e-06,-2e-06,-8e-06,-9e-06
std,0.048868,0.084375,0.078064,0.073595,0.067276,0.063398,0.062225,0.059749,0.057358,0.054861,...,0.00529,0.005293,0.005288,0.005276,0.005273,0.005271,0.005268,0.005262,0.00526,0.005253
min,-0.0,-0.267259,-0.265518,-0.265606,-0.266704,-0.224812,-0.219049,-0.219831,-0.237529,-0.243835,...,-0.027233,-0.028295,-0.027662,-0.026014,-0.027537,-0.030038,-0.030919,-0.027564,-0.026922,-0.027349
25%,0.144098,-0.05767,-0.055506,-0.040702,-0.046742,-0.035945,-0.036833,-0.041371,-0.031041,-0.028251,...,-0.003271,-0.003233,-0.003268,-0.003236,-0.003264,-0.003267,-0.003253,-0.003245,-0.003242,-0.003244
50%,0.175456,0.001357,-0.004778,0.016682,-0.003027,0.002693,-0.00029,-0.00214,0.003412,-0.000424,...,-4.2e-05,0.0,-2.6e-05,-2.7e-05,2.6e-05,-4e-05,6.4e-05,-1.8e-05,-4.7e-05,1.6e-05
75%,0.208312,0.048426,0.044603,0.051117,0.03933,0.039014,0.037922,0.03983,0.035582,0.028923,...,0.003277,0.003211,0.003228,0.003199,0.003312,0.00322,0.003252,0.00323,0.003185,0.003206
max,0.35219,0.352123,0.269807,0.292044,0.285171,0.382548,0.377717,0.235834,0.266646,0.280268,...,0.027778,0.035172,0.028668,0.02798,0.034246,0.029848,0.030978,0.034993,0.034062,0.032079
