In [1]:
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.cross_validation import KFold, train_test_split

from sklearn.datasets import load_svmlight_file



### Train Baseline models based on Event Count

In [2]:
# Switch between the two pre-computed event-count feature files.
# NOTE(review): the "_wt" suffix presumably means "with time" features -- confirm.
without_time = False

# The feature count and the file path must always be chosen together, so they
# are selected as a pair.
N_FEAT, svmlight_file = (
    (5475, "./cleaned_data/features_svmlight_eventCounts.train")
    if without_time
    else (5484, "./cleaned_data/features_svmlight_eventCounts_wt.train")
)

In [3]:
def get_data_from_svmlight(svmlight_file, n_features=None):
    """Load a feature matrix and label vector from an svmlight-format file.

    Parameters
    ----------
    svmlight_file
        Path handed unchanged to ``load_svmlight_file``.
    n_features : int, optional
        Number of feature columns to request from the loader. When omitted,
        the module-level ``N_FEAT`` is used, which is the original behaviour.

    Returns
    -------
    tuple
        ``(features, labels)`` exactly as returned by ``load_svmlight_file``.
        This is the whole file; no train/test split happens here.
    """
    if n_features is None:
        # Resolved at call time so that re-running the config cell is honoured.
        n_features = N_FEAT

    features, labels = load_svmlight_file(svmlight_file, n_features=n_features)
    return features, labels

In [4]:
# Seed shared by the train/test split, the K-fold splitter and every classifier
# constructed in this notebook, so that reruns are reproducible.
RANDOM_STATE = 6250

In [5]:
# Load the complete feature matrix (X) and label vector (Y) from the configured file.
X, Y = get_data_from_svmlight(svmlight_file)


In [6]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_STATE)

### Models

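Logistic Regression, linear SVM, Decision Tree, and Random Forest baselines. Each helper below fits one classifier on the training data and returns its predicted labels for the data passed as `X_test`.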

In [7]:
#input: X_train, Y_train, X_test
#output: Y_pred (predicted labels for X_test)
def logistic_regression_pred(X_train, Y_train, X_test):
    """Fit a logistic regression on the training data and label ``X_test``.

    The classifier uses default parameters apart from ``random_state``, which
    is pinned to the notebook-wide ``RANDOM_STATE``.
    """
    classifier = LogisticRegression(random_state=RANDOM_STATE)
    # fit() returns the fitted estimator, so predict() can be chained onto it.
    return classifier.fit(X_train, Y_train).predict(X_test)


#input: X_train, Y_train, X_test
#output: Y_pred (predicted labels for X_test)
def svm_pred(X_train, Y_train, X_test):
    """Fit a linear SVM on the training data and label ``X_test``.

    The classifier uses default parameters apart from ``random_state``, which
    is pinned to the notebook-wide ``RANDOM_STATE``.
    """
    classifier = LinearSVC(random_state=RANDOM_STATE)
    # fit() returns the fitted estimator, so predict() can be chained onto it.
    return classifier.fit(X_train, Y_train).predict(X_test)


#input: X_train, Y_train, X_test
#output: Y_pred (predicted labels for X_test)
def decisionTree_pred(X_train, Y_train, X_test):
    """Fit a depth-limited decision tree on the training data and label ``X_test``.

    The tree is capped at ``max_depth=5`` and seeded with the notebook-wide
    ``RANDOM_STATE``.
    """
    classifier = DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)
    # fit() returns the fitted estimator, so predict() can be chained onto it.
    return classifier.fit(X_train, Y_train).predict(X_test)


#input: X_train, Y_train, X_test
#output: Y_pred (predicted labels for X_test)
def randomForest_pred(X_train, Y_train, X_test):
    """Fit a random forest on the training data and label ``X_test``.

    The forest has ``n_estimators=100`` trees and is seeded with the
    notebook-wide ``RANDOM_STATE``. No ``max_depth`` is passed, so tree depth
    is left at the library default.
    """
    classifier = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
    # fit() returns the fitted estimator, so predict() can be chained onto it.
    return classifier.fit(X_train, Y_train).predict(X_test)

In [8]:
def classification_metrics(Y_pred, Y_true):
    """Score predictions against the true labels with five standard metrics.

    Note the argument order: predictions first, ground truth second. Each
    scorer is nevertheless called as ``scorer(Y_true, Y_pred)``.

    Returns a 5-tuple in this fixed order, which callers rely on:
    ``(accuracy, auc, precision, recall, f1)``.

    NOTE(review): the AUC is computed from whatever is passed as ``Y_pred``.
    The callers in this notebook pass the output of ``.predict()``, i.e. hard
    labels rather than scores or probabilities.
    """
    # The order of this tuple defines the order of the returned metrics.
    scorers = (
        accuracy_score,
        roc_auc_score,
        precision_score,
        recall_score,
        f1_score,
    )
    return tuple(score(Y_true, Y_pred) for score in scorers)

In [9]:
# Cross-validation splitter over the rows of the training split. The first
# argument is the row count; the second positional argument (10) is presumably
# the fold count in the legacy sklearn.cross_validation API -- confirm.
# NOTE(review): no shuffle argument is passed. In the legacy API random_state
# presumably has an effect only when shuffling is enabled -- confirm.
kfold = KFold(np.shape(X_train)[0], 10, random_state=RANDOM_STATE)

### Logistic Regression

In [10]:
def cv_logistic(X,Y,kfold):
    """Cross-validate the logistic regression baseline.

    ``kfold`` is iterated as ``(train_indices, test_indices)`` pairs. For each
    pair the model is fitted on ``X[train], Y[train]`` and scored on
    ``X[test], Y[test]``.

    Returns ``(mean_accuracy, mean_auc)`` averaged over the folds.
    """
    # One metrics tuple per fold: (accuracy, auc, precision, recall, f1).
    per_fold = [
        classification_metrics(
            logistic_regression_pred(X[fit_idx], Y[fit_idx], X[eval_idx]),
            Y[eval_idx],
        )
        for fit_idx, eval_idx in kfold
    ]

    mean_acc = np.mean([fold[0] for fold in per_fold])
    mean_auc = np.mean([fold[1] for fold in per_fold])
    return mean_acc, mean_auc

In [11]:
# Cross-validated (accuracy, AUC) for logistic regression on the training split.
# The parenthesised single-argument form prints the same on Python 2 and 3.
print(cv_logistic(X_train, Y_train, kfold))

(0.92153019023986771, 0.90641778836290832)


### SVM

In [12]:
def cv_svm(X,Y,kfold):
    """Cross-validate the linear SVM baseline.

    ``kfold`` is iterated as ``(train_indices, test_indices)`` pairs. For each
    pair the model is fitted on ``X[train], Y[train]`` and scored on
    ``X[test], Y[test]``.

    Returns ``(mean_accuracy, mean_auc)`` averaged over the folds.
    """
    # One metrics tuple per fold: (accuracy, auc, precision, recall, f1).
    per_fold = [
        classification_metrics(
            svm_pred(X[fit_idx], Y[fit_idx], X[eval_idx]),
            Y[eval_idx],
        )
        for fit_idx, eval_idx in kfold
    ]

    mean_acc = np.mean([fold[0] for fold in per_fold])
    mean_auc = np.mean([fold[1] for fold in per_fold])
    return mean_acc, mean_auc

In [13]:
# Cross-validated (accuracy, AUC) for the linear SVM on the training split.
# The parenthesised single-argument form prints the same on Python 2 and 3.
print(cv_svm(X_train, Y_train, kfold))

(0.91702646815550037, 0.90582425305602621)


### Decision Tree

In [14]:
def cv_decision_tree(X,Y,kfold):
    """Cross-validate the decision tree baseline.

    ``kfold`` is iterated as ``(train_indices, test_indices)`` pairs. For each
    pair the model is fitted on ``X[train], Y[train]`` and scored on
    ``X[test], Y[test]``.

    Returns ``(mean_accuracy, mean_auc)`` averaged over the folds.
    """
    # One metrics tuple per fold: (accuracy, auc, precision, recall, f1).
    per_fold = [
        classification_metrics(
            decisionTree_pred(X[fit_idx], Y[fit_idx], X[eval_idx]),
            Y[eval_idx],
        )
        for fit_idx, eval_idx in kfold
    ]

    mean_acc = np.mean([fold[0] for fold in per_fold])
    mean_auc = np.mean([fold[1] for fold in per_fold])
    return mean_acc, mean_auc

In [15]:
# Cross-validated (accuracy, AUC) for the decision tree on the training split.
# The parenthesised single-argument form prints the same on Python 2 and 3.
print(cv_decision_tree(X_train, Y_train, kfold))

(0.93631100082712992, 0.92979315912191329)


### Random Forest

In [16]:
def cv_random_forest(X,Y,kfold):
    """Cross-validate the random forest baseline.

    ``kfold`` is iterated as ``(train_indices, test_indices)`` pairs. For each
    pair the model is fitted on ``X[train], Y[train]`` and scored on
    ``X[test], Y[test]``.

    Returns ``(mean_accuracy, mean_auc)`` averaged over the folds.
    """
    # One metrics tuple per fold: (accuracy, auc, precision, recall, f1).
    per_fold = [
        classification_metrics(
            randomForest_pred(X[fit_idx], Y[fit_idx], X[eval_idx]),
            Y[eval_idx],
        )
        for fit_idx, eval_idx in kfold
    ]

    mean_acc = np.mean([fold[0] for fold in per_fold])
    mean_auc = np.mean([fold[1] for fold in per_fold])
    return mean_acc, mean_auc

In [17]:
# Cross-validated (accuracy, AUC) for the random forest on the training split.
# The parenthesised single-argument form prints the same on Python 2 and 3.
print(cv_random_forest(X_train, Y_train, kfold))

(0.84438378825475602, 0.77548603541535654)


### Evaluate the models on the held-out test data

In [18]:
#input: Name of classifier, predicted labels, actual labels
def display_metrics(classifierName,Y_pred,Y_true):
    """Print a small report of the classification metrics for one classifier.

    Parameters
    ----------
    classifierName : str
        Label printed on the "Classifier:" line; it is concatenated onto a
        string, so it must be a string.
    Y_pred, Y_true
        Passed unchanged, in that order, to ``classification_metrics``.

    Prints to stdout and returns ``None``. Every ``print`` call takes a single
    parenthesised argument, so the output is the same on Python 2 and 3.
    """
    rule = "______________________________________________"

    print(rule)
    print("Classifier: "+classifierName)

    # The metrics are computed after the header is printed, as before. The
    # explicit 5-way unpack keeps the error if the tuple has the wrong length.
    acc, auc_, precision, recall, f1score = classification_metrics(Y_pred,Y_true)
    report = (
        ("Accuracy", acc),
        ("AUC", auc_),
        ("Precision", precision),
        ("Recall", recall),
        ("F1-score", f1score),
    )
    for label, value in report:
        print(label + ": " + str(value))

    print(rule)
    print("")

In [19]:
# Refit each baseline on the full training split and report its metrics on the
# held-out test split, in the same order as the sections above.
for classifier_name, predict_fn in (
    ("Logistic Regression", logistic_regression_pred),
    ("SVM", svm_pred),
    ("Decision Tree", decisionTree_pred),
    ("Random Forest", randomForest_pred),
):
    display_metrics(classifier_name, predict_fn(X_train, Y_train, X_test), Y_test)

______________________________________________
Classifier: Logistic Regression
Accuracy: 0.922879177378
AUC: 0.916920095408
Precision: 0.872180451128
Recall: 0.899224806202
F1-score: 0.885496183206
______________________________________________

______________________________________________
Classifier: SVM
Accuracy: 0.904884318766
AUC: 0.899552772809
Precision: 0.838235294118
Recall: 0.883720930233
F1-score: 0.860377358491
______________________________________________

______________________________________________
Classifier: Decision Tree
Accuracy: 0.917737789203
AUC: 0.915026833631
Precision: 0.85401459854
Recall: 0.906976744186
F1-score: 0.87969924812
______________________________________________

______________________________________________
Classifier: Random Forest
Accuracy: 0.820051413882
AUC: 0.744305307096
Precision: 0.893333333333
Recall: 0.519379844961
F1-score: 0.656862745098
______________________________________________

