#### Stratified 5X2 cross validation for selected 15 review studies from the TREC 2004 dataset

In [29]:
#import required library packages

import scipy
import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score
from scipy.stats.distributions import randint
from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
#from sklearn.utils import shuffle

print ("Required Libraries loaded.")

Required Libraries loaded.


In [45]:
# Import the data and explore the first few rows
# Import the data and explore the first few rows

#load the dataset and stor in variable e.g. 'data_holder'
inhibitor  = #pd.read_csv("data\location\file.csv", sep=",")
#inhibitor = shuffle(inhibitor, random_state = 33)
header = inhibitor.columns.values
inhibitor.head()

Unnamed: 0,PMID,Label,ace,activity,acute,admission,allocated,ami,angina,angiotensinconverting,...,treatment,trend,trial,ventricular,versus,volume,within,year,years,york
0,7504126,E,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,8247194,E,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,8247909,E,1,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,8249842,E,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
4,8252860,E,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [44]:
# Convert to numpy array and check the dimensionality
npArray = np.array(inhibitor)
print(npArray.shape)

(2498L, 212L)


#### Split the data into input features, X, and outputs, y

In [32]:
# Split to input matrix X and class vector y
X = npArray[:,2:].astype(float)
y = npArray[:,1]

# Convert the categorical label to numeric values, and print the y frequencies
le = preprocessing.LabelEncoder()
y  = le.fit_transform(y)  #"E" = 0. "I" = 1

yFreq = scipy.stats.itemfreq(y)
print(yFreq)

[[   0 2320]
 [   1  178]]


In [33]:
#function to set sample weight
def func(x):
    x = np.where(x==0, 1, 4)
    return x

In [34]:
#SVM classifier using 5X2 CV 

svm_clf = SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,decision_function_shape=None, degree=3, 
              gamma='auto', kernel='rbf',max_iter=-1, probability=False, random_state=None, shrinking=True,
              tol=0.001, verbose=False)


seed = 67

from sklearn.cross_validation import StratifiedKFold


svm_precision = []
svm_recall = []
svm_f = []

for i in range(0, 5):
    
    skf = StratifiedKFold(y, n_folds=2, shuffle=True, random_state=seed)

    for train_index, test_index in skf:
    
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        w = func(y_train)
        wt = func(y_test)
        
               
        svm_clf.fit(X_train, y_train, sample_weight = w)
        svm_pred = svm_clf.predict(X_test)
        s_prec = precision_score(y_test, svm_pred, labels=None, pos_label=1, average='binary', sample_weight=wt)
        s_recall = recall_score(y_test, svm_pred, labels=None, pos_label=1, average='binary', sample_weight=wt)
        s_f1 = f1_score(y_test, svm_pred, labels=None, pos_label=1, average='binary', sample_weight=wt)
        
        svm_precision.append(s_prec)
        svm_recall.append(s_recall)
        svm_f.append(s_f1)
#print (svm_precision)    
        
print ("SVM Cross validation results: ")
print ("           SVM ")
print ("Precision: %.2f "%(np.mean(svm_precision)))
print ("   Recall: %.2f " %(np.mean(svm_recall)))
print ("       F1: %.2f " %(np.mean(svm_f)))

[0.41999999999999998, 0.39884393063583817, 0.41999999999999998, 0.39884393063583817, 0.41999999999999998, 0.39884393063583817, 0.41999999999999998, 0.39884393063583817, 0.41999999999999998, 0.39884393063583817]
SVM Cross validation results: 
           SVM 
Precision: 0.41 
   Recall: 0.74 
       F1: 0.53 


In [35]:
#perceptron classifier using 5X2 CV
seed = 67
p_clf = Perceptron(penalty = 'l1',alpha = 0.001,fit_intercept = True, n_iter = 1, shuffle = True, verbose = 0, 
                   eta0 = 0.1, n_jobs = 1, random_state = 0, class_weight = 'balanced', warm_start = False)


from sklearn.cross_validation import StratifiedKFold


p_precision = []
p_recall = []
p_f = []


for i in range(0, 5):
   # seed = 21 + i
    skf = StratifiedKFold(y, n_folds=2, shuffle=True, random_state=seed)

    for train_index, test_index in skf:
    
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        w = func(y_train)
        wt = func(y_test)
        
        p_clf.fit(X_train, y_train, sample_weight = w)
        p_pred = p_clf.predict(X_test)
        prec = precision_score(y_test, p_pred, labels=None, pos_label=1, average='binary', sample_weight=wt)
        recall = recall_score(y_test, p_pred, labels=None, pos_label=1, average='binary', sample_weight=wt)
        f1 = f1_score(y_test, p_pred, labels=None, pos_label=1, average='binary', sample_weight=wt)
        
        p_precision.append(prec)
        p_recall.append(recall)
        p_f.append(f1)
        
            
#print (p_precision)
        
print ("Perceptron Cross validation results: ")
print ("        Perceptron    ")
print ("Precision: %.2f     " %(np.mean(p_precision)))
print ("   Recall: %.2f     " %(np.mean(p_recall)))
print ("       F1: %.2f     " %(np.mean(p_f)))

[0.50659630606860162, 0.39267886855241263, 0.50659630606860162, 0.39267886855241263, 0.50659630606860162, 0.39267886855241263, 0.50659630606860162, 0.39267886855241263, 0.50659630606860162, 0.39267886855241263]
Perceptron Cross validation results: 
        Perceptron    
Precision: 0.45     
   Recall: 0.60     
       F1: 0.51     
