# Stratified k-Fold Cross Validation


### Machine Learning Models:
SVM (linear)
SVM (RBF)
Naive Bayes


### Description
These feature extraction methods and machine learning models are combined and run using stratified k-fold cross validation, with k = 2, 3, and 10. Feature statistics are also extracted for further analysis.

Import the required packages.

In [1]:
import os
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import StratifiedKFold

import pandas as pd
import pprint
from ast import literal_eval

Read the dataset, and create the data and label arrays.

In [24]:
# Load the table; the 'class' column is the target and every remaining
# column is used as a feature.
corpus = pd.read_csv('gm_test_liwc_all.csv')

# pop() returns the target column and removes it from `corpus` in one step,
# so the frame that is turned into rows below holds features only.
labels = list(corpus.pop('class'))
data = corpus.values.tolist()

# Single-argument print call: behaves the same on Python 2 and Python 3.
print("Total number of documents: " + str(len(data)))

Total number of documents: 6294


## Training and Validation

In [26]:
# Stratified k-fold cross validation (k = 2, 3 and 10) with a linear SVM,
# Multinomial Naive Bayes and an RBF-kernel SVM.
#
# For every k, each model is refit on the training part of every fold and
# evaluated on the held-out part. Correct predictions and confusion matrices
# are summed over the folds, so the printed accuracy is
# (total correct predictions) / (total number of documents).
#
# NOTE(review): in the recorded output the RBF SVM assigns almost every
# document to one class (accuracy ~0.54). This may be caused by unscaled
# features -- TODO confirm, e.g. by standardising the inputs of the SVMs.

kvalues = [2, 3, 10]
# Denominator for the accuracies. Uses `data` (the original referenced an
# undefined `data2`); float() keeps the division exact on Python 2.
count = float(len(data))

# Arrays let each fold be selected directly with the index arrays that
# StratifiedKFold.split() yields.
data_arr = np.asarray(data)
label_arr = np.asarray(labels)

# Passing a fixed label list to confusion_matrix keeps every per-fold matrix
# the same shape and class order, so they can be summed safely.
classes = np.unique(label_arr)
n_classes = len(classes)

for k in kvalues:

    print("*************************************************")
    print("Running validation on k = " + str(k))

    kf = StratifiedKFold(n_splits=k)

    print("\n")
    print("=============================================")

    # Number of correct predictions, summed over the folds.
    totalsvm = 0
    totalNB = 0
    totalrbf = 0
    # Confusion matrices, summed over the folds.
    totalMatSvm = np.zeros((n_classes, n_classes))
    totalMatNB = np.zeros((n_classes, n_classes))
    totalMatRbf = np.zeros((n_classes, n_classes))

    for train_index, test_index in kf.split(data_arr, label_arr):
        X_train, X_test = data_arr[train_index], data_arr[test_index]
        y_train, y_test = label_arr[train_index], label_arr[test_index]

        # Fresh estimators per fold so nothing carries over between folds.
        model1 = LinearSVC()
        model2 = MultinomialNB()
        model3 = SVC()  # default kernel is RBF

        model1.fit(X_train, y_train)
        model2.fit(X_train, y_train)
        model3.fit(X_train, y_train)

        result1 = model1.predict(X_test)
        result2 = model2.predict(X_test)
        result3 = model3.predict(X_test)

        totalMatSvm = totalMatSvm + confusion_matrix(y_test, result1, labels=classes)
        totalMatNB = totalMatNB + confusion_matrix(y_test, result2, labels=classes)
        totalMatRbf = totalMatRbf + confusion_matrix(y_test, result3, labels=classes)

        # Element-wise comparison of two arrays; the sum is the hit count.
        totalsvm = totalsvm + np.sum(y_test == result1)
        totalNB = totalNB + np.sum(y_test == result2)
        totalrbf = totalrbf + np.sum(y_test == result3)

    print("totalMatSvm: ")
    print(totalMatSvm)
    print("totalsvm: ")
    print(totalsvm / count)
    print("\n")
    print("totalMatNB: ")
    print(totalMatNB)
    print("totalNB: ")
    print(totalNB / count)
    print("\n")
    print("totalMatRbf: ")
    print(totalMatRbf)
    print("totalrbf: ")
    print(totalrbf / count)
    print("\n")
    

*************************************************
Running validation on k = 2


totalMatSvm: 
[[ 3098.    73.]
 [  124.  2999.]]
totalsvm: 
0.968700349539


totalMatNB: 
[[ 2362.   809.]
 [  337.  2786.]]
totalNB: 
0.817921830315


totalMatRbf: 
[[  241.  2930.]
 [    0.  3123.]]
totalrbf: 
0.534477279949


*************************************************
Running validation on k = 3


totalMatSvm: 
[[ 3099.    72.]
 [  152.  2971.]]
totalsvm: 
0.96441054973


totalMatNB: 
[[ 2362.   809.]
 [  338.  2785.]]
totalNB: 
0.81776294884


totalMatRbf: 
[[  276.  2895.]
 [    0.  3123.]]
totalrbf: 
0.540038131554


*************************************************
Running validation on k = 10


totalMatSvm: 
[[ 3022.   149.]
 [   99.  3024.]]
totalsvm: 
0.960597394344


totalMatNB: 
[[ 2360.   811.]
 [  337.  2786.]]
totalNB: 
0.817604067366


totalMatRbf: 
[[  283.  2888.]
 [    0.  3123.]]
totalrbf: 
0.541150301875


