# Stratified k-Fold Cross Validation

### Vectorizers:
TFIDF, Token Count

### Machine Learning Models:
SVM (linear)
SVM (RBF)
Naive Bayes
Random Forest


### Description
These feature extraction methods and machine learning models are combined and run using stratified k-fold cross validation, with values of k = 2, 3, and 10. Feature statistics are also extracted for further analysis.

Import the required packages.

In [5]:
import os
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import pandas as pd
import pprint
from ast import literal_eval

from __future__ import print_function


def train(k, data, labels):
    """Evaluate Multinomial Naive Bayes with stratified k-fold cross validation.

    For every fold a TF-IDF vectorizer and a token-count vectorizer are fitted
    on the training documents only; a separate ``MultinomialNB`` is trained on
    each representation and scored on the held-out documents.  The per-fold
    confusion matrices and correct-prediction counts are summed over all folds
    and printed, together with the overall accuracy.

    Parameters
    ----------
    k : int
        Number of folds passed to ``StratifiedKFold``.
    data : sequence
        Documents, indexable by integer position and accepted by the
        vectorizers' ``fit_transform``.
    labels : sequence
        Class label of each document, aligned with ``data``.

    Returns
    -------
    None
        All results are printed to stdout.
    """
    kf = StratifiedKFold(n_splits=k)

    # Derive the accuracy denominator from the arguments instead of relying
    # on a module-level ``count`` variable defined in some other cell.
    n_samples = float(len(labels))

    # Pin the class order (sorted unique labels) so every fold produces a
    # confusion matrix of the same shape, whatever the number of classes.
    classes = np.unique(labels)
    n_classes = len(classes)

    names = ("TFIDF", "Count")
    correct = dict((name, 0) for name in names)            # Accuracy measure
    matrices = dict((name, np.zeros((n_classes, n_classes)))
                    for name in names)                     # Confusion matrix

    # Plain raw string: the ``ur`` prefix is a SyntaxError on Python 3 and the
    # compiled pattern is the same on Python 2.
    token_pattern = r'(?u)\b[^\W\d][^\W\d]+\b'

    for train_index, test_index in kf.split(data, labels):
        X_train = [data[i] for i in train_index]
        X_test = [data[i] for i in test_index]
        y_train = [labels[i] for i in train_index]
        y_test = np.asarray([labels[i] for i in test_index])

        # Fresh vectorizers per fold so the vocabulary and IDF weights never
        # see the held-out documents.
        vectorizers = {
            "TFIDF": TfidfVectorizer(min_df=5,
                                     max_df=0.8,
                                     sublinear_tf=True,
                                     use_idf=True,
                                     stop_words='english',
                                     token_pattern=token_pattern),
            "Count": CountVectorizer(min_df=5,
                                     max_df=0.8,
                                     stop_words='english',
                                     token_pattern=token_pattern),
        }

        for name in names:
            vectorizer = vectorizers[name]
            X_train_vec = vectorizer.fit_transform(X_train)
            X_test_vec = vectorizer.transform(X_test)

            nbModel = MultinomialNB()
            nbModel.fit(X_train_vec, y_train)
            nbResult = nbModel.predict(X_test_vec)

            matrices[name] = matrices[name] + confusion_matrix(
                y_test, nbResult, labels=classes)
            correct[name] = correct[name] + np.sum(y_test == nbResult)

    for name in names:
        print("Confusion Matrix for Naive Bayes - " + name)
        print(matrices[name])
        print("Accuracy: ")
        print(correct[name] / n_samples)
        print()


def trainSVM(k, X_train, X_test, y_train, y_test):
    """Grid-search an SVC over RBF and linear kernels and print the results.

    For each of the metrics 'precision' and 'recall' a ``GridSearchCV`` with
    ``cv=k`` and macro-averaged scoring is fitted on the training split.  The
    best parameters, the mean score (+/- two standard deviations) of every
    candidate, and a classification report on the test split are printed.

    Parameters
    ----------
    k : int
        Number of cross-validation folds used by the grid search.
    X_train, X_test
        Feature matrices for the development and evaluation splits.
    y_train, y_test
        Labels aligned with ``X_train`` and ``X_test``.
    """
    print ("*************** SVM ***************")
    print ()

    # Two sub-grids: the RBF kernel also searches gamma, the linear one only C.
    rbf_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                'C': [1, 10, 100, 1000]}
    linear_grid = {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
    search_space = [rbf_grid, linear_grid]

    for metric in ('precision', 'recall'):
        print("# Tuning hyper-parameters %s" % metric)
        print()

        search = GridSearchCV(SVC(), search_space, cv=k,
                              scoring='%s_macro' % metric)
        search.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print()
        print(search.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        results = search.cv_results_
        rows = zip(results['mean_test_score'],
                   results['std_test_score'],
                   results['params'])
        for avg, spread, candidate in rows:
            print("%0.3f (+/-%0.03f) for %r"
                  % (avg, spread * 2, candidate))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        predicted = search.predict(X_test)
        print(classification_report(y_test, predicted))
        print()

        
def trainRF(k, X_train, X_test, y_train, y_test):
    """Grid-search a shallow random forest and print the results.

    A ``RandomForestClassifier(max_depth=2, random_state=0)`` is tuned over
    ``n_estimators`` and ``max_features``.  For each of the metrics
    'precision' and 'recall' a ``GridSearchCV`` with ``cv=k`` and
    macro-averaged scoring is fitted on the training split.  The best
    parameters, the mean score (+/- two standard deviations) of every
    candidate, and a classification report on the test split are printed.

    Parameters
    ----------
    k : int
        Number of cross-validation folds used by the grid search.
    X_train, X_test
        Feature matrices for the development and evaluation splits.
    y_train, y_test
        Labels aligned with ``X_train`` and ``X_test``.
    """
    print ("*************** RF ***************")
    print ()

    forest = RandomForestClassifier(max_depth=2, random_state=0)

    search_space = {
        'n_estimators': [250, 500, 750, 1000, 1250, 1500],
        'max_features': ['auto', 'sqrt', 'log2']
    }

    for metric in ('precision', 'recall'):
        print("# Tuning hyper-parameters %s" % metric)
        print()

        search = GridSearchCV(estimator=forest, param_grid=search_space,
                              cv=k, scoring='%s_macro' % metric)
        search.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print()
        print(search.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        results = search.cv_results_
        rows = zip(results['mean_test_score'],
                   results['std_test_score'],
                   results['params'])
        for avg, spread, candidate in rows:
            print("%0.3f (+/-%0.03f) for %r"
                  % (avg, spread * 2, candidate))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        predicted = search.predict(X_test)
        print(classification_report(y_test, predicted))
        print()

Read the dataset, and create the data and label arrays.

In [4]:
# Load the corpus; the 'text' column holds the articles and 'class' their labels.
corpus = pd.read_csv('gm_equal.csv')

# Materialise both columns as plain Python lists so they can be indexed by
# integer position later on.
data = list(corpus['text'])
labels = list(corpus['class'])

print ("Total number of articles: " + str(len(data)))

Total number of articles: 6246


## Training and Validation

In [6]:
# Stratified k-fold validation (k = 2, 3, 10) with Multinomial NB, followed by
# SVM and Random Forest grid searches on a fixed 50/50 hold-out split.

kvalues = [2, 3, 10]
# Module-level total; kept because code in other cells may read ``count`` as
# a global.
count = float(len(data))

# The hold-out split (random_state=0) and both feature matrices do not depend
# on k, so they are built once instead of being recomputed for every k.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.5, random_state=0)

# Plain raw string: the ``ur`` prefix is a SyntaxError on Python 3 and the
# compiled pattern is the same on Python 2.
token_pattern = r'(?u)\b[^\W\d][^\W\d]+\b'

tfidfVectorizer = TfidfVectorizer(min_df=5,
                                  max_df=0.8,
                                  sublinear_tf=True,
                                  use_idf=True,
                                  stop_words='english',
                                  token_pattern=token_pattern)

# Fit on the development half only; the evaluation half is just transformed.
X_train_tfidf = tfidfVectorizer.fit_transform(X_train)
X_test_tfidf = tfidfVectorizer.transform(X_test)

countVectorizer = CountVectorizer(min_df=5,
                                  max_df=0.8,
                                  stop_words='english',
                                  token_pattern=token_pattern)

X_train_count = countVectorizer.fit_transform(X_train)
X_test_count = countVectorizer.transform(X_test)

for k in kvalues:

    print ("*************************************************")
    print ("Running validation on k = " + str(k))
    print ()

    # Naive Bayes: stratified k-fold over the whole corpus.
    train(k, data, labels)

    # SVM and Random Forest: k-fold grid search on the development half,
    # final report on the evaluation half.
    print ("********** TFIDF **********")
    print ()

    trainSVM(k, X_train_tfidf, X_test_tfidf, y_train, y_test)
    trainRF(k, X_train_tfidf, X_test_tfidf, y_train, y_test)

    print ("********** COUNT **********")
    print ()

    trainSVM(k, X_train_count, X_test_count, y_train, y_test)
    trainRF(k, X_train_count, X_test_count, y_train, y_test)
    

*************************************************
Running validation on k = 2

Confusion Matrix for Naive Bayes - TFIDF
[[ 2934.   192.]
 [  574.  2546.]]
Accuracy: 
0.877361511367

Confusion Matrix for Naive Bayes - Count
[[ 2764.   362.]
 [  403.  2717.]]
Accuracy: 
0.877521613833

********** TFIDF **********

*************** SVM ***************

# Tuning hyper-parameters precision



  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'kernel': 'linear', 'C': 10}

Grid scores on development set:

0.250 (+/-0.000) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.250 (+/-0.000) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.250 (+/-0.000) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.250 (+/-0.000) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.879 (+/-0.010) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.250 (+/-0.000) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.922 (+/-0.010) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.879 (+/-0.010) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
0.920 (+/-0.011) for {'kernel': 'linear', 'C': 1}
0.924 (+/-0.011) for {'kernel': 'linear', 'C': 10}
0.919 (+/-0.020) for {'kernel': 'linear', 'C': 100}
0.919 (+/-0.020) for {'kernel': 'linear', 'C': 1000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    

In [None]:
# Stand-alone Random Forest grid search with 5-fold cross validation.
# NOTE(review): ``X`` and ``y`` are not defined in any visible cell; they must
# be provided (feature matrix and labels) before this cell can run.
rfc = RandomForestClassifier(max_depth=2, random_state=0)
#RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 

param_grid = { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X, y)
# print() call form: the statement form is a SyntaxError once
# ``from __future__ import print_function`` is active (and on Python 3).
print(CV_rfc.best_params_)


In [3]:
# Random Forest grid search for k = 2, 3, 10 on both feature representations,
# using a fixed 50/50 hold-out split.

kvalues = [2,3,10]
# Module-level total; kept because code in other cells may read ``count`` as
# a global.
count = float(len(data))

# The hold-out split (random_state=0) and both feature matrices do not depend
# on k, so they are built once instead of being recomputed for every k.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.5, random_state=0)

# Plain raw string: the ``ur`` prefix is a SyntaxError on Python 3 and the
# compiled pattern is the same on Python 2.
token_pattern = r'(?u)\b[^\W\d][^\W\d]+\b'

tfidfVectorizer = TfidfVectorizer(min_df=5,
                                  max_df=0.8,
                                  sublinear_tf=True,
                                  use_idf=True,
                                  stop_words='english',
                                  token_pattern=token_pattern)

X_train_tfidf = tfidfVectorizer.fit_transform(X_train)
X_test_tfidf = tfidfVectorizer.transform(X_test)

countVectorizer = CountVectorizer(min_df=5,
                                  max_df=0.8,
                                  stop_words='english',
                                  token_pattern=token_pattern)

X_train_count = countVectorizer.fit_transform(X_train)
X_test_count = countVectorizer.transform(X_test)

# One entry per representation: (banner, development matrix, evaluation matrix).
# The COUNT entry uses the count matrices; the earlier copy-pasted version
# fitted and predicted on the TF-IDF matrices in both sections.
feature_sets = [
    ("********** TFIDF **********", X_train_tfidf, X_test_tfidf),
    ("********** COUNT **********", X_train_count, X_test_count),
]

param_grid = {
    #"n_estimators" : [9, 18, 27, 36, 45, 54, 63],
    #"max_depth" : [1, 5, 10, 15, 20, 25, 30],
    #"min_samples_leaf" : [1, 2, 4, 6, 8, 10],
    'n_estimators': [250, 500, 750, 1000, 1250, 1500],
    'max_features': ['auto', 'sqrt', 'log2'],
}

scores = ['precision', 'recall']

for k in kvalues:

    print ("*************************************************")
    print ("Running validation on k = " + str(k))
    print ()

    print ("*************** Random Forests Grid Search ***************")
    print ()

    for banner, X_train_vec, X_test_vec in feature_sets:
        print (banner)
        print ()

        rfc = RandomForestClassifier(max_depth=2, random_state=0)
        #RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 

        for score in scores:
            print("# Tuning hyper-parameters %s" % score)
            print()

            clf = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=k,
                               scoring='%s_macro' % score)
            clf.fit(X_train_vec, y_train)

            print("Best parameters set found on development set:")
            print()
            print(clf.best_params_)
            print()
            print("Grid scores on development set:")
            print()
            means = clf.cv_results_['mean_test_score']
            stds = clf.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds, clf.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r"
                      % (mean, std * 2, params))
            print()

            print("Detailed classification report:")
            print()
            print("The model is trained on the full development set.")
            print("The scores are computed on the full evaluation set.")
            print()
            y_true, y_pred = y_test, clf.predict(X_test_vec)
            print(classification_report(y_true, y_pred))
            print()
    

*************************************************
Running validation on k = 2

*************** Random Forests Grid Search ***************

********** TFIDF **********

# Tuning hyper-parameters precision

Best parameters set found on development set:

{'max_features': 'log2', 'n_estimators': 1250}

Grid scores on development set:

0.801 (+/-0.017) for {'max_features': 'auto', 'n_estimators': 250}
0.801 (+/-0.004) for {'max_features': 'auto', 'n_estimators': 500}
0.802 (+/-0.005) for {'max_features': 'auto', 'n_estimators': 750}
0.799 (+/-0.006) for {'max_features': 'auto', 'n_estimators': 1000}
0.799 (+/-0.004) for {'max_features': 'auto', 'n_estimators': 1250}
0.797 (+/-0.006) for {'max_features': 'auto', 'n_estimators': 1500}
0.801 (+/-0.017) for {'max_features': 'sqrt', 'n_estimators': 250}
0.801 (+/-0.004) for {'max_features': 'sqrt', 'n_estimators': 500}
0.802 (+/-0.005) for {'max_features': 'sqrt', 'n_estimators': 750}
0.799 (+/-0.006) for {'max_features': 'sqrt', 'n_estimators':