# Stratified k-Fold Cross Validation

### Vectorizers:
TFIDF, Token Count

### Machine Learning Models:
- SVM (linear)
- SVM (RBF)
- Naive Bayes


### Description
These feature extraction methods and machine learning models are combined and run using stratified k-fold cross validation, with k = 2, 3, and 10. Feature statistics are also extracted for further analysis.

Import the required packages.

In [7]:
import os
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import pandas as pd
import pprint
from ast import literal_eval

from __future__ import print_function

Read the dataset, and create the data and label arrays.

In [2]:
# Load the corpus: the `text` column supplies the articles and the `class`
# column supplies their labels.
corpus = pd.read_csv('gm_fake_or_real_equal.csv')

# Plain Python lists, so that cross-validation folds can later be built by
# integer position (data[i] / labels[i]).
data = corpus['text'].tolist()
labels = corpus['class'].tolist()

# Single-argument print() behaves the same with or without
# `from __future__ import print_function`, so the cell survives a fresh
# Restart & Run All regardless of which cells ran first.
print("Total number of articles: " + str(len(data)))

### Feature Extraction

We extract the top features from each vectorizer.

In [None]:
# Fit a TF-IDF vectorizer limited to 25 terms and list the resulting vocabulary.
# Per the scikit-learn documentation, max_features keeps the terms with the
# highest term frequency across the corpus (not the highest TF-IDF weight).
# NOTE(review): the printed names follow the vectorizer's feature-index order,
# they are not ranked by importance.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True,
                             stop_words='english',
                             max_features=25,
                             # Tokens: two or more word characters, none a digit.
                             # Plain r'' (instead of the Python 2-only ur'')
                             # is valid on both Python 2 and Python 3.
                             token_pattern=r'(?u)\b[^\W\d][^\W\d]+\b')
tf_idf = vectorizer.fit_transform(data)

# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); switch if the installed version requires it.
features = vectorizer.get_feature_names()

# print() with a single argument works with and without print_function.
print("Top 25 features for TFIDF: ")
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(features)

In [None]:
# Fit a token-count vectorizer limited to 25 terms and list the resulting
# vocabulary. Per the scikit-learn documentation, max_features keeps the terms
# with the highest term frequency across the corpus.
vectorizer = CountVectorizer(min_df=5, max_df=0.8, stop_words='english',
                             max_features=25,
                             # Tokens: two or more word characters, none a digit.
                             # Plain r'' (instead of the Python 2-only ur'')
                             # is valid on both Python 2 and Python 3.
                             token_pattern=r'(?u)\b[^\W\d][^\W\d]+\b')
# Despite its name, `tf_idf` holds raw token counts here; the variable name is
# kept unchanged so anything else in the notebook that reads it still works.
tf_idf = vectorizer.fit_transform(data)

# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); switch if the installed version requires it.
features = vectorizer.get_feature_names()

# print() with a single argument works with and without print_function.
print("Top 25 features for Count: ")
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(features)

## Training and Validation

In [6]:
# Stratified k-fold cross validation (k = 2, 3 and 10) of a linear SVM,
# Multinomial Naive Bayes and an RBF SVM, on TF-IDF and on token-count features.

# Tokens are runs of two or more word characters, none of which is a digit.
# Plain r'' (instead of the Python 2-only ur'') works on Python 2 and 3.
TOKEN_PATTERN = r'(?u)\b[^\W\d][^\W\d]+\b'

kvalues = [2, 3, 10]
count = float(len(data))  # float, so the accuracy below is never an integer division

# (confusion-matrix label, accuracy label, factory for a fresh unfitted model),
# listed in the order the results are reported.
_MODEL_FACTORIES = [
    ("totalMatSvm", "totalsvm", LinearSVC),
    ("totalMatNB", "totalNB", MultinomialNB),
    # Only predict() is called on this model, so probability=True is omitted:
    # it makes every fit run an extra internal 5-fold cross validation and is
    # only needed for predict_proba().
    ("totalMatRbf", "totalrbf", lambda: SVC(C=1000)),
]


def _make_tfidf_vectorizer():
    """Return a fresh, unfitted TF-IDF vectorizer with the notebook's settings."""
    return TfidfVectorizer(min_df=5,
                           max_df=0.8,
                           sublinear_tf=True,
                           use_idf=True,
                           stop_words='english',
                           token_pattern=TOKEN_PATTERN)


def _make_count_vectorizer():
    """Return a fresh, unfitted token-count vectorizer with the notebook's settings."""
    return CountVectorizer(min_df=5,
                           max_df=0.8,
                           stop_words='english',
                           token_pattern=TOKEN_PATTERN)


def _cross_validate(make_vectorizer, kf):
    """Run one cross-validation pass over `data` / `labels` for every model.

    Parameters
    ----------
    make_vectorizer : callable
        Zero-argument factory returning an unfitted vectorizer; a new one is
        built for each fold.
    kf : object
        Splitter whose split(data, labels) yields (train_index, test_index).

    Returns
    -------
    list of (matrix_label, confusion_matrix, accuracy_label, correct_count)
        One entry per model, in _MODEL_FACTORIES order, summed over all folds.
    """
    # Fix the class order so every fold's confusion matrix has the same shape
    # and layout, even if a fold's predictions miss one of the classes.
    classes = np.unique(labels)
    matrices = [np.zeros((len(classes), len(classes))) for _ in _MODEL_FACTORIES]
    correct = [0] * len(_MODEL_FACTORIES)

    for train_index, test_index in kf.split(data, labels):
        X_train = [data[i] for i in train_index]
        X_test = [data[i] for i in test_index]
        y_train = [labels[i] for i in train_index]
        y_test = np.asarray([labels[i] for i in test_index])

        # The vectorizer is fitted on the training fold only and then applied
        # to the held-out fold.
        vectorizer = make_vectorizer()
        train_features = vectorizer.fit_transform(X_train)
        test_features = vectorizer.transform(X_test)

        for position, (_, _, make_model) in enumerate(_MODEL_FACTORIES):
            model = make_model()
            model.fit(train_features, y_train)
            predicted = model.predict(test_features)

            matrices[position] = matrices[position] + confusion_matrix(
                y_test, predicted, labels=classes)
            # Explicit array-vs-array comparison: number of correct predictions.
            correct[position] = correct[position] + np.sum(y_test == predicted)

    return [(matrix_label, matrices[position], accuracy_label, correct[position])
            for position, (matrix_label, accuracy_label, _)
            in enumerate(_MODEL_FACTORIES)]


def _report(results):
    """Print the summed confusion matrix and overall accuracy of each model."""
    for matrix_label, matrix, accuracy_label, correct_count in results:
        print(matrix_label + ": ")
        print(matrix)
        print(accuracy_label + ": ")
        print(correct_count / count)
        print("\n")


# Every print() below takes a single argument, which behaves identically with
# and without `from __future__ import print_function`.
for k in kvalues:

    print("*************************************************")
    print("Running validation on k = " + str(k))

    kf = StratifiedKFold(n_splits=k)

    print("\n")
    print("=============================================")
    print("Feature extraction: TFIDF\n")
    _report(_cross_validate(_make_tfidf_vectorizer, kf))

    print("=============================================")
    print("Feature extraction: Token Count")
    print("\n")
    _report(_cross_validate(_make_count_vectorizer, kf))

*************************************************
Running validation on k = 2


Feature extraction: TFIDF

Number of fake samples: 
1564
Number of real samples: 
1564
Number of fake tests: 
1564
Number of real tests: 
1564


KeyboardInterrupt: 