In [76]:
import numpy as np
import pandas as pd
from sklearn import svm, linear_model
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn import preprocessing
import os
import glob
from itertools import product

In [77]:
files = os.path.join(os.getcwd(),"csv_files", "*.csv")
print(files)
datasets = glob.glob(files)
datasets

/home/pranav/Project/csv_files/*.csv


['/home/pranav/Project/csv_files/electricity_normalised.csv',
 '/home/pranav/Project/csv_files/pc4.csv',
 '/home/pranav/Project/csv_files/credit.csv',
 '/home/pranav/Project/csv_files/MagicTelescope.csv',
 '/home/pranav/Project/csv_files/irish.csv',
 '/home/pranav/Project/csv_files/pc1.csv',
 '/home/pranav/Project/csv_files/tic-tac-toe.csv',
 '/home/pranav/Project/csv_files/ionosphere.csv',
 '/home/pranav/Project/csv_files/diabetes.csv']

In [78]:
models = [linear_model.LogisticRegression(), linear_model.Perceptron()]

In [79]:
def read_csv(file_path):
    X = []
    dataframe = pd.read_csv(file_path)
    category = np.logical_not(
        np.logical_or(np.array(dataframe.dtypes == np.float64), np.array(dataframe.dtypes == np.int64)))
    for cat_cols in dataframe.select_dtypes(['object']).columns:
        dataframe[cat_cols] = dataframe[cat_cols].astype('category')
        dataframe[cat_cols] = pd.Categorical.from_array(dataframe[cat_cols]).codes
    for cat_cols in dataframe.select_dtypes(['bool']).columns:
        dataframe[cat_cols] = dataframe[cat_cols].astype('category')
        dataframe[cat_cols] = pd.Categorical.from_array(dataframe[cat_cols]).codes  # Have to see an alternative

    classes = np.array(dataframe[dataframe.columns[-1]])
    a_enc = pd.factorize(classes)
    Y = np.array(a_enc[0])
    for row in dataframe.as_matrix():
        row = np.array(row)
        X.append(np.array(row[0:len(row) - 1]).astype(float))
    X = np.array(X)
    return X, Y

In [80]:
def compute(model, dataset, folds =10 , seed=42):
    X, y = read_csv(dataset)
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    accs = np.zeros(folds)   
    f1_scores = np.zeros(folds)
    i = 0 
    for index_train, index_test in kf.split(X,y):
        model.fit(X[index_train], y[index_train])

        y_true = y[index_test]
        y_pred = model.predict(X[index_test])
        accs[i] = accuracy_score(y_true, y_pred)
        f1_scores[i] = f1_score(y_true, y_pred, average='micro')
        i+=1
    return accs, f1_scores 

In [81]:
shape = (len(datasets), len(models), 10)
accuracies = np.zeros(shape)
f1_scores = np.zeros(shape)
for i, d in enumerate(datasets):
    for j, m in enumerate(models):
        print(d)
        accuracies[i,j] = compute(m, d)[0]
        f1_scores[i,j] = compute(m, d)[1]

/home/pranav/Project/csv_files/electricity_normalised.csv




/home/pranav/Project/csv_files/electricity_normalised.csv
/home/pranav/Project/csv_files/pc4.csv




/home/pranav/Project/csv_files/pc4.csv
/home/pranav/Project/csv_files/credit.csv
/home/pranav/Project/csv_files/credit.csv
/home/pranav/Project/csv_files/MagicTelescope.csv
/home/pranav/Project/csv_files/MagicTelescope.csv
/home/pranav/Project/csv_files/irish.csv
/home/pranav/Project/csv_files/irish.csv
/home/pranav/Project/csv_files/pc1.csv
/home/pranav/Project/csv_files/pc1.csv
/home/pranav/Project/csv_files/tic-tac-toe.csv
/home/pranav/Project/csv_files/tic-tac-toe.csv
/home/pranav/Project/csv_files/ionosphere.csv
/home/pranav/Project/csv_files/ionosphere.csv
/home/pranav/Project/csv_files/diabetes.csv
/home/pranav/Project/csv_files/diabetes.csv


In [82]:
accuracies

array([[[ 0.75595763,  0.75198588,  0.75617829,  0.74382171,  0.75419241,
          0.72964026,  0.75369676,  0.7593819 ,  0.75562914,  0.76666667],
        [ 0.6760812 ,  0.71888791,  0.69218888,  0.64033539,  0.75022065,
          0.72875745,  0.6623262 ,  0.73620309,  0.47527594,  0.72251656]],

       [[ 0.89041096,  0.9109589 ,  0.93150685,  0.89726027,  0.93150685,
          0.89726027,  0.91780822,  0.91780822,  0.89655172,  0.91034483],
        [ 0.20547945,  0.87671233,  0.55479452,  0.87671233,  0.87671233,
          0.87671233,  0.87671233,  0.87671233,  0.22758621,  0.88275862]],

       [[ 0.73      ,  0.7       ,  0.68      ,  0.71      ,  0.74      ,
          0.71      ,  0.75      ,  0.74      ,  0.73      ,  0.72      ],
        [ 0.68      ,  0.7       ,  0.7       ,  0.7       ,  0.7       ,
          0.7       ,  0.7       ,  0.31      ,  0.3       ,  0.7       ]],

       [[ 0.97635313,  0.97214924,  0.9721346 ,  0.97528917,  0.97108307,
          0.9679285 ,  0.9

In [83]:
f1_scores

array([[[ 0.75595763,  0.75198588,  0.75617829,  0.74382171,  0.75419241,
          0.72964026,  0.75369676,  0.7593819 ,  0.75562914,  0.76666667],
        [ 0.6760812 ,  0.71888791,  0.69218888,  0.64033539,  0.75022065,
          0.72875745,  0.6623262 ,  0.73620309,  0.47527594,  0.72251656]],

       [[ 0.89041096,  0.9109589 ,  0.93150685,  0.89726027,  0.93150685,
          0.89726027,  0.91780822,  0.91780822,  0.89655172,  0.91034483],
        [ 0.20547945,  0.87671233,  0.55479452,  0.87671233,  0.87671233,
          0.87671233,  0.87671233,  0.87671233,  0.22758621,  0.88275862]],

       [[ 0.73      ,  0.7       ,  0.68      ,  0.71      ,  0.74      ,
          0.71      ,  0.75      ,  0.74      ,  0.73      ,  0.72      ],
        [ 0.68      ,  0.7       ,  0.7       ,  0.7       ,  0.7       ,
          0.7       ,  0.7       ,  0.31      ,  0.3       ,  0.7       ]],

       [[ 0.97635313,  0.97214924,  0.9721346 ,  0.97528917,  0.97108307,
          0.9679285 ,  0.9