In [1]:
from glob import glob
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import StratifiedShuffleSplit
import itertools
from collections import defaultdict

# Maps (data set path, model name, (alpha, fit_prior)) -> list of testing scores,
# with one score appended per train/test repeat.
all_model_scores = defaultdict(list)

# Hyperparameter grid; these are the parameters listed in the project document.
# The grid never changes, so it is built once instead of on every repeat.
# NOTE(review): alpha=0.0 requests no additive smoothing -- confirm the installed
# scikit-learn accepts that value without adjusting it.
alpha_list = [0.0, 0.1, 0.25, 0.5, 0.75, 1.0]
fit_prior_list = [True, False]

# For every data set in the data repository...
for dataset in glob('data/ecoli.csv.gz'):
    # Read the data set into memory
    input_data = pd.read_csv(dataset, compression='gzip')

    # 30 repeats; the repeat number doubles as the RNG seed for the split
    for dataset_repeat in range(1, 31):
        # Divide the data set into training and testing sets (75% / 25%, stratified
        # on the 'class' column), each time with a different RNG seed.
        # NOTE(review): this is the legacy sklearn.cross_validation calling convention
        # (labels passed to the constructor, `n_iter`); it has to be ported together
        # with the import if the notebook is moved to sklearn.model_selection.
        training_indices, testing_indices = next(iter(StratifiedShuffleSplit(input_data['class'].values,
                                                                             n_iter=1,
                                                                             train_size=0.75,
                                                                             test_size=0.25,
                                                                             random_state=dataset_repeat)))

        # The splitter returns row *positions*, so select rows positionally with
        # .iloc. Label-based .loc only happens to give the same rows while the
        # frame carries the default 0..n-1 index.
        training_rows = input_data.iloc[training_indices]
        testing_rows = input_data.iloc[testing_indices]

        training_features = training_rows.drop('class', axis=1).values
        training_classes = training_rows['class'].values

        testing_features = testing_rows.drop('class', axis=1).values
        testing_classes = testing_rows['class'].values

        # For every MultinomialNB parameter combination...
        for (alpha, fit_prior) in itertools.product(alpha_list, fit_prior_list):
            # Create and fit the model on the training data
            clf = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
            clf.fit(training_features, training_classes)

            # Store the model's score with the key (name of data set, model, model parameters)
            all_model_scores[(dataset,
                              'MultinomialNB',
                              (alpha, fit_prior))].append(clf.score(testing_features, testing_classes))

In [2]:
import os
import gzip

# Make sure the output directory exists before the results file is opened.
if not os.path.isdir('benchmark_results'):
    os.mkdir('benchmark_results')

# Write one tab-separated row per recorded testing score to a gzip-compressed file.
# NOTE: the file name spells "eoli" (sic). The cell that reads the results back
# opens this exact path, so both must be renamed together if the spelling is fixed.
with gzip.open('benchmark_results/MultinomialNB_eoli-benchmarks.tsv.gz', 'wb') as out_file:
    header_text = '\t'.join(['dataset',
                              'model',
                              'parameters',
                              'testing_score']) + '\n'
    # The file is opened in binary mode, so every line is encoded explicitly.
    out_file.write(header_text.encode('UTF-8'))

    for (dataset, model, params), testing_scores in all_model_scores.items():
        # Turn the data set path into a bare name: drop the directory part and the
        # '.csv.gz' extension explicitly instead of relying on fixed slice offsets,
        # which only work for paths shaped exactly like 'data/<name>.csv.gz'.
        dataset_name = os.path.basename(dataset)
        if dataset_name.endswith('.csv.gz'):
            dataset_name = dataset_name[:-len('.csv.gz')]

        # params is the (alpha, fit_prior) tuple used as part of the score key.
        # The trailing comma is part of the established output format.
        alpha, fit_prior = params
        param_string = 'alpha={},fit_prior={},'.format(alpha, fit_prior)

        for testing_score in testing_scores:
            out_text = '\t'.join([dataset_name,
                                  model,
                                  param_string,
                                  str(testing_score)]) + '\n'
            out_file.write(out_text.encode('UTF-8'))

In [4]:
import pandas as pd

# Load the gzip-compressed, tab-separated benchmark results back from disk;
# as the last expression of the cell, the resulting DataFrame is displayed.
pd.read_csv(
    'benchmark_results/MultinomialNB_eoli-benchmarks.tsv.gz',
    sep='\t',
    compression='gzip',
)

Unnamed: 0,dataset,model,parameters,testing_score
0,ecoli,MultinomialNB,"alpha=0.1,fit_prior=True,",0.439024
1,ecoli,MultinomialNB,"alpha=0.1,fit_prior=True,",0.439024
2,ecoli,MultinomialNB,"alpha=0.1,fit_prior=True,",0.439024
3,ecoli,MultinomialNB,"alpha=0.1,fit_prior=True,",0.439024
4,ecoli,MultinomialNB,"alpha=0.1,fit_prior=True,",0.439024
5,ecoli,MultinomialNB,"alpha=0.1,fit_prior=True,",0.439024
6,ecoli,MultinomialNB,"alpha=0.1,fit_prior=True,",0.439024
7,ecoli,MultinomialNB,"alpha=0.1,fit_prior=True,",0.439024
8,ecoli,MultinomialNB,"alpha=0.1,fit_prior=True,",0.439024
9,ecoli,MultinomialNB,"alpha=0.1,fit_prior=True,",0.439024
