This is a basic template for the parameter sweeps you will be performing with scikit-learn.

We will need to add more data sets to the `data` directory. Also, a loop like the one below will need to be created for every model listed in the project document.

In [1]:
import itertools
import os
from collections import defaultdict
from glob import glob

import pandas as pd
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier

def dataset_name(path):
    """Return the name of a data set: the file name of `path` without directory or extension.

    `str.strip` removes a *set of characters* from both ends of a string, not a
    prefix or suffix, so `'data/adult.csv'.strip('data/').strip('.csv')` gives
    `'ult'`. Splitting the path with `os.path` keeps the name intact.
    """
    return os.path.splitext(os.path.basename(path))[0]


# Maps (name of data set, model class, model parameters) -> list of test-set scores,
# one score per train/test split of that data set.
all_model_scores = defaultdict(list)

# These are parameters listed in the project document. They are the same for every
# data set and every repeat, so they are defined once, outside the loops.
max_depth_list = [1, 2, 3, 4, 5, 10, 20, 50, None]
max_features_list = [0.1, 0.25, 0.5, 0.75, 'sqrt', 'log2', None]
criterion_list = ['gini', 'entropy']

# For every data set in the data repository (sorted so the run order is deterministic)...
for dataset in sorted(glob('data/*.csv')):
    # Read the data set into memory
    input_data = pd.read_csv(dataset)
    current_dataset_name = dataset_name(dataset)

    # Separate the features from the 'class' column once per data set
    all_features = input_data.drop('class', axis=1).values
    all_classes = input_data['class'].values

    for dataset_repeat in range(1, 31):
        # Divide the data set into a training and testing sets, each time with a different RNG seed.
        # NOTE(review): this is the legacy sklearn.cross_validation API (labels passed to the
        # constructor, `n_iter`); newer scikit-learn releases expose this class from
        # sklearn.model_selection with a different call signature -- confirm the target version.
        training_indices, testing_indices = next(iter(StratifiedShuffleSplit(all_classes,
                                                                             n_iter=1,
                                                                             train_size=0.75,
                                                                             test_size=0.25,
                                                                             random_state=dataset_repeat)))

        # The split yields row *positions*, so index the arrays positionally rather than
        # by DataFrame label (the two only coincide for a default integer index).
        training_features = all_features[training_indices]
        training_classes = all_classes[training_indices]

        testing_features = all_features[testing_indices]
        testing_classes = all_classes[testing_indices]

        # For every DecisionTreeClassifier parameter combination...
        for (max_depth, max_features, criterion) in itertools.product(max_depth_list,
                                                                      max_features_list,
                                                                      criterion_list):
            # Create and fit the model on the training data. The tree is seeded with the
            # repeat number so that re-running the sweep reproduces the same scores.
            clf = DecisionTreeClassifier(max_depth=max_depth,
                                         max_features=max_features,
                                         criterion=criterion,
                                         random_state=dataset_repeat)
            clf.fit(training_features, training_classes)

            # Store the model's score with the key (name of data set, model, model parameters)
            score_key = (current_dataset_name,
                         DecisionTreeClassifier,
                         (max_depth, max_features, criterion))
            all_model_scores[score_key].append(clf.score(testing_features, testing_classes))

In [2]:
# Display the collected scores, keyed by (name of data set, model, model parameters)
all_model_scores

defaultdict(list,
            {('Hill_Valley_with_noise',
              sklearn.tree.tree.DecisionTreeClassifier,
              (1, 0.1, 'entropy')): [0.51644736842105265,
              0.47697368421052633,
              0.5,
              0.48355263157894735,
              0.49342105263157893,
              0.51644736842105265,
              0.52302631578947367,
              0.49671052631578949,
              0.49671052631578949,
              0.53289473684210531,
              0.49342105263157893,
              0.5,
              0.53618421052631582,
              0.48684210526315791,
              0.50986842105263153,
              0.49342105263157893,
              0.54934210526315785,
              0.48355263157894735,
              0.49342105263157893,
              0.52302631578947367,
              0.52960526315789469,
              0.51315789473684215,
              0.49342105263157893,
              0.51973684210526316,
              0.49342105263157893,
              0.5065

Once the analysis is finished running, make sure to store all of the model scores (with their corresponding labels -- data set, classifier, and parameters) to a file!

Make sure to commit the notebook (with code and annotations) *and* the score data to this repository once the analysis is finished.