In [7]:
from glob import glob
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
import itertools
from collections import defaultdict

# Maps (data set path, model name, parameter tuple) -> list of test-set
# accuracies, one entry per train/test repeat.
all_model_scores = defaultdict(list)

# Hyper-parameter grid (values listed in the project document). Defined once,
# outside the loops, because it does not depend on the data set or the repeat.
loss_list = ['deviance', 'exponential']
learning_rate_list = [0.01, 0.1, 0.5, 1.0, 10.0, 50.0, 100.0]
n_estimators_list = [10, 50, 100, 500, 1000]
max_depth_list = [1, 2, 3, 4, 5, 10, 20, 50, None]
max_features_list = [0.1, 0.25, 0.5, 0.75, 'sqrt', 'log2', None]
# NOTE(review): warm_start only matters when fit() is called repeatedly on the
# same estimator; every model below is freshly constructed, so both values are
# expected to give the same score -- confirm whether this axis is needed.
warm_start_list = [True, False]

# For every data set in the data repository...
for dataset in glob('data/ecoli.csv.gz'):
    # Read the data set into memory
    input_data = pd.read_csv(dataset, compression='gzip')

    # Split the frame into a feature matrix and a label vector once per data
    # set; the per-repeat splits below only select rows from these arrays.
    all_features = input_data.drop('class', axis=1).values
    all_classes = input_data['class'].values

    for dataset_repeat in range(1, 31):
        # Divide the data set into training and testing sets, each time with
        # a different RNG seed. This is the legacy sklearn.cross_validation
        # API: labels go to the constructor and the object itself is iterable.
        training_indices, testing_indices = next(iter(StratifiedShuffleSplit(all_classes,
                                                                             n_iter=1,
                                                                             train_size=0.75,
                                                                             test_size=0.25,
                                                                             random_state=dataset_repeat)))

        # The splitter yields positional indices, so index the NumPy arrays
        # directly instead of relying on label-based DataFrame.loc lookups.
        training_features = all_features[training_indices]
        training_classes = all_classes[training_indices]

        testing_features = all_features[testing_indices]
        testing_classes = all_classes[testing_indices]

        # For every GradientBoostingClassifier parameter combination...
        for (loss, learning_rate, n_estimators,
             max_depth, max_features, warm_start) in itertools.product(loss_list,
                                                                       learning_rate_list,
                                                                       n_estimators_list,
                                                                       max_depth_list,
                                                                       max_features_list,
                                                                       warm_start_list):
            # Create and fit the model on the training data
            clf = GradientBoostingClassifier(loss=loss,
                                             learning_rate=learning_rate,
                                             n_estimators=n_estimators,
                                             max_depth=max_depth,
                                             max_features=max_features,
                                             warm_start=warm_start)
            clf.fit(training_features, training_classes)

            # Store the model's score with the key (name of data set, model, model parameters)
            model_key = (dataset,
                         'GradientBoostingClassifier',
                         (loss, learning_rate, n_estimators, max_depth, max_features, warm_start))
            all_model_scores[model_key].append(clf.score(testing_features, testing_classes))

KeyboardInterrupt: 