In [None]:
# Imports
import pandas as pd
import numpy as np
import math as mt
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Utility functions
def get_map(series):
    """Build a lookup that assigns every distinct value of `series` an integer code.

    Codes start at 0 and follow the order in which `series.unique()` yields
    the values.

    Args:
        series: object exposing `unique()` (e.g. a pandas Series).

    Returns:
        dict mapping each distinct value to its integer code.
    """
    return {value: code for code, value in enumerate(series.unique())}

@ignore_warnings(category=ConvergenceWarning)
def simulate(models, xTrain, yTrain, xTest, yTest):
    """Return the cross-validated percent error of every estimator in `models`.

    Each estimator is evaluated with `cross_validate(..., cv=3)` on the
    training data; its percent error is 100 * (1 - mean of the fold test
    scores). ConvergenceWarning is suppressed for the duration of the call.

    Args:
        models: iterable of estimators accepted by `cross_validate`.
        xTrain: training features.
        yTrain: training labels.
        xTest: accepted for interface compatibility; not used by this function.
        yTest: accepted for interface compatibility; not used by this function.

    Returns:
        list of percent errors, one per estimator, in the order of `models`.
    """
    def percent_error(estimator):
        fold_scores = cross_validate(estimator, xTrain, yTrain, cv=3)['test_score']
        return (1-(sum(fold_scores)/len(fold_scores)))*100.

    return [percent_error(estimator) for estimator in models]

# **Introduction** #
Supervised learning gives us an opportunity to apply mapping functions to training data in order to make predictions.  These predictions can help much larger artificial intelligence systems make better decisions.  There are several models that can be used to make predictions.  We will be introduced to five different machine learning models: kNN, Neural Network, Decision Tree, Boosting and SVC. We will experiment and examine the effectiveness of each model for two different classification problems.

In [None]:
# Read the raw mushroom and NASA asteroid CSV files
shroom_data = pd.read_csv('../input/mushroom-classification/mushrooms.csv')
nasa_data = pd.read_csv('../input/nasa-asteroids-classification/nasa.csv')

# Data Preparation
# Every mushroom column is replaced by the integer codes produced by get_map.
for name in shroom_data.columns:
    codes = get_map(shroom_data[name])
    shroom_data[name] = shroom_data[name].map(codes)

# For the NASA data only the 'Hazardous' label column is encoded.
hazard_codes = get_map(nasa_data['Hazardous'])
nasa_data['Hazardous'] = nasa_data['Hazardous'].map(hazard_codes)

# **Data Extraction** #
For the purpose of this project we will be doing a random 20/80 test to train split on our data.

In [None]:
# Fraction of every data set that is held out for testing (20/80 test/train).
split = .20
# Fixed seed so that Restart & Run All reproduces the same partitions.
SPLIT_SEED = 42

# Shroom Data Split
# Passing the fraction lets train_test_split derive both sizes itself, so they
# always add up to len(data); hand-computed ceil/floor sizes can overshoot by
# one because of floating point rounding (e.g. 15 * .2 -> 3.0000000000000004).
shroom_train, shroom_test = train_test_split(shroom_data, test_size=split, random_state=SPLIT_SEED)

# Nasa Data Split
nasa_train, nasa_test = train_test_split(nasa_data, test_size=split, random_state=SPLIT_SEED)

# **Feature Extraction** #
We will use some manual data mining and educated guessing to figure out which features will produce the best results.

In [None]:
# Feature Selection
# Every column is listed exactly once: a repeated label would be selected
# twice and handed to the models as two identical features.
nasa_ftrs = ['Absolute Magnitude', 'Est Dia in Miles(min)', 'Est Dia in Miles(max)', 'Miles per hour', 'Miss Dist.(miles)',
             'Orbit Uncertainity', 'Minimum Orbit Intersection', 'Jupiter Tisserand Invariant', 'Epoch Osculation',
             'Eccentricity', 'Semi Major Axis', 'Inclination', 'Asc Node Longitude', 'Orbital Period',
             'Perihelion Distance', 'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly', 'Mean Motion']
shroom_ftrs = ['cap-shape','cap-color','gill-size','gill-color', 'veil-type', 'veil-color', 'population']

# Feature Extraction
# Features are scaled by their column maxima. The maxima are taken from the
# training split only and reused for the test split, so both splits share one
# scale and nothing from the test data leaks into the preprocessing.
# A maximum of 0 is swapped for 1 to avoid dividing by zero.
shroom_scale = shroom_train[shroom_ftrs].max().replace(0, 1)
shroom_xTrain = shroom_train[shroom_ftrs] / shroom_scale
shroom_yTrain = shroom_train['class']
shroom_xTest  = shroom_test[shroom_ftrs] / shroom_scale
shroom_yTest  = shroom_test['class']

nasa_scale = nasa_train[nasa_ftrs].max()
nasa_xTrain = nasa_train[nasa_ftrs] / nasa_scale
nasa_yTrain = nasa_train['Hazardous']
nasa_xTest  = nasa_test[nasa_ftrs] / nasa_scale
nasa_yTest  = nasa_test['Hazardous']

# **Problem 1: Hazardous Asteroid Classification** #
**Description**

Everyone understands the potential impact asteroids can have on the planet.  After wiping out an entire species and reshaping our planet, we know they can be life threatening to anyone in the world. However, not all asteroids come in the same shape or size and identifying which are hazardous is very important.  Machine learning algorithms can help with this type of classification problem.  Utilizing NASA's Near Earth Object Web Service, an open API used to query asteroid information, we can apply machine learning models to classify hazardous asteroids.

**Results**

Below, we describe the initial results of our models.  Each model is from the 'sklearn' library.  We used each model's default settings as the initial results.  These default values are:
* DecisionTreeClassifier - Criterion=Entropy, Splitter=Best, MaxFeatures=None, MaxNodes=None
* MLPClassifier - HiddenLayers=100, ActivationFunction=Relu, Solver=adam, Alpha=0.0001, LearningRate=Constant(0.001), MaxIterations=100 beta1=0.9, beta2=0.999, epsilon=1e-8
* SVC - kernel=rbf, degree=3, gamma= 1/(n\_features * X.var())
* KNeighborsClassifier - N-Neighbors=5, Weights=uniform
* RandomForestClassifier - Criterion=Entropy, MaxFeature=auto, MaxLeafNodes=None, TreeCount=100

In [None]:
# Baseline estimators, listed in the same order as the chart labels below
nasa_models = [
    DecisionTreeClassifier(criterion='entropy'),
    MLPClassifier(max_iter=100),
    SVC(),
    KNeighborsClassifier(),
    RandomForestClassifier(criterion='entropy'),
]

# Baseline percent errors, keyed by data set name
baselineMap = {}
baselineMap['NASA'] = simulate(nasa_models, nasa_xTrain, nasa_yTrain, nasa_xTest, nasa_yTest)

# One row per model, plotted as a bar chart
nasa_df = pd.DataFrame(
    baselineMap['NASA'],
    columns=['NASA'],
    index=['Decision Tree', 'Neural Network', 'SVC', 'KNN', 'Boosting'],
)
nasa_baseline_chart = nasa_df.plot.bar(
    rot=0,
    title='Model Baseline Error',
    xlabel='Models',
    ylabel='Percent Error',
)

In [None]:
# Tabular view of the NASA baseline percent errors
pd.DataFrame(
    baselineMap['NASA'],
    columns=['NASA'],
    index=['Decision Tree', 'Neural Network', 'SVC', 'KNN', 'Boosting'],
)

# **Analysis** #

For the analysis, we will experiment with a variety of model configurations and analyze their effects on error.  Ideally, we would like to find the best combination of configurations which produces the ’best’ results.  Best in the context of the Asteroid Classification problem will be maximizing the average test score from a cross validation with three folds.

**DecisionTree Experiments:**

For the Decision Tree analysis, we experimented with four different configuration settings:  Criterion, Splitter Algorithm, Max Features and Max Leaf Nodes.

In [None]:
# Decision Tree settings that are swept one value at a time
dt_experiments = {
    # fraction of the features considered at each split
    'MaxFeatures': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    # upper bound on the number of leaf nodes
    'MaxLeafNodes': [10, 25, 50, 100, 150, 200, 400, 500, 1000, 1500, 2000],
}

def generate_DT_test_models():
    """Create the Decision Tree variants used by the experiments.

    The 'Criterion' and 'Splitter' variants are charted against the baseline
    tree (criterion='entropy', splitter='best'), so each of them changes
    exactly one setting relative to it. The splitter variant therefore keeps
    criterion='entropy'; otherwise the 'Best v Random' comparison would also
    switch the criterion to the gini default.

    'MaxFeatures' and 'MaxLeafNodes' hold one tree per value listed in
    `dt_experiments`.

    Returns:
        (experiment_models, experiment_results): two dicts sharing the same
        keys. The first maps an experiment name to its list of estimators,
        the second maps it to an empty list for the caller to fill in.
    """
    experiment_models = {
        'Criterion': [DecisionTreeClassifier(criterion='gini')],
        'MaxFeatures': [
            DecisionTreeClassifier(max_features=ratio)
            for ratio in dt_experiments.get('MaxFeatures', [])
        ],
        'Splitter': [DecisionTreeClassifier(criterion='entropy', splitter='random')],
        'MaxLeafNodes': [
            DecisionTreeClassifier(max_leaf_nodes=limit)
            for limit in dt_experiments.get('MaxLeafNodes', [])
        ],
    }
    experiment_results = {name: [] for name in experiment_models}

    return experiment_models, experiment_results

In [None]:
# Nasa Models: score every Decision Tree variant on the NASA training split
all_models, experiment_results = generate_DT_test_models()
for key, candidates in all_models.items():
    experiment_results[key].append(simulate(candidates, nasa_xTrain, nasa_yTrain, nasa_xTest, nasa_yTest))

# Criterion: Entropy v Gini (the entropy bar is the baseline Decision Tree)
df = pd.DataFrame(
    [baselineMap['NASA'][0], experiment_results['Criterion'][0][0]],
    columns=['NASA'],
    index=['Entropy', 'Gini'],
)
chart = df.plot.bar(rot=0, title='Entropy v Gini', xlabel='Criterion', ylabel='Percent Error')

# Splitter: Best v Random (the best bar is the baseline Decision Tree)
df = pd.DataFrame(
    [baselineMap['NASA'][0], experiment_results['Splitter'][0][0]],
    columns=['NASA'],
    index=['Best', 'Random'],
)
chart = df.plot.bar(rot=0, title='Best v Random', xlabel='Splitter', ylabel='Percent Error')

# Effects of Max Features
df = pd.DataFrame(
    experiment_results['MaxFeatures'][0],
    columns=['NASA'],
    index=dt_experiments['MaxFeatures'],
)
chart = df.plot(title='Effects of Max Features', xlabel='Max Features', ylabel='Percent Error')

# Effects of Max Leaf Nodes
df = pd.DataFrame(
    experiment_results['MaxLeafNodes'][0],
    columns=['NASA'],
    index=dt_experiments['MaxLeafNodes'],
)
chart = df.plot(title='Effects of Max Leaf Nodes', xlabel='# of Nodes', ylabel='Percent Error')

Analyzing the experiments, we see many interesting data points that can help improve our model. For the criterion of the model, we see that entropy is much better than gini. The splitter has more success with the best algorithm. The last two experiments help configure our tree. Max features is a ratio that tells the algorithm the maximum fraction of features to consider when looking for a split. We see that initially there is much larger error when considering fewer features, which is something we can logically assume. We want to find the sweet spot, which we would consider to be the lowest dip in the curve. We observe this point as somewhere between 0.6 and 0.7. Lastly, we look at the effects of limiting the maximum number of leaf nodes. Since we see a very low dip initially, we can assume that 10-50 should be an ideal configuration. 

**Boosting Experiments:**

For the Random Forest classifier analysis, we experimented with four different configuration settings:  Criterion, Number of Trees, Max Features and Max Leaf Nodes.

In [None]:
# Random Forest settings that are swept one value at a time
rf_experiments = {
    # number of trees in the forest
    'Trees': [1, 2, 5, 10, 20, 30, 40, 50, 75, 100],
    # fraction of the features considered at each split
    'MaxFeatures': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    # upper bound on the number of leaf nodes per tree
    'MaxLeafNodes': [10, 25, 50, 100, 150, 200, 400, 500, 1000, 1500, 2000],
}

def generate_RF_test_models():
    """Create the Random Forest variants used by the experiments.

    Every variant is a RandomForestClassifier. The 'Criterion' entry is a
    gini forest, so the entropy-vs-gini comparison measures the forest itself
    rather than a single decision tree. 'Trees', 'MaxFeatures' and
    'MaxLeafNodes' hold one forest per value listed in `rf_experiments`.

    Returns:
        (experiment_models, experiment_results): two dicts sharing the same
        keys. The first maps an experiment name to its list of estimators,
        the second maps it to an empty list for the caller to fill in.
    """
    experiment_models = {
        'Criterion': [RandomForestClassifier(criterion='gini')],
        'MaxFeatures': [
            RandomForestClassifier(max_features=ratio)
            for ratio in rf_experiments.get('MaxFeatures', [])
        ],
        'Trees': [
            RandomForestClassifier(n_estimators=count)
            for count in rf_experiments.get('Trees', [])
        ],
        'MaxLeafNodes': [
            RandomForestClassifier(max_leaf_nodes=limit)
            for limit in rf_experiments.get('MaxLeafNodes', [])
        ],
    }
    experiment_results = {name: [] for name in experiment_models}

    return experiment_models, experiment_results

In [None]:
# Nasa Models: score every Random Forest variant on the NASA training split
all_models, experiment_results = generate_RF_test_models()
for key in all_models.keys():
    experiment_results[key].append(simulate(all_models[key], nasa_xTrain, nasa_yTrain, nasa_xTest, nasa_yTest))

# Criterion: Entropy v Gini
# Index 4 of the baseline list is the Random Forest (index 0 is the Decision
# Tree), so the entropy bar comes from the same kind of model as the gini bar.
data = [ baselineMap['NASA'][4], experiment_results['Criterion'][0][0] ]
df = pd.DataFrame(data, columns=['NASA'], index=['Entropy', 'Gini'])
chart = df.plot.bar(rot=0, title='Entropy v Gini', xlabel='Criterion', ylabel='Percent Error')

# Effects of Max Features
data = experiment_results['MaxFeatures'][0]
df = pd.DataFrame(data, columns=['NASA'], index=rf_experiments['MaxFeatures'])
chart = df.plot(title='Effects of Max Features', xlabel='Max Features', ylabel='Percent Error')

# Effects of Max Leaf Nodes
data = experiment_results['MaxLeafNodes'][0]
df = pd.DataFrame(data, columns=['NASA'], index=rf_experiments['MaxLeafNodes'])
chart = df.plot(title='Effects of Max Leaf Nodes', xlabel='# of Nodes', ylabel='Percent Error')

# Effects of Tree Count
data = experiment_results['Trees'][0]
df = pd.DataFrame(data, columns=['NASA'], index=rf_experiments['Trees'])
chart = df.plot(title='Effects of Tree Count', xlabel='# of Trees', ylabel='Percent Error')

Observing the results of the experiments for the Random Forest Classifier, we see that there are several configurations that we can tune to make our model better. First, we look at the criterion. We see that entropy is much better than its gini counterpart. The effects of the max features configuration show that we should configure it near 0.4 or 0.7. Our max leaf nodes experiment shows a significant dip near 200 nodes and the error for our tree count is initially high but slowly levels out near 10 trees. 

**Neural Network Experiments:**

For the Neural Network analysis, we experimented with four different configuration settings: Learning Rate, Hidden Layers, Max Iterations and Momentum.

In [None]:
# Neural Network settings that are swept one value at a time
nn_experiments = {
    'LearningRates': [0.00001, 0.0000625, 0.0001, 0.000125, 0.00025, 0.0005, 0.001, 0.002, 0.004],
    'HiddenLayers': [1, 5, 10, 20, 25, 50, 100, 200],
    'MaxIterations': [1, 5, 10, 20, 25, 50, 100, 200, 400],
    'Momentum': [0.001, 0.1, 0.25, 0.45, 0.9, 0.99, 0.999, 0.9999],
}

def generate_NN_test_models():
    """Create one MLPClassifier per value listed in `nn_experiments`.

    Each experiment varies a single constructor argument:
      * 'LearningRates' -> learning_rate_init (max_iter=100)
      * 'HiddenLayers'  -> size of the single hidden layer (max_iter=100)
      * 'MaxIterations' -> max_iter
      * 'Momentum'      -> momentum with solver='sgd' (max_iter=100)

    Returns:
        (all_models, experiment_results): two dicts sharing the same keys.
        The first maps an experiment name to its list of estimators, the
        second maps it to an empty list for the caller to fill in.
    """
    builders = {
        'LearningRates': lambda rate: MLPClassifier(learning_rate_init=rate, max_iter=100),
        'HiddenLayers': lambda units: MLPClassifier(hidden_layer_sizes=(units,), max_iter=100),
        'MaxIterations': lambda iterations: MLPClassifier(max_iter=iterations),
        'Momentum': lambda momentum: MLPClassifier(solver='sgd', momentum=momentum, max_iter=100),
    }
    all_models = {name: [] for name in builders}
    experiment_results = {name: [] for name in builders}

    for name, values in nn_experiments.items():
        build = builders.get(name)
        if build is None:
            # experiments without a builder are skipped
            continue
        all_models[name].extend(build(value) for value in values)

    return all_models, experiment_results
                

In [None]:
# NASA Models: score every Neural Network variant on the NASA training split
all_models, experiment_results = generate_NN_test_models()
for key, candidates in all_models.items():
    experiment_results[key].append(simulate(candidates, nasa_xTrain, nasa_yTrain, nasa_xTest, nasa_yTest))

# Effects of Learning Rate
df = pd.DataFrame(
    experiment_results['LearningRates'][0],
    columns=['NASA'],
    index=nn_experiments['LearningRates'],
)
chart = df.plot(title='Effects of Learning Rate', xlabel='Learning Rate', ylabel='Percent Error')

# Effects of Hidden Layer
df = pd.DataFrame(
    experiment_results['HiddenLayers'][0],
    columns=['NASA'],
    index=nn_experiments['HiddenLayers'],
)
chart = df.plot(title='Effects of Hidden Layers', xlabel='# of Hidden Layers', ylabel='Percent Error')

# Effects of Max Iterations
df = pd.DataFrame(
    experiment_results['MaxIterations'][0],
    columns=['NASA'],
    index=nn_experiments['MaxIterations'],
)
chart = df.plot(title='Effects of Max Iterations', xlabel='# of Max Iterations', ylabel='Percent Error')

# Effects of Momentum
df = pd.DataFrame(
    experiment_results['Momentum'][0],
    columns=['NASA'],
    index=nn_experiments['Momentum'],
)
chart = df.plot(title='Effects of Momentum', xlabel='Momentum', ylabel='Percent Error')

Analyzing the experiments, we see many areas of improvement for our Neural Network. First, we look at the learning rate. We run the default configuration and modify the learning rate only to show the effects of this configuration. Interestingly enough, we see that as the learning rate increases, our percent error decreases, so we will want to increase our learning rate to improve our model. Next, we see very similar relationships for our hidden layers and max iteration configurations. Both of them decrease the percent error as we increase the value. We must proceed with caution though, for fear that we may overfit our data if the values are too high. Lastly, the most interesting experiment result was the effect of momentum. As you can see, as we increase momentum there is hardly any effect, and then from 0.9 to 1.0 we see a rather drastic decrease in percent error. 

**k-Nearest Neighbor Experiments:**

For the KNN analysis, we experimented with two different configuration settings: Varying K-Neighbors and Weights.

In [None]:
# k-Nearest Neighbor settings that are swept one value at a time
knn_experiments = {
    'VaryingK': [1, 5, 10, 20, 50, 100, 200, 500, 1000],
    'Weights': ['distance'],
}

def generate_KNN_test_models():
    """Create the KNeighborsClassifier variants used by the experiments.

    'VaryingK' holds one distance-weighted classifier per neighbour count in
    `knn_experiments['VaryingK']`; 'Weights' holds a single distance-weighted
    classifier with the default neighbour count.

    Returns:
        (all_models, experiment_results): two dicts sharing the same keys.
        The first maps an experiment name to its list of estimators, the
        second maps it to an empty list for the caller to fill in.
    """
    neighbour_counts = knn_experiments.get('VaryingK', [])
    all_models = {
        'VaryingK': [
            KNeighborsClassifier(n_neighbors=count, weights='distance')
            for count in neighbour_counts
        ],
        'Weights': [KNeighborsClassifier(weights='distance')],
    }
    experiment_results = {name: [] for name in all_models}

    return all_models, experiment_results

In [None]:
# Nasa Models: score every KNN variant on the NASA training split
all_models, experiment_results = generate_KNN_test_models()
for key, candidates in all_models.items():
    experiment_results[key].append(simulate(candidates, nasa_xTrain, nasa_yTrain, nasa_xTest, nasa_yTest))

# Effects of K
df = pd.DataFrame(
    experiment_results['VaryingK'][0],
    columns=['NASA'],
    index=knn_experiments['VaryingK'],
)
chart = df.plot(title='Effects of K', xlabel='K', ylabel='Percent Error')

# Weight: Uniform v Distance (the uniform bar is the baseline KNN, index 3)
df = pd.DataFrame(
    [baselineMap['NASA'][3], experiment_results['Weights'][0][0]],
    columns=['NASA'],
    index=['Uniform', 'Distance'],
)
chart = df.plot.bar(rot=0, title='Uniform v Distance', xlabel='Weight', ylabel='Percent Error')

For our KNN experiments, we first observe the effects of K. We can see an initial dip and then a drastic increase in percent error as K increases. We identify the best K as somewhere between 5 and 10. Our weight algorithm doesn't show much difference, but we will go with distance to be consistent with our K test.

**SVC Experiments:**

For the SVC analysis, we experimented with four different configuration settings: Kernel, Degree, Gamma and Max Iterations.

In [None]:
# SVC settings that are swept one value at a time
svc_experiments = {
    'Kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'Degree': [1, 2, 3, 4, 5, 6, 7, 8],
    'Gamma': ['auto'],
    'MaxIter': [1, 5, 10, 20, 25, 50, 100, 200, 400],
}

def generate_SVC_test_models():
    """Create one SVC per value listed in `svc_experiments`.

    Each experiment varies a single constructor argument:
      * 'Kernel'  -> kernel
      * 'Degree'  -> degree, with kernel='poly'
      * 'Gamma'   -> gamma
      * 'MaxIter' -> max_iter

    Returns:
        (all_models, experiment_results): two dicts sharing the same keys.
        The first maps an experiment name to its list of estimators, the
        second maps it to an empty list for the caller to fill in.
    """
    builders = {
        'Kernel': lambda kernel: SVC(kernel=kernel),
        'Degree': lambda degree: SVC(kernel='poly', degree=degree),
        'Gamma': lambda gamma: SVC(gamma=gamma),
        'MaxIter': lambda iterations: SVC(max_iter=iterations),
    }
    all_models = {name: [] for name in builders}
    experiment_results = {name: [] for name in builders}

    for name, values in svc_experiments.items():
        build = builders.get(name)
        if build is None:
            # experiments without a builder are skipped
            continue
        all_models[name].extend(build(value) for value in values)

    return all_models, experiment_results

In [None]:
# Nasa Models: score every SVC variant on the NASA training split
all_models, experiment_results = generate_SVC_test_models()
for key, candidates in all_models.items():
    experiment_results[key].append(simulate(candidates, nasa_xTrain, nasa_yTrain, nasa_xTest, nasa_yTest))

# Effects of Degree
df = pd.DataFrame(
    experiment_results['Degree'][0],
    columns=['NASA'],
    index=svc_experiments['Degree'],
)
chart = df.plot(title='Effects of Degree', xlabel='# of Degrees', ylabel='Percent Error')

# Effects of Max Iterations
df = pd.DataFrame(
    experiment_results['MaxIter'][0],
    columns=['NASA'],
    index=svc_experiments['MaxIter'],
)
chart = df.plot(title='Effects of Max Iterations', xlabel='# of Max Iterations', ylabel='Percent Error')

# Effects of Kernels (baseline SVC first, then one bar per kernel)
data = [baselineMap['NASA'][2], *experiment_results['Kernel'][0]]
df = pd.DataFrame(data, columns=['NASA'], index=['base', 'linear', 'poly', 'rbf', 'sigmoid'])
chart = df.plot.bar(rot=0, title='Effects of Kernels', xlabel='Kernel Type', ylabel='Percent Error')

# Effects of Gamma (the scale bar is the baseline SVC, index 2)
df = pd.DataFrame(
    [baselineMap['NASA'][2], experiment_results['Gamma'][0][0]],
    columns=['NASA'],
    index=['Scale', 'Auto'],
)
chart = df.plot.bar(rot=0, title='Scale v Auto', xlabel='Gamma Type', ylabel='Percent Error')

Analyzing our experiments for our SVC model, we see that there are many interesting data points that will help improve our model. First, we look at the effects of the kernel. We want to pick a kernel that will perform well on our data, and our initial indications show 'poly' as the best. We then follow up on this initial indication with an experiment using the 'poly' kernel, but varying the degrees. We see that 6 degrees shows a much lower percent error! Next, we look at the gamma. We see that scale is much better than auto, but auto with the 'poly' kernel is much better, so that is how we will configure our final model.

# **Conclusion** #

Our experiments show many interesting data points that we can learn from to improve our models. First, we look at our Decision Tree experiments and conclude that the combination of Gini criterion, Best splitter and 0.9 max features produces a much more accurate model. Next, the Neural Network will use 0.004 as the Learning Rate, 200 Hidden Layers, 0.999 momentum and 400 max iterations. Our SVC model will use the poly kernel with 6 degrees, the Random Forest Boost model will use 10 trees with 200 nodes and entropy while utilizing a max feature ratio of 0.4, and lastly our KNN model will use k=20 and distance as its weight algorithm. We can see from the chart below that each model has greatly decreased its percent error compared to its baseline.

In [None]:
# Tuned estimators, in the same order as the baseline models
# NOTE(review): the MLPClassifier keeps its default solver here; scikit-learn
# documents `momentum` as only used with solver='sgd' - confirm that the
# momentum setting has the intended effect.
nasa_models = [
    DecisionTreeClassifier(criterion='entropy', splitter='best', max_features=0.6),
    MLPClassifier(learning_rate_init=0.004, hidden_layer_sizes=200, max_iter=400, momentum=0.999),
    SVC(kernel='poly', degree=6),
    KNeighborsClassifier(n_neighbors=20, weights='distance'),
    RandomForestClassifier(n_estimators=10, max_leaf_nodes=200, max_features=0.4, criterion='entropy'),
]
improved = simulate(nasa_models, nasa_xTrain, nasa_yTrain, nasa_xTest, nasa_yTest)

# Put the tuned errors next to the baseline errors and chart both
imprv_df = pd.DataFrame(
    improved,
    columns=['Improved'],
    index=['Decision Tree', 'Neural Network', 'SVC', 'KNN', 'Boosting'],
).join(nasa_df)
nasa_baseline_chart = imprv_df.plot.bar(
    rot=0,
    title='Model Baseline Error',
    xlabel='Models',
    ylabel='Percent Error',
)

Lastly, we take time to compare the models themselves. We see that the Neural Network had the largest improvement and overall our Random Forest is the most accurate. We can explain this by observing the data itself and noticing many of the features are continuous values. Decision Trees and Neural Networks would perform much better on this type of data because these models are best at finding clear splits in the data. SVC and KNN would not perform as well because the data is not naturally clumping together.

# Problem 2: Edible Mushroom Classification #

**Description**

Evolution allows for species to naturally extend their senses through biological changes. Now, machine learning can also provide a similar level of extension for humans by classifying edible foods. In a life-threatening situation, understanding what you can eat to survive can be the difference between life and death. Machine learning algorithms can help with this type of classification problem. We will start with a simpler problem by classifying which mushrooms are edible.

**Results**

Below, we describe the initial results of our models.  Each model is from the 'sklearn' library.  We used each model's default settings as the initial results.  These default values are:
* DecisionTreeClassifier - Criterion=Entropy, Splitter=Best
* MLPClassifier - HiddenLayers=100, ActivationFunction=Relu, Solver=adam, Alpha=0.0001, LearningRate=Constant(0.001), MaxIterations=100 beta1=0.9, beta2=0.999, epsilon=1e-8
* SVC - kernel=rbf, degree=3, gamma= 1/(n\_features * X.var())
* KNeighborsClassifier - N-Neighbors=5, Weights=uniform
* RandomForestClassifier - Criterion=Entropy, MaxFeature=auto, MaxLeafNodes=None, TreeCount=100

In [None]:
# Shroom Models
# Baseline estimators, in the same order as the chart labels below. The Random
# Forest uses criterion='entropy' like the NASA baseline, so that the later
# 'Entropy v Gini' chart really has an entropy model behind its 'Entropy' bar.
shroom_models = [
    DecisionTreeClassifier(criterion='entropy'),
    MLPClassifier(max_iter=100),
    SVC(),
    KNeighborsClassifier(),
    RandomForestClassifier(criterion='entropy'),
]
baselineMap['MUSHROOM'] = simulate(shroom_models, shroom_xTrain, shroom_yTrain, shroom_xTest, shroom_yTest)

# One row per model, plotted as a bar chart
shroom_df = pd.DataFrame(baselineMap['MUSHROOM'], columns=['MUSHROOM'], index=['Decision Tree', 'Neural Network', 'SVC', 'KNN', 'Boosting'])
shroom_baseline_chart = shroom_df.plot.bar(rot=0, title='Model Baseline Error', xlabel='Models', ylabel='Percent Error')

In [None]:
# Tabular view of the mushroom baseline percent errors
pd.DataFrame(
    baselineMap['MUSHROOM'],
    columns=['MUSHROOM'],
    index=['Decision Tree', 'Neural Network', 'SVC', 'KNN', 'Boosting'],
)

# **Analysis** #

For the analysis, we will experiment with a variety of model configurations and analyze their effects on error. Ideally, we would like to find the best combination of configurations which produces the 'best' results. Best in the context of the Mushroom Classification problem will be maximizing the average test score from a cross validation with three folds.

**DecisionTree Experiments:**

For the Decision Tree analysis, we experimented with four different configuration settings:  Criterion, Splitter Algorithm, Max Features and Max Leaf Nodes.

In [None]:
# Shroom Models
all_models, experiment_results = generate_DT_test_models()
for key in all_models.keys():
    experiment_results[key].append(simulate(all_models[key], shroom_xTrain, shroom_yTrain, shroom_xTest, shroom_yTest))
    
# Criterion: Entropy v Gini
data = [ baselineMap['MUSHROOM'][0], experiment_results['Criterion'][0][0] ]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=['Entropy', 'Gini'])
chart = df.plot.bar(rot=0, title='Entropy v Gini', xlabel='Criterion', ylabel='Percent Error')

# Splitter: Best v Random
data = [ baselineMap['MUSHROOM'][0], experiment_results['Splitter'][0][0] ]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=['Best', 'Random'])
chart = df.plot.bar(rot=0, title='Best v Random', xlabel='Splitter', ylabel='Percent Error')

# Effects of Max Features
data = experiment_results['MaxFeatures'][0]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=dt_experiments['MaxFeatures'])
chart = df.plot(title='Effects of Max Features', xlabel='Ratio of Max Features', ylabel='Percent Error')

# Effects of Max Leaf Nodes
data = experiment_results['MaxLeafNodes'][0]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=dt_experiments['MaxLeafNodes'])
chart = df.plot(title='Effects of Max Leaf Nodes', xlabel='# of Nodes', ylabel='Percent Error')

The Decision Tree experiments show some interesting results. We notice that for this particular data set the max leaf nodes performance peaks at a much lower value than our previous experiments. We see a significant drop in percent error around 0.8 ratio of max features and unfortunately the difference between criterion and splitter algorithms didn't make much difference.

**Boosting Experiments:**

For the Random Forest classifier analysis, we experimented with four different configuration settings:  Criterion, Number of Trees, Max Features and Max Leaf Nodes.

In [None]:
# Shroom Models: score every Random Forest variant on the mushroom training
# split (these charts are labelled MUSHROOM, so the mushroom data must be used).
all_models, experiment_results = generate_RF_test_models()
for key in all_models.keys():
    experiment_results[key].append(simulate(all_models[key], shroom_xTrain, shroom_yTrain, shroom_xTest, shroom_yTest))

# Criterion: Entropy v Gini
# Index 4 of the baseline list is the Random Forest (index 0 is the Decision
# Tree), so the baseline bar comes from the same kind of model as the gini bar.
data = [ baselineMap['MUSHROOM'][4], experiment_results['Criterion'][0][0] ]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=['Entropy', 'Gini'])
chart = df.plot.bar(rot=0, title='Entropy v Gini', xlabel='Criterion', ylabel='Percent Error')

# Effects of Max Features
data = experiment_results['MaxFeatures'][0]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=rf_experiments['MaxFeatures'])
chart = df.plot(title='Effects of Max Features', xlabel='Max Features', ylabel='Percent Error')

# Effects of Max Leaf Nodes
data = experiment_results['MaxLeafNodes'][0]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=rf_experiments['MaxLeafNodes'])
chart = df.plot(title='Effects of Max Leaf Nodes', xlabel='# of Nodes', ylabel='Percent Error')

# Effects of Tree Count
data = experiment_results['Trees'][0]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=rf_experiments['Trees'])
chart = df.plot(title='Effects of Tree Count', xlabel='# of Trees', ylabel='Percent Error')

For the Random Forest Classifier experiments, we notice a lot of great data points to improve our model. First we observe the criterion. Gini is significantly better than entropy for this data set so we will definitely configure our model to reflect this improvement. Next, we see a significant dip in max feature ratio around 0.5. Lastly, we see two significant dips around 500 nodes for max leaf nodes and 20 trees. 

**Neural Network Experiments:**

For the Neural Network analysis, we experimented with four different configuration settings: Learning Rate, Hidden Layers, Max Iterations and Momentum.

In [None]:
# Shroom Models
all_models, experiment_results = generate_NN_test_models()
for key in all_models.keys():
    experiment_results[key].append(simulate(all_models[key], shroom_xTrain, shroom_yTrain, shroom_xTest, shroom_yTest))

# Effects of Learning Rate
data = experiment_results['LearningRates'][0]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=nn_experiments['LearningRates'])
chart = df.plot(title='Effects of Learning Rate', xlabel='Learning Rate', ylabel='Percent Error')

# Effects of Hidden Layer
data = experiment_results['HiddenLayers'][0]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=nn_experiments['HiddenLayers'])
chart = df.plot(title='Effects of Hidden Layers', xlabel='# of Hidden Layers', ylabel='Percent Error')

# Effects of Max Iterations
data = experiment_results['MaxIterations'][0]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=nn_experiments['MaxIterations'])
chart = df.plot(title='Effects of Max Iterations', xlabel='# of Max Iterations', ylabel='Percent Error')

# Effects of Momentum
data = experiment_results['Momentum'][0]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=nn_experiments['Momentum'])
chart = df.plot(title='Effects of Momentum', xlabel='Momentum', ylabel='Percent Error')

For our Neural Network experiments, we notice a few of the same results as with our previous data set. As the learning rate increases, the percent error greatly decreases. We see that the more layers you have, the more precise the algorithm will be, and momentum has a much more profound effect the closer we are to 1.0. Lastly, we observe the effects of max iterations. Just like in the previous experiments, we see that the more iterations we give our algorithm to train, the better the outcomes it will produce.

**k-Nearest Neighbor Experiments:**

For the KNN analysis, we experimented with two different configuration settings: Varying K-Neighbors and Weights.

In [None]:
# Shroom Models
all_models, experiment_results = generate_KNN_test_models()
for key in all_models.keys():
    experiment_results[key].append(simulate(all_models[key], shroom_xTrain, shroom_yTrain, shroom_xTest, shroom_yTest))
    
# Effects of K
data = experiment_results['VaryingK'][0]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=knn_experiments['VaryingK'])
chart = df.plot(title='Effects of K', xlabel='K', ylabel='Percent Error')

# Weight: Uniform v Distance
data = [ baselineMap['MUSHROOM'][3], experiment_results['Weights'][0][0] ]
df = pd.DataFrame(data, columns=['MUSHROOM'], index=['Uniform', 'Distance'])
chart = df.plot.bar(rot=0, title='Uniform v Distance', xlabel='Weight', ylabel='Percent Error')

The KNN experiments for this classification problem were the most interesting out of all of the experiments because of how well the model was able to predict. We see that the effects of K can be profound. When the values are too small, we know that the data set is more complex than just a few clusters. For this data set, we see that with the distance algorithm the lowest percent error is achieved as K approaches 50.

**SVC Experiments:**

For the SVC analysis, we experimented with four different configuration settings: Kernel, Degree, Gamma and Max Iterations.

In [None]:
# Shroom Models
# Build the SVC experiment model groups, then cross-validate each group on the
# mushroom training data and record its list of percent errors.
all_models, experiment_results = generate_SVC_test_models()
for key in all_models:
    experiment_results[key].append(
        simulate(all_models[key], shroom_xTrain, shroom_yTrain, shroom_xTest, shroom_yTest)
    )

# Effects of Degree
data = experiment_results['Degree'][0]
df = pd.DataFrame(
    data,
    index=svc_experiments['Degree'],
    columns=['MUSHROOM'],
)
chart = df.plot(
    title='Effects of Degree',
    xlabel='# of Degrees',
    ylabel='Percent Error',
)

# Effects of Max Iterations
data = experiment_results['MaxIter'][0]
df = pd.DataFrame(
    data,
    index=svc_experiments['MaxIter'],
    columns=['MUSHROOM'],
)
chart = df.plot(
    title='Effects of Max Iterations',
    xlabel='# of Max Iterations',
    ylabel='Percent Error',
)

# Effects of Kernels
# The stored baseline value comes first, followed by one value per kernel tried.
# NOTE(review): the five hard-coded labels assume exactly four kernel results
# in this order -- confirm against generate_SVC_test_models.
data = [baselineMap['MUSHROOM'][2]]
data.extend(experiment_results['Kernel'][0])
df = pd.DataFrame(
    data,
    index=['base', 'linear', 'poly', 'rbf', 'sigmoid'],
    columns=['MUSHROOM'],
)
chart = df.plot.bar(
    rot=0,
    title='Effects of Kernels',
    xlabel='Kernel Type',
    ylabel='Percent Error',
)

# Effects of Gamma
# The stored baseline value is labelled 'Scale'; the single 'Gamma' experiment
# result is labelled 'Auto'.
data = [
    baselineMap['MUSHROOM'][2],
    experiment_results['Gamma'][0][0],
]
df = pd.DataFrame(
    data,
    index=['Scale', 'Auto'],
    columns=['MUSHROOM'],
)
chart = df.plot.bar(
    rot=0,
    title='Scale v Auto',
    xlabel='Gamma Type',
    ylabel='Percent Error',
)

As we observe the SVC experiments, we notice that we are not seeing much performance gain. The gamma type does not clearly indicate which setting would drastically improve our model, so we keep the default, 'scale'. Next, we look at the kernel type. Many of the kernels fall within the 20-25 percent error range, so there is still no clear winner. We then decide to go with the 'poly' kernel and observe the effects of varying the degree. When analyzing these variations, we notice that a higher degree produces a lower percent error, so we choose a degree of 8.

# **Conclusion** #
Our experiments show many interesting data points for all of our models. We noticed that our Decision Tree was much better with a max feature ratio around 0.8. This implies that we need to consider around 80% of our features to have the best outcome. Next, we look at the Neural Network. We find many great data points that will improve our model. We decide to configure our learning rate to be 0.004, with a hidden layer size of 200, 100 max iterations and a momentum of 0.999. Our SVC model showed a much better performance using a poly kernel of degree 8. Our Random Forest also showed some performance improvement using the Gini criterion. Lastly, our KNN experiments show that 50 is the best configuration for K and that distance is the best weight algorithm. Below we can observe the improvements shown by tuning our models using what we've learned from our experiments:

In [None]:
# Tuned models, listed in the same order as the row labels used for the chart below.
# NOTE(review): scikit-learn documents MLPClassifier's `momentum` as only used
# with solver='sgd'; no solver is set here, so confirm the setting has an effect.
shroom_models = [
    DecisionTreeClassifier(max_features=0.8),
    MLPClassifier(learning_rate_init=0.004, hidden_layer_sizes=200, max_iter=100, momentum=0.999),
    SVC(kernel='poly', degree=8),
    KNeighborsClassifier(n_neighbors=50, weights='distance'),
    RandomForestClassifier(criterion='gini'),
]
improved = simulate(shroom_models, shroom_xTrain, shroom_yTrain, shroom_xTest, shroom_yTest)

# Put the tuned errors next to the columns of shroom_df; join() aligns rows on
# the index labels.
# NOTE(review): the fifth model is a RandomForestClassifier but is labelled
# 'Boosting'; the label is kept so it still matches shroom_df's index --
# confirm before renaming it in both places.
df = pd.DataFrame(
    improved,
    index=['Decision Tree', 'Neural Network', 'SVC', 'KNN', 'Boosting'],
    columns=['Improved'],
).join(shroom_df)
shroom_baseline_chart = df.plot.bar(
    rot=0,
    title='Model Baseline v Improved',
    xlabel='Models',
    ylabel='Percent Error',
)

Finally, we take time to compare and contrast our models and determine how well they classify edible mushrooms. We see that our SVC and Neural Network had the best performance improvements, while the Decision Tree, Random Forest and KNN have a much lower percent error. We can explain this by looking at the data itself. We observe that many of the data points are discrete values for each of the features. If modified correctly during feature extraction, we can manipulate the data to have these discrete values cluster very well. This would work well for the KNN model and is why I believe the KNN model would be best for this type of classification problem.