# Your Name: Stephanie Buchanan

# import all packages 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import (train_test_split, GridSearchCV, 
                                     cross_val_score, validation_curve)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, zero_one_loss, classification_report, 
                             confusion_matrix, precision_score,recall_score, 
                             f1_score, plot_roc_curve, plot_confusion_matrix)
from sklearn.ensemble import RandomForestClassifier

# Data Prepocessing

My research question is: Are decision trees and random forest good models for predicting whether to use the autolander or not based on stable positiong, sign or error, wind sign wind strength and visability?  The outcome variable is 'use' and all others are inputs.

In [3]:
#dataset
shuttle = pd.read_csv('shuttle.csv', index_col= 0)
print(shuttle.dtypes)
print(shuttle.nunique())
#check if any data missing
shuttle.isnull().sum()

stability    object
error        object
sign         object
wind         object
magn         object
vis          object
use          object
dtype: object
stability    2
error        4
sign         2
wind         2
magn         4
vis          2
use          2
dtype: int64


stability    0
error        0
sign         0
wind         0
magn         0
vis          0
use          0
dtype: int64

# Data Splitting 

In [4]:
X_shuttle = pd.get_dummies(shuttle.drop(['use'], axis = 1))
y_shuttle = (shuttle.use == 'auto').astype(int)

X_shuttle_train, X_shuttle_test, y_shuttle_train, y_shuttle_test = train_test_split(X_shuttle, y_shuttle, 
                                                                                    test_size=0.30, 
                                                                                    random_state=42)

# Model Building and Evaluation 

In [5]:
dtree_shuttle = tree.DecisionTreeClassifier()
dtree_shuttle.fit(X_shuttle_train, y_shuttle_train)

forest_shuttle = RandomForestClassifier()
forest_shuttle.fit(X_shuttle_train, y_shuttle_train)

#hyperparamter tuning for decision tree model
param_grid_shuttle = {'max_depth':range(1,dtree_shuttle.tree_.max_depth), 'max_features': [8, 10, 12, 14, 16] }
grid_search_shuttle = GridSearchCV(dtree_shuttle, param_grid_shuttle, cv=8, scoring='accuracy',return_train_score=True)

grid_search_shuttle.fit(X_shuttle_test, y_shuttle_test)

GridSearchCV(cv=8, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 7),
                         'max_features': [8, 10, 12, 14, 16]},
             return_train_score=True, scoring='accuracy')

In [7]:
param_grid_shuttle_for = [{'n_estimators':[3,10,30], 'max_features':[8, 10, 12, 14, 16]},
                          {'bootstrap': [False], 'n_estimators':[3,10], 'max_features':[8, 10, 12, 14, 16]}]
grid_search_shuttle_for = GridSearchCV(forest_shuttle, param_grid_shuttle_for, cv=8, scoring='accuracy',return_train_score=True)
grid_search_shuttle_for.fit(X_shuttle_test, y_shuttle_test)

GridSearchCV(cv=8, estimator=RandomForestClassifier(),
             param_grid=[{'max_features': [8, 10, 12, 14, 16],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False],
                          'max_features': [8, 10, 12, 14, 16],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='accuracy')

In [8]:
best_dtree = grid_search_shuttle.best_estimator_
best_forest = grid_search_shuttle_for.best_estimator_

In [9]:
#accuracy for decision tree model
best_dtree.fit(X_shuttle_train, y_shuttle_train)
y_shuttle_train_pred = best_dtree.predict(X_shuttle_train)
y_shuttle_test_pred = best_dtree.predict(X_shuttle_test)


dt_accuracy_train = accuracy_score(y_shuttle_train, y_shuttle_train_pred)
dt_accuracy_test = accuracy_score(y_shuttle_test, y_shuttle_test_pred)
print('best tree model accuracy on training set: ', round(dt_accuracy_train, 5) )
print('best tree model accuracy on test set: ', round(dt_accuracy_test,5))

best tree model accuracy on training set:  0.95531
best tree model accuracy on test set:  0.98701


In [10]:
#accuracy for random forest model
best_forest.fit(X_shuttle_train, y_shuttle_train)
y_shuttle_train_pred_for = best_forest.predict(X_shuttle_train)
y_shuttle_test_pred_for = best_forest.predict(X_shuttle_test)


for_accuracy_train = accuracy_score(y_shuttle_train, y_shuttle_train_pred_for)
for_accuracy_test = accuracy_score(y_shuttle_test, y_shuttle_test_pred_for)
print('best random forest model accuracy on training set: ', round(for_accuracy_train, 5) )
print('best random forest model accuracy on test set: ', round(for_accuracy_test,5))

best random forest model accuracy on training set:  1.0
best random forest model accuracy on test set:  0.98701


# Conclusion 

Are decision trees and random forest good models for predicting whether to use the autolander or not based on stable positiong, sign or error, wind sign wind strength and visability?  The data was sourced from https://vincentarelbundock.github.io/Rdatasets/articles/data.html and contains 6 variables and one categorical outcome.  

A decision tree and random forest model were fit to the training data after splitting into train/test.   Both models had acceptable accuracy on the train and test sets with the random forest model resulting in the higher accuracy, although overfitting may be present.

There does not seem to be overfitting in the decision tree model as it have a 95.5% accuracy on the training set and 98.7% accuracy on the test set.  The accuracy of the random forest model is 100%, so this is an indication of overfitting.  The accuracy on the test set for the random forest is 98.7%, which is also high, so overfitting is not a big concern. The random forest model is the better model for classification for this dataset since it had the higher accuracy on the training set and the accuracy on the test set was the same for both the decision tree and random forest models.  