#Attempt No. 01#

---

#Inspiration#
+ What are the best machine learning ensembles/methods for classifying the animals based upon the variables given?

#Dataset#
+ The dataset consists of 101 animals from a zoo. 
+ There are 16 variables with various traits to describe the animals. 
+ The 7 Class Types are: Mammal, Bird, Reptile, Fish, Amphibian, Bug and Invertebrate

##zoo.csv##
+ This csv contains the dataset.
+ Attribute Information: (name of attribute and type of value domain)
> + animal_name: Unique for each instance
> + hair Boolean
> + feathers Boolean
> + eggs Boolean
> + milk Boolean
> + airborne Boolean
> + aquatic Boolean
> + predator Boolean
> + toothed Boolean
> + backbone Boolean
> + breathes Boolean
> + venomous Boolean
> + fins Boolean
> + legs Numeric (set of values: {0,2,4,5,6,8})
> + tail Boolean
> + domestic Boolean
> + catsize Boolean
> + class_type Numeric (integer values in range [1,7])

##class.csv##
+ This csv describes the dataset.
> + Class_Number Numeric (integer values in range [1,7])
> + Number_Of_Animal_Species_In_Class Numeric
> + Class_Type character -- The actual word description of the class
> + Animal_Names character -- The list of the animals that fall in the category of the class

#Acknowledgements#
+ UCI Machine Learning: https://archive.ics.uci.edu/ml/datasets/Zoo

## Load libraries ##

In [None]:
import seaborn
import numpy
import sys

from pandas import read_csv
from pandas import set_option
from matplotlib import pyplot

from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import VarianceThreshold

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

## Check the available data files ##

In [None]:
# Show which data files are present in the input directory.
from subprocess import check_output

_input_listing = check_output(["ls", "../input"])
print(_input_listing.decode("utf8"))

## Load data ##

In [None]:
# Read both input files, in this order:
#   class.csv -> dataframe [_df_class]
#   zoo.csv   -> dataframe [_df_zoo]
_df_class, _df_zoo = (
    read_csv(_csv_path)
    for _csv_path in ('../input/class.csv', '../input/zoo.csv')
)

## Analyze data - descriptive statistics ##

In [None]:
# preview of dataframe [_df_class]: its first 5 rows
_df_class.head(n=5)

In [None]:
# preview of dataframe [_df_zoo]: its first 5 rows
_df_zoo.head(n=5)

**As dataset [_df_class] is only the explanation of column [class_type] in dataset [_df_zoo], we'll focus on dataset [_df_zoo]**

In [None]:
# (rows, cols) of dataframe [_df_zoo]; the bare expression is shown as the cell output
_df_zoo.shape

In [None]:
# data type of every column of dataframe [_df_zoo]
_df_zoo.dtypes

**Apart from [animal_name], all columns are numeric, so we might not need much data preprocessing**

In [None]:
# Display floats with 2 decimal places in dataframe output.
# The fully qualified option name is used on purpose: the bare pattern
# 'precision' also matches 'styler.format.precision' in newer pandas
# releases, which makes set_option raise OptionError (multiple keys matched).
set_option('display.precision', 2)

In [None]:
# summary statistics of dataframe [_df_zoo]
_df_zoo.describe()

In [None]:
# Pairwise Pearson correlation of the numeric columns.
# [_df_zoo] still contains the string column [animal_name] at this point;
# recent pandas versions raise an error instead of silently skipping such
# columns, so they are filtered out explicitly before calling corr().
_df_zoo.select_dtypes(include=['number', 'bool']).corr(method='pearson')

In [None]:
# class distribution: number of rows for each value of [class_type]
_rows_by_class = _df_zoo.groupby('class_type')
_rows_by_class.size()

**Classes are not balanced**

## Visualize data - uni-variate ##

In [None]:
# density plots, one subplot per column, arranged on a 4x5 grid
_df_zoo.plot.density(
    subplots=True,
    layout=(4, 5),
    figsize=(13, 20),
    sharex=False,
    sharey=False,
)
pyplot.show()

In [None]:
# box plots, one subplot per column, arranged on a 4x5 grid
_df_zoo.plot.box(
    subplots=True,
    layout=(4, 5),
    figsize=(13, 20),
    sharex=False,
    sharey=False,
)
pyplot.show()

## Visualize data - multi-variate ##

In [None]:
def plot_correlation_map( df ):
    """Draw an annotated heatmap of the pairwise correlations in *df*.

    Only numeric and boolean columns take part. Other columns (for example
    a string column such as [animal_name]) are filtered out first, because
    recent pandas versions raise an error in ``DataFrame.corr`` instead of
    silently skipping them.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are correlated with the default method of
        ``DataFrame.corr``.

    Returns
    -------
    None
        The heatmap is drawn on a newly created 14x12 figure.
    """
    corr = df.select_dtypes( include = [ 'number' , 'bool' ] ).corr()
    _ , ax = pyplot.subplots( figsize =( 14 , 12 ) )
    # diverging palette so that negative and positive correlations get different hues
    cmap = seaborn.diverging_palette( 220 , 10 , as_cmap = True )
    _ = seaborn.heatmap(
        corr, 
        cmap = cmap,
        square=True, 
        cbar_kws={ 'shrink' : .9 }, 
        ax=ax, 
        annot = True,  # write the coefficient into every cell
        annot_kws = { 'fontsize' : 12 }
    )

In [None]:
# correlation heatmap of dataframe [_df_zoo] before any feature is removed
plot_correlation_map(_df_zoo)

**Strongly correlated features (|r| > 0.80):**

+ hair/milk 0.88
+ hair/eggs -0.82
+ eggs/milk -0.94

**Removing features hair and eggs: both are strongly correlated with milk, which is kept**

In [None]:
# remove the two features that are strongly correlated with [milk]
_df_zoo = _df_zoo.drop(columns=['hair', 'eggs'])

In [None]:
# correlation heatmap again, now without the features [hair] and [eggs]
plot_correlation_map(_df_zoo)

## Prepare data ##

**Removing the non-numeric column [animal_name] from the dataframe**

In [None]:
# The string column [animal_name] is removed from the dataframe itself.
# Converting the full dataframe to an array first and skipping the column
# afterwards (e.g. _array[:,1:17]) would leave numpy with dtype object
# instead of int64 for the whole array.
_df_zoo = _df_zoo.drop(columns='animal_name')
_df_zoo.head(n=5)

In [None]:
# underlying numpy array of the remaining columns of [_df_zoo]
_array = _df_zoo.values

In [None]:
# sanity check of [_array]: first 10 rows, then length, type, shape, ndim and dtype name
_array_info = (len(_array), type(_array), _array.shape, _array.ndim, _array.dtype.name)
print(_array[:10, :], *_array_info)

In [None]:
# Feature matrix: every column except the last one.
# [class_type] is the last column of the data, so slicing relative to the end
# keeps working when the number of dropped feature columns changes
# (the previous fixed index 14 did not).
_X = _array[:, :-1]

In [None]:
# sanity check of [_X]: first 10 rows, then length, type, shape, ndim and dtype name
_X_info = (len(_X), type(_X), _X.shape, _X.ndim, _X.dtype.name)
print(_X[:10, :], *_X_info)

**[Removing features with low variance](http://scikit-learn.org/stable/modules/feature_selection.html)**

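Dropping every feature that has the same value (one or zero) in more than 80% of the samples. For a boolean feature the variance is p(1 - p), so the cut-off is 0.8 * (1 - 0.8) = 0.16. Note that the same threshold is also applied to the numeric feature [legs].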


In [None]:
# keep only the features whose variance exceeds .8 * (1 - .8)
_variance_selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
_X = _variance_selector.fit_transform(_X)

In [None]:
# sanity check of [_X] after feature selection: first 10 rows, then length, type, shape, ndim and dtype name
_X_selected_info = (len(_X), type(_X), _X.shape, _X.ndim, _X.dtype.name)
print(_X[:10, :], *_X_selected_info)

In [None]:
# Target: the last column ([class_type]), kept two-dimensional here.
# Slicing relative to the end keeps working when the number of dropped
# feature columns changes (the previous fixed index 14 did not).
_y = _array[:, -1:]

In [None]:
# sanity check of [_y]: first 5 entries, then length, type, shape, ndim and dtype name
_y_info = (len(_y), type(_y), _y.shape, _y.ndim, _y.dtype.name)
print(_y[:5], *_y_info)

In [None]:
# flatten the target column into a one-dimensional array
_y = _y.ravel()

In [None]:
# sanity check of the flattened [_y]: first 5 entries, then length, type, shape, ndim and dtype name
_y_flat_info = (len(_y), type(_y), _y.shape, _y.ndim, _y.dtype.name)
print(_y[:5], *_y_flat_info)

**Splitting data into training and test sets**

In [None]:
# fraction of the samples held out as the test set
_test_size = 0.20

In [None]:
# seed passed as random_state to the train/test split and the cross-validation splitter
_random_seed = 7

In [None]:
# split features and target into a training part and a held-out test part
_split_parts = train_test_split(
    _X,
    _y,
    test_size=_test_size,
    random_state=_random_seed,
)
X_train, X_test, y_train, y_test = _split_parts

In [None]:
# sanity check of [X_train]: the data, then length, type, shape, ndim and dtype name
_X_train_info = (len(X_train), type(X_train), X_train.shape, X_train.ndim, X_train.dtype.name)
print(X_train, *_X_train_info)

In [None]:
# sanity check of [X_test]: the data, then length, type, shape, ndim and dtype name
_X_test_info = (len(X_test), type(X_test), X_test.shape, X_test.ndim, X_test.dtype.name)
print(X_test, *_X_test_info)

In [None]:
# sanity check of [y_train]: the data, then length, type, shape, ndim and dtype name
_y_train_info = (len(y_train), type(y_train), y_train.shape, y_train.ndim, y_train.dtype.name)
print(y_train, *_y_train_info)

In [None]:
# sanity check of [y_test]: the data, then length, type, shape, ndim and dtype name
_y_test_info = (len(y_test), type(y_test), y_test.shape, y_test.ndim, y_test.dtype.name)
print(y_test, *_y_test_info)

## Evaluate algorithms - baseline ##

In [None]:
# number of folds used for k-fold cross-validation
_num_folds = 10

In [None]:
# metric passed to cross_val_score as the scoring argument
_scoring = 'accuracy'

**Spot check algorithms**

In [None]:
# Candidate models as (short name, estimator) pairs with default hyper-parameters.
_models = [
    # linear algorithms
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    # non-linear algorithms
    ('KNN', KNeighborsClassifier()),
    # seeded with [_random_seed]: tree construction is randomised, so an
    # unseeded tree makes the spot-check scores differ from run to run
    ('CART', DecisionTreeClassifier(random_state=_random_seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

In [None]:
# Cross-validate every baseline model on the training data and collect the
# per-fold scores (for the comparison plot) together with the model names.
_results = []
_names = []

# One splitter shared by all models, so every model is scored on the same folds.
# shuffle=True is required whenever random_state is given: scikit-learn raises
# a ValueError for KFold(random_state=...) with the default shuffle=False.
_kfold = KFold(n_splits=_num_folds, shuffle=True, random_state=_random_seed)

for _name, _model in _models:
    _cv_results = cross_val_score(_model, X_train, y_train, cv=_kfold, scoring=_scoring)
    _results.append(_cv_results)
    _names.append(_name)
    # model name: mean score as a percentage, standard deviation across folds
    _msg = '{}: {:.3%}, {:.3f}'.format(_name, _cv_results.mean(), _cv_results.std())
    print(_msg)

In [None]:
# compare algorithms
fig = pyplot.figure() 
fig.suptitle('Algorithm Comparison') 
ax = fig.add_subplot(111) 
pyplot.boxplot(_results) 
ax.set_xticklabels(_names) 
pyplot.show()

## Ensemble Methods ##

In [None]:
# ensembles
# Ensemble candidates as (short name, estimator) pairs.
# Every estimator is seeded with [_random_seed]: all four are randomised, so
# unseeded instances make the cross-validation scores differ from run to run.
ensembles = [
    # boosting methods
    ('AB', AdaBoostClassifier(random_state=_random_seed)),
    ('GBM', GradientBoostingClassifier(random_state=_random_seed)),
    # bagging methods
    ('RF', RandomForestClassifier(random_state=_random_seed)),
    ('ET', ExtraTreesClassifier(random_state=_random_seed)),
]

In [None]:
# Cross-validate every ensemble model on the training data and collect the
# per-fold scores (for the comparison plot) together with the model names.
_results_en = []
_names_en = []

# One splitter shared by all models, so every model is scored on the same folds.
# shuffle=True is required whenever random_state is given: scikit-learn raises
# a ValueError for KFold(random_state=...) with the default shuffle=False.
_kfold = KFold(n_splits=_num_folds, shuffle=True, random_state=_random_seed)

for _name, _model in ensembles:
    _cv_results = cross_val_score(_model, X_train, y_train, cv=_kfold, scoring=_scoring)
    _results_en.append(_cv_results)
    _names_en.append(_name)
    # model name: mean score as a percentage, standard deviation across folds
    _msg = '{}: {:.3%}, {:.3f}'.format(_name, _cv_results.mean(), _cv_results.std())
    print(_msg)

In [None]:
# compare algorithms
fig = pyplot.figure() 
fig.suptitle('Ensemble Algorithm Comparison') 
ax = fig.add_subplot(111) 
pyplot.boxplot(_results_en) 
ax.set_xticklabels(_names_en) 
pyplot.show()

## Finalize Model ##

In [None]:
# prepare final model - Gradient Boosting Classifier

# Fit the model on the full training set.
# Seeded with [_random_seed] so the reported test metrics are repeatable between runs.
_model_final_a = GradientBoostingClassifier(random_state=_random_seed)
_model_final_a.fit(X_train, y_train)

# estimate accuracy on test data

_predictions = _model_final_a.predict(X_test)
print(accuracy_score(y_test, _predictions))
print(confusion_matrix(y_test, _predictions))
print(classification_report(y_test, _predictions))

In [None]:
# prepare final model - ExtraTreesClassifier

# Fit the model on the full training set.
# Seeded with [_random_seed] so the reported test metrics are repeatable between runs.
_model_final_b = ExtraTreesClassifier(random_state=_random_seed)
_model_final_b.fit(X_train, y_train)

# estimate accuracy on test data

_predictions = _model_final_b.predict(X_test)
print(accuracy_score(y_test, _predictions))
print(confusion_matrix(y_test, _predictions))
print(classification_report(y_test, _predictions))