## Analyzing the dataset and surveying some popular ML techniques applied to it ##

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import decomposition
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Shell out to `ls` and echo the UTF-8 decoded listing of the ../input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read both CSV files first, then print the summary statistics of each frame
# in the same order they were loaded (class.csv, then zoo.csv).
df1 = pd.read_csv('../input/class.csv')
df2 = pd.read_csv('../input/zoo.csv')
for frame in (df1, df2):
    print(frame.describe())

In [None]:
# Preview the first rows of the frame loaded from class.csv.
df1.head()

In [None]:
# Preview the first rows of the frame loaded from zoo.csv.
df2.head()

Looking at the data, class.csv contains the details of each class (mammals, amphibians, etc.) along with the number of animals in the class, its code, and the names of the animals.

zoo.csv contains the details of the animals in a 1/0 format indicating whether a particular feature (hair, eggs, milk, etc.) is present, along with the class type. We actually only need zoo.csv to make a prediction.

## Trying to understand the correlations and PCA within the features ##
First, let us see whether there is any correlation among the attributes, and also try a PCA.

In [None]:
# Pairwise correlation heatmap of the zoo attributes.
# df2 still carries the string column animal_name, and pandas >= 2.0 raises
# when DataFrame.corr() meets a non-numeric column (older releases dropped it
# silently). Restricting to numeric/boolean columns works on both.
correlation = df2.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(9, 9))
plt.rcParams.update({'font.size': 8})
# Draw on the axes created above explicitly rather than relying on pyplot's
# "current axes" state.
sns.heatmap(correlation, vmax=1, annot=True, cmap='cubehelix', ax=ax)

## Few observations from the heatmap of correlation matrix ##
There is a very strong correlation between the following types of features:

 1. Having hair means it is very unlikely that the animal lays eggs & very likely that it gives milk
 2. Having feathers has a relatively high correlation with the animal being  airborne and low correlation with it having teeth
 3. If the animal lays eggs, it is **very unlikely** that it gives milk, similar low correlation for having teeth
 4. If it is aquatic, there is a relatively high correlation with having fins, and so on

In [None]:
# Sanity check of the slicing used for modelling: columns 1..16 as the
# attribute values, column 17 as the label; show both for the row at position 2.
a, b = df2.iloc[:, 1:17].values, df2.iloc[:, 17].values
print(a[2], b[2])

In [None]:
# Feature matrix: columns 1..16 (column 0, the animal name, is left out).
# Label vector: column 17, the class number to be assigned.
X, y = df2.iloc[:, 1:17].values, df2.iloc[:, 17].values

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.4
)

In [None]:
# Cross-validation should be added here because the sample size is too small

In [None]:
# Column labels of df2 with positions 0 and 17 removed, i.e. the labels of the
# columns selected by the 1:17 feature slice used for X.
# Note: this rebinds `b`, which an earlier cell used for the label column values.
b = df2.columns.values
index = [0,17]
feature_names = np.delete(b,index)
feature_names

In [None]:
# Class display names taken from the Class_Type column of class.csv.
# NOTE(review): the classification reports in later cells pass this array as
# target_names, which assumes its row order matches the sorted class codes in
# zoo.csv - confirm against class.csv.
target_names = np.array(df1['Class_Type'])
target_names

In [None]:
# Standardizing data: fit a StandardScaler on the training features and
# transform them in the same step.
# NOTE(review): X_std is not referenced by any of the model cells visible in
# this notebook; they all fit on the unscaled X_train.

from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X_train)

## PCA ##
Trying to reduce the number of features considered here - with PCA

In [None]:
#pca = decomposition.PCA(n_components=3)
#pca.fit(X)
#X = pca.transform(X)

## Trying different models ##


## K-NN ##

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train a k-nearest-neighbours classifier with default settings
# (fit() returns the estimator itself, so construction and training chain).
model = KNeighborsClassifier().fit(X_train, y_train)
print(model)

# Score the held-out split.
expected, predicted = y_test, model.predict(X_test)

# Per-class precision / recall / F1, followed by the confusion matrix.
print(metrics.classification_report(y_true=expected, y_pred=predicted))
print(metrics.confusion_matrix(y_true=expected, y_pred=predicted))

## Gaussian Naive Bayes ##

In [None]:
# Train a Gaussian Naive Bayes classifier on the training split
# (fit() returns the estimator itself, so construction and training chain).
model = GaussianNB().fit(X_train, y_train)
print(model)

# Score the held-out split.
expected, predicted = y_test, model.predict(X_test)

# Per-class report labelled with the class names, then the confusion matrix.
print(metrics.classification_report(y_true=expected, y_pred=predicted, target_names=target_names))
print(metrics.confusion_matrix(y_true=expected, y_pred=predicted))

## SVM ##

In [None]:
from sklearn import svm

# Train a linear support-vector classifier with default settings
# (fit() returns the estimator itself, so construction and training chain).
model = svm.LinearSVC().fit(X_train, y_train)
print(model)

# Score the held-out split.
expected, predicted = y_test, model.predict(X_test)

# Per-class report labelled with the class names, then the confusion matrix.
print(metrics.classification_report(y_true=expected, y_pred=predicted, target_names=target_names))
print(metrics.confusion_matrix(y_true=expected, y_pred=predicted))

## Decision Trees ##

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree with default settings
# (fit() returns the estimator itself, so construction and training chain).
model = DecisionTreeClassifier().fit(X_train, y_train)
print(model)

# Score the held-out split.
expected, predicted = y_test, model.predict(X_test)

# Per-class report labelled with the class names, then the confusion matrix.
print(metrics.classification_report(y_true=expected, y_pred=predicted, target_names=target_names))
print(metrics.confusion_matrix(y_true=expected, y_pred=predicted))

Trying to print the decision tree below:

In [None]:
# Export the fitted classifier held in `model` to Graphviz DOT source text
# (out_file=None makes export_graphviz return the text instead of writing a file).
# NOTE(review): on a top-to-bottom run `model` is the DecisionTreeClassifier
# fitted in the preceding cell - confirm before running cells out of order.
from IPython.display import Image
from sklearn import tree
#import pydot
# NOTE(review): pydotplus is only referenced by the commented-out rendering
# lines at the end of this cell.
import pydotplus as pydot

dot_data = tree.export_graphviz(model, out_file=None, 
                         feature_names=feature_names,  
                         class_names=target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)  
#graph = pydot.graph_from_dot_data(dot_data)  
#Image(graph.create_png())  

## Random Forest ##

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest with default settings
# (fit() returns the estimator itself, so construction and training chain).
model = RandomForestClassifier().fit(X_train, y_train)
print(model)

# Score the held-out split.
expected, predicted = y_test, model.predict(X_test)

# Per-class report labelled with the class names, then the confusion matrix.
print(metrics.classification_report(y_true=expected, y_pred=predicted, target_names=target_names))
print(metrics.confusion_matrix(y_true=expected, y_pred=predicted))

It seems that most models perform poorly for reptiles and amphibians (which makes sense, as they are fairly similar). Let's look at the dataset for both of these classes and try to come up with some form of PCA.

In [None]:
# Keep only the rows whose class_type is 3 or 5.
# NOTE(review): presumably the codes for reptiles and amphibians, per the
# preceding paragraph - confirm against class.csv.
df3 = df2.loc[df2['class_type'].isin([3,5])]
df3

In [None]:
# Pairwise correlation heatmap for the class-3 / class-5 subset in df3.
# df3 is sliced from df2 and still carries the string column animal_name, and
# pandas >= 2.0 raises when DataFrame.corr() meets a non-numeric column (older
# releases dropped it silently). Restricting to numeric/boolean columns works
# on both.
correlation = df3.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(9, 9))
plt.rcParams.update({'font.size': 8})
# Draw on the axes created above explicitly rather than relying on pyplot's
# "current axes" state.
sns.heatmap(correlation, vmax=1, annot=True, cmap='cubehelix', ax=ax)

In [None]:
# Applying feature engineering

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

# Number of attributes kept by recursive feature elimination.
N_SELECTED_FEATURES = 10

# Always start from the full 16-attribute matrix taken straight from df2.
# This cell overwrites X and y at the end, so reading them here would make a
# second run of the cell operate on the already reduced matrix.
X_full = df2.iloc[:, 1:17].values
y_full = df2.iloc[:, 17].values

# Rank the attributes with an extra-trees ensemble. random_state is pinned so
# the importances (and therefore the selected features) are reproducible.
model = ExtraTreesClassifier(random_state=0)
model.fit(X_full, y_full)

# display the relative importance of each attribute
print("Importance percentages of each attribute" + "\n" + "------------")
for name, importance in zip(feature_names, model.feature_importances_):
    print('%s\t%f' % (name, importance * 100) + '%')

# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(model, n_features_to_select=N_SELECTED_FEATURES)
rfe = rfe.fit(X_full, y_full)

# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

# Top features (support_ is already a boolean mask over feature_names)
print("Selecting the following features:")
print(feature_names[rfe.support_])

# Drop the name column and every attribute RFE rejected; what remains is the
# selected attributes followed by the class_type label column.
df_new = df2.drop('animal_name', axis=1)
for i in feature_names[~rfe.support_]:
    df_new = df_new.drop(i, axis=1)
    print("Removing", i)
print(df_new)

# Classifying data and target on the reduced frame
X = df_new.iloc[:, 0:N_SELECTED_FEATURES].values   # selected attributes
y = df_new.iloc[:, N_SELECTED_FEATURES].values     # class number to be assigned (labels)

# Separating into test and training data (same seed and ratio as the first split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

## K-NN ##

For updated training and test data sets

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train a k-nearest-neighbours classifier with default settings on the current
# training split (fit() returns the estimator itself, so the calls chain).
model = KNeighborsClassifier().fit(X_train, y_train)
print(model)

# Score the held-out split.
expected, predicted = y_test, model.predict(X_test)

# Per-class precision / recall / F1, followed by the confusion matrix.
print(metrics.classification_report(y_true=expected, y_pred=predicted))
print(metrics.confusion_matrix(y_true=expected, y_pred=predicted))

In [None]:
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV lives in
# sklearn.model_selection, which this notebook already imports from.
from sklearn.model_selection import GridSearchCV
# Imported here so the cell does not depend on the `tree` module alias that is
# only bound inside the Graphviz export cell.
from sklearn.tree import DecisionTreeClassifier

# Search tree depths 3..19 for the best cross-validated score.
parameters = {'max_depth': range(3, 20)}
# cv=3 pins the 3-fold default of the removed grid_search module; newer
# scikit-learn releases default to 5 folds.
clf = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, n_jobs=-1)
clf.fit(X=X, y=y)
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)

## Gaussian Naive Bayes ##

For dataset with updated features

In [None]:
# Train a Gaussian Naive Bayes classifier on the current training split
# (fit() returns the estimator itself, so construction and training chain).
model = GaussianNB().fit(X_train, y_train)
print(model)

# Score the held-out split.
expected, predicted = y_test, model.predict(X_test)

# Per-class report labelled with the class names, then the confusion matrix.
print(metrics.classification_report(y_true=expected, y_pred=predicted, target_names=target_names))
print(metrics.confusion_matrix(y_true=expected, y_pred=predicted))

## SVM ##
For dataset with updated features

In [None]:
from sklearn import svm

# Train a linear support-vector classifier on the current training split
# (fit() returns the estimator itself, so construction and training chain).
model = svm.LinearSVC().fit(X_train, y_train)
print(model)

# Score the held-out split.
expected, predicted = y_test, model.predict(X_test)

# Per-class report labelled with the class names, then the confusion matrix.
print(metrics.classification_report(y_true=expected, y_pred=predicted, target_names=target_names))
print(metrics.confusion_matrix(y_true=expected, y_pred=predicted))

## Decision Tree ##
For dataset with updated features

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the current training split
# (fit() returns the estimator itself, so construction and training chain).
model = DecisionTreeClassifier().fit(X_train, y_train)
print(model)

# Score the held-out split.
expected, predicted = y_test, model.predict(X_test)

# Per-class report labelled with the class names, then the confusion matrix.
print(metrics.classification_report(y_true=expected, y_pred=predicted, target_names=target_names))
print(metrics.confusion_matrix(y_true=expected, y_pred=predicted))

## Random Forest ##
For dataset with updated features

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the current training split
# (fit() returns the estimator itself, so construction and training chain).
model = RandomForestClassifier().fit(X_train, y_train)
print(model)

# Score the held-out split.
expected, predicted = y_test, model.predict(X_test)

# Per-class report labelled with the class names, then the confusion matrix.
print(metrics.classification_report(y_true=expected, y_pred=predicted, target_names=target_names))
print(metrics.confusion_matrix(y_true=expected, y_pred=predicted))