In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# warnings that may matter for the models below (e.g. solver convergence).
warnings.filterwarnings('ignore')

The first data set contains the following features:
* Class Number
* Number of Animal Species in Class
* Class Type
* Animal Names

In [None]:
# Load the class-level lookup table (df) and the per-animal table (df_1).
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = pd.read_csv('../input/zoo-animal-classification/class.csv')
df_1 = pd.read_csv('../input/zoo-animal-classification/zoo.csv')

In [None]:
# Preview the first rows of the table loaded from class.csv.
df.head()

In [None]:
# Summary statistics for the table loaded from class.csv.
df.describe()

The second data set summarises the characteristics of each animal. Based on this information we will be able to train our model to predict the class of a given animal.

In [None]:
# Display the per-animal table loaded from zoo.csv.
df_1

Using data from df_1 we can prepare a correlation matrix and plot it as a heatmap.

In [None]:
# Correlate numeric columns only. Since pandas 2.0, DataFrame.corr() no longer
# drops non-numeric columns silently and raises on string data instead
# (presumably the first column of zoo.csv holds animal names -- verify).
df_corr = df_1.select_dtypes(include='number').corr()

# Large square figure so the annotated coefficients stay legible.
plt.figure(figsize=(15,15))
sn.heatmap(df_corr, annot=True)
plt.show()

Using the df_1 rows for mammals, we can also prepare a correlation matrix restricted to Animal Type: Mammal. We drop the columns that are not relevant for this animal type.

In [None]:
# Keep only the rows whose class_type is 1 (the mammal class).
mammals = df_1[df_1['class_type'] == 1]
# Drop the columns considered irrelevant for this class, plus the label itself.
mammals = mammals.drop(columns =['backbone', 'breathes', 'venomous', 'feathers', 'milk', 'class_type'])
# Restrict to numeric columns before correlating: since pandas 2.0, corr()
# raises on string columns instead of skipping them.
mammals_corr = mammals.select_dtypes(include='number').corr()
mammals_corr

# **Model Building**

In [None]:
# Positional split of df_1: columns 1 through 16 are the model inputs,
# column 17 is the label to predict. Column 0 is left out of both.
X, y = df_1.iloc[:, 1:17], df_1.iloc[:, 17]

**1. Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; stratify on y so the class proportions
# are kept in both splits, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
    stratify=y,
)

Testing the results for different C values.

In [None]:
# Baseline model with C=1, then accuracy on both splits.
logreg = LogisticRegression(C=1)
logreg.fit(X_train, y_train)
print(
    f"The result for training set: {logreg.score(X_train, y_train)}",
    f"The result for testing set: {logreg.score(X_test, y_test)}",
    sep="\n",
)

In [None]:
# Same model with C=100 (weaker regularisation), scored on both splits.
logreg100 = LogisticRegression(C=100)
logreg100.fit(X_train, y_train)
print("The result for training set: {}".format(logreg100.score(X_train, y_train)))
print("The result for testing set: {}".format(logreg100.score(X_test, y_test)))

In [None]:
# Same model with C=0.01 (stronger regularisation), scored on both splits.
logreg001 = LogisticRegression(C=0.01)
logreg001.fit(X_train, y_train)
print(
    f"The result for training set: {logreg001.score(X_train, y_train)}",
    f"The result for testing set: {logreg001.score(X_test, y_test)}",
    sep="\n",
)

C values below 1 lower the accuracy of the Logistic Regression model for this data set.

Comparing the model's predicted classes with the actual values of Class Type.

In [None]:
# Predicted class for every row of X (training and test rows alike), using the C=1 model.
logreg.predict(X)

In [None]:
# Actual class labels as a NumPy array, for side-by-side comparison with the predictions.
np.array(y)

**2. Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Recreate the 70/30 stratified split with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
    stratify=y,
)

# Tree with default growth settings; random_state is fixed for repeatability.
tree = DecisionTreeClassifier(
    random_state=0,
)
tree.fit(X_train, y_train)

In [None]:
# Accuracy of the current `tree` on the training and held-out splits.
print(
    f"The result for training set: {tree.score(X_train, y_train)}",
    f"The result for testing set: {tree.score(X_test, y_test)}",
    sep="\n",
)

In [None]:
# Refit with the tree depth capped at 4 levels.
tree = DecisionTreeClassifier(
    max_depth=4,
    random_state=0,
)
tree.fit(X_train, y_train)

In [None]:
# Accuracy of the current `tree` on the training and held-out splits.
print("The result for training set: {}".format(tree.score(X_train, y_train)))
print("The result for testing set: {}".format(tree.score(X_test, y_test)))

In [None]:
# Refit requiring at least 4 training samples in every leaf.
tree = DecisionTreeClassifier(
    min_samples_leaf=4,
    random_state=0,
)
tree.fit(X_train, y_train)

In [None]:
# Accuracy of the current `tree` on the training and held-out splits.
print(
    f"The result for training set: {tree.score(X_train, y_train)}",
    f"The result for testing set: {tree.score(X_test, y_test)}",
    sep="\n",
)

In [None]:
# Refit requiring at least 6 training samples in every leaf.
tree = DecisionTreeClassifier(
    min_samples_leaf=6,
    random_state=0,
)
tree.fit(X_train, y_train)

In [None]:
# Accuracy of the current `tree` on the training and held-out splits.
print("The result for training set: {}".format(tree.score(X_train, y_train)))
print("The result for testing set: {}".format(tree.score(X_test, y_test)))

A higher value of the min_samples_leaf parameter lowered the model's fit on the training set and improved its predictive ability on the test set.

In [None]:
from sklearn.tree import plot_tree, export_graphviz

# Draw the most recently fitted tree. Feature and class names are passed so
# the nodes show column names and class labels instead of positional X[i]
# indices; filled=True colours each node by its majority class.
plt.figure(figsize=(12,12))
plot_tree(
    tree,
    feature_names=list(X.columns),
    # classes_ is already in ascending order, as plot_tree expects.
    class_names=[str(label) for label in tree.classes_],
    filled=True,
)
plt.show()