In [1]:
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Disabling warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the data
zoo_data = pd.read_csv('zoo.csv')

class_lookup = pd.read_csv('class.csv')
class_lookup = class_lookup[['Class_Number', 'Class_Type']].set_index('Class_Number')

In [4]:
zoo_data.head(10)

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
5,buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
6,calf,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,1
7,carp,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,4
8,catfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
9,cavy,1,0,0,1,0,0,0,1,1,1,0,0,4,0,1,0,1


In [5]:
class_lookup

Unnamed: 0_level_0,Class_Type
Class_Number,Unnamed: 1_level_1
1,Mammal
2,Bird
3,Reptile
4,Fish
5,Amphibian
6,Bug
7,Invertebrate


In [6]:
# Checking if there are any missing values in zoo data
print(zoo_data.isnull().sum())

animal_name    0
hair           0
feathers       0
eggs           0
milk           0
airborne       0
aquatic        0
predator       0
toothed        0
backbone       0
breathes       0
venomous       0
fins           0
legs           0
tail           0
domestic       0
catsize        0
class_type     0
dtype: int64


`Since all the features in the dataset are already in a format that the machine learning models can understand (numerical), and with no missing values, feature engineering may not be necessary`

In [7]:
# Prepare the data
X = zoo_data.iloc[:,1:-1]  # features
y = zoo_data.class_type    # target

In [8]:
# Prepare the training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [9]:
# Train RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=100)
model_rf.fit(X_train,y_train)

In [10]:
# Predict on test set
y_pred = model_rf.predict(X_test)

In [11]:
# Print accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}%")

Accuracy: 96.7741935483871%


In [12]:
# Print Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[15  0  0  0  0  0  0]
 [ 0  5  0  0  0  0  0]
 [ 0  0  1  0  0  0  0]
 [ 0  0  0  3  0  0  0]
 [ 0  0  1  0  0  0  0]
 [ 0  0  0  0  0  3  0]
 [ 0  0  0  0  0  0  3]]


In [13]:
# Print some predicted samples
print("Some sample predictions:")
sample_zoo_data = X_test.sample(5)

# Get corresponding y_test values
sample_actual_class = y_test[sample_zoo_data.index]

# Predict
sample_zoo_data['predicted'] = model_rf.predict(sample_zoo_data)

# Add actual classes from y_test
sample_zoo_data['actual'] = sample_actual_class

# Replace predicted and actual classes with their respective names
sample_zoo_data['predicted_class'] = sample_zoo_data['predicted'].replace(class_lookup['Class_Type'])
sample_zoo_data['actual_class'] = sample_zoo_data['actual'].replace(class_lookup['Class_Type'])

print(sample_zoo_data[['actual', 'actual_class', 'predicted', 'predicted_class']])

Some sample predictions:
    actual  actual_class  predicted predicted_class
46       7  Invertebrate          7    Invertebrate
67       1        Mammal          1          Mammal
17       1        Mammal          1          Mammal
81       7  Invertebrate          7    Invertebrate
66       1        Mammal          1          Mammal


In [14]:
# Check the accuracy across different subsets of data
from sklearn.model_selection import cross_val_score

# Perform K-Fold Cross Validation
scores = cross_val_score(model_rf, X, y, cv=5)

print("Accuracy on cross validation set: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy on cross validation set: 0.98 (+/- 0.05)


In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate predictions on the test set
y_pred = model_rf.predict(X_test)

# Precision: quality of the prediction
precision = precision_score(y_test, y_pred, average='macro')

# Recall: the completeness - how much the model could correctly identify
recall = recall_score(y_test, y_pred, average='macro')

# F1 score: balance between precision and recall
f1 = f1_score(y_test, y_pred, average='macro')

print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)

Precision:  0.7857142857142857
Recall:  0.8571428571428571
F1 Score:  0.8095238095238094
