In [1]:
# Import the important libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
animal = pd.read_csv('archive/horse/horse.csv')
animal.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [3]:
# Select the target variable
target = animal['outcome']
target.head()

0          died
1    euthanized
2         lived
3          died
4          died
Name: outcome, dtype: object

In [4]:
target.unique()

array(['died', 'euthanized', 'lived'], dtype=object)

In [5]:
target.value_counts()

lived         178
died           77
euthanized     44
Name: outcome, dtype: int64

In [6]:
# Drop the outcome column in the animal dataset
animal.drop('outcome', axis=1, inplace=True)

In [7]:
# Types of features in dataset
animal.dtypes

surgery                   object
age                       object
hospital_number            int64
rectal_temp              float64
pulse                    float64
respiratory_rate         float64
temp_of_extremities       object
peripheral_pulse          object
mucous_membrane           object
capillary_refill_time     object
pain                      object
peristalsis               object
abdominal_distention      object
nasogastric_tube          object
nasogastric_reflux        object
nasogastric_reflux_ph    float64
rectal_exam_feces         object
abdomen                   object
packed_cell_volume       float64
total_protein            float64
abdomo_appearance         object
abdomo_protein           float64
surgical_lesion           object
lesion_1                   int64
lesion_2                   int64
lesion_3                   int64
cp_data                   object
dtype: object

In [8]:
categorical_data = animal.loc[:, animal.dtypes == 'object']
categorical_data.head()

Unnamed: 0,surgery,age,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,rectal_exam_feces,abdomen,abdomo_appearance,surgical_lesion,cp_data
0,no,adult,cool,reduced,,more_3_sec,extreme_pain,absent,severe,,,decreased,distend_large,,no,no
1,yes,adult,,,pale_cyanotic,less_3_sec,mild_pain,absent,slight,,,absent,other,cloudy,no,no
2,no,adult,normal,normal,pale_pink,less_3_sec,mild_pain,hypomotile,none,,,normal,normal,,no,yes
3,yes,young,cold,normal,dark_cyanotic,more_3_sec,depressed,absent,severe,none,less_1_liter,decreased,,serosanguious,yes,yes
4,no,adult,,,dark_cyanotic,more_3_sec,,,,,,,,,no,no


In [9]:
# Transform in dummies
animal = pd.concat([animal, pd.get_dummies(categorical_data)], axis=1)
animal.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen_firm,abdomen_normal,abdomen_other,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,0,0,0,0,0,0,1,0,1,0
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,0,0,1,0,1,0,1,0,1,0
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,0,1,0,0,0,0,1,0,0,1
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,0,0,0,0,0,1,0,1,0,1
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,0,0,0,0,0,0,1,0,1,0


In [10]:
animal.columns

Index(['surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3',
       'cp_data', 'surgery_no', 'surgery_yes', 'age_adult', 'age_young',
       'temp_of_extremities_cold', 'temp_of_extremities_cool',
       'temp_of_extremities_normal', 'temp_of_extremities_warm',
       'peripheral_pulse_absent', 'peripheral_pulse_increased',
       'peripheral_pulse_normal', 'peripheral_pulse_reduced',
       'mucous_membrane_bright_pink', 'mucous_membrane_bright_red',
       'mucous_membrane_dark_cyanotic', 'mucous_membrane_normal_pink',
       'mucous_membrane_pale_cyanotic'

In [11]:
# Drop the categorical data type columns
animal.drop(columns=categorical_data.columns, inplace=True)

In [12]:
# Target data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [13]:
X, y = animal.values, target.values

In [14]:
# Instanciate the encoder
label_encoder = LabelEncoder()

In [15]:
# Encoding the target data
y = label_encoder.fit_transform(y)
y[:5]

array([0, 1, 2, 0, 0])

In [16]:
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [17]:
# Checking the shape of data
print("X train shape: ", X_train.shape)
print("X test shape: ", X_test.shape)
print("y train shape: ", y_train.shape)
print("y test shape: ", y_test.shape)

X train shape:  (209, 67)
X test shape:  (90, 67)
y train shape:  (209,)
y test shape:  (90,)


In [18]:
# Loading the Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Imputing the null value in train and test data
from sklearn.impute import SimpleImputer
import numpy as np

In [20]:
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [21]:
# Impute the nan values
X_train = imp.fit_transform(X_train)
X_test  = imp.fit_transform(X_test)

In [24]:
# Building the model
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

In [25]:
# Making prediction
y_predicted = classifier.predict(X_test)

In [26]:
y_predicted[:5]

array([0, 0, 2, 1, 1])

In [27]:
# Checking the performance of model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [28]:
# Accuracy of model
accuracy_score(y_test, y_predicted)

0.6444444444444445

In [29]:
# Confusion matrix
confusion_matrix(y_test, y_predicted)

array([[16,  3,  9],
       [ 0,  6,  3],
       [12,  5, 36]])

In [30]:
# Build the model with Random Forest Classifier 
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

In [31]:
y_pred = classifier.predict(X_test)
y_pred[:10]

array([2, 0, 2, 2, 2, 2, 2, 2, 2, 2])

In [32]:
# Accuracy with Random Forest Classifier
accuracy_score(y_test, y_pred)

0.7444444444444445

In [34]:
# Make report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.50      0.64        28
           1       0.40      0.22      0.29         9
           2       0.74      0.96      0.84        53

    accuracy                           0.74        90
   macro avg       0.67      0.56      0.59        90
weighted avg       0.75      0.74      0.72        90

