# Horse-Colic Dataset from UCI Machine Learning Repository

### The dataset has 23 features, with a good mix of categorical and continuous types. Many features and instances have missing values, which makes it a practical exercise in replacing missing values and using the imputed data for modeling. The large fraction of missing data (30%) is in fact a notable characteristic of this dataset.
### The data consists of attributes that are continuous, as well as categorical in type. 
### Also, the presence of self-predictors makes working with this dataset instructive from a practical standpoint.

## Importing Train data 

In [None]:
import pandas as pd
# The 28 column labels applied to the header-less, space-separated raw file.
columns = [
    'surgery',
    'Age',
    'Hospital ID',
    'rectal temperature',
    'pulse',
    'respiratory rate',
    'temperature of extremities',
    'peripheral pulse',
    "mucous membranes",
    'capillary refill time',
    'pain',
    'peristalsis',
    'abdominal distension',
    'nasogastric tube',
    'nasogastric reflux',
    'nasogastric reflux PH',
    'rectal examination - feces',
    'abdomen',
    ' packed cell volume',
    'total protein',
    'abdominocentesis appearance',
    'abdomcentesis total protein',
    'outcome',
    'surgical lesion?',
    '1',
    '2',
    '3',
    'path',
]
# '?' is read as NaN so missing entries are recognised by pandas.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.data',
    sep=' ',
    header=None,
    na_values='?',
    names=columns,
)

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Number of columns read into the training frame.
len(df.columns)

In [None]:
# Per-column dtype and non-null count summary.
df.info()

## Cleaning Train dataset

In [None]:
# Remove, in a single call, the columns judged to hold too many NaN values.
sparse_columns = [
    'nasogastric reflux PH',
    'abdomcentesis total protein',
    'abdominocentesis appearance',
    'nasogastric tube',
    'abdomen',
    'rectal examination - feces',
]
df.drop(columns=sparse_columns, inplace=True)

In [None]:
# Decode integer category codes into readable labels.
# Each result is assigned back to the frame: calling replace(inplace=True) on
# the Series returned by df[...] is chained assignment, which pandas has
# deprecated and which no longer updates df once copy-on-write is active.
# 'Age': 1 is an adult; both 2 and the raw code 9 are treated as young.
df['Age'] = df['Age'].replace({1: 'Adult', 2: 'Young', 9: 'Young'})
df['outcome'] = df['outcome'].replace({1: 'lived', 2: 'died', 3: 'euthenized'})
df['pain'] = df['pain'].replace({
    1: 'No Pain',
    2: 'Depressed',
    3: 'Mild pain',
    4: 'Severe Pain',
    5: 'Greaterthansevere',
})

In [None]:
# Heatmap of the boolean missing-value mask of the training frame
# (one cell per entry; colour bar hidden).
import seaborn as sns
sns.heatmap(df.isnull(), cbar=False)

In [None]:
# Bar graph of the missing-value count per column, for clearer visualisation.
null = df.isnull().sum()

from matplotlib import pyplot as plt

plt.figure(figsize=(20,10))
plt.bar(range(len(null)), null)
plt.xlabel('Features')
plt.ylabel('missing')
# Label every bar with the name of the column it counts.
plt.xticks(list(range(len(df.columns))), list(df.columns.values), rotation='vertical')
# show must be *called*; the bare `plt.show` reference did nothing.
plt.show()

print(null)

In [None]:
# One-column table ('NA') of NaN counts, shown only for columns that have any.
null = df.isnull().sum().to_frame(name='NA')
null[null['NA'] > 0]

In [None]:
# Print columns that look continuous (more than 5 distinct non-null values)
# and still contain missing entries.
for name in df.columns:
    has_gaps = df[name].isnull().any()
    if has_gaps and df[name].nunique() > 5:
        print(name)

In [None]:
# Impute missing values in the training frame.
# Every result is assigned back to df: fillna(inplace=True) on the Series
# returned by df[...] is chained assignment, which pandas has deprecated and
# which stops updating df once copy-on-write is active.

# 'surgery' is a coded category, so it gets its most frequent value.
df['surgery'] = df['surgery'].fillna(df['surgery'].mode()[0])

# Continuous measurements are filled with THEIR OWN column mean.
# (Previously 'respiratory rate' and ' packed cell volume' were filled with
# the mean of 'pulse', a copy-paste slip.)
continuous_columns = (
    'rectal temperature',
    'pulse',
    'respiratory rate',
    ' packed cell volume',
    'total protein',
)
for col in continuous_columns:
    df[col] = df[col].fillna(df[col].mean())

# Remaining float-coded categorical columns are filled with their mode. This
# now includes 'mucous membranes', a coded category that used to receive the
# mean of 'pulse'. Columns whose dtype is not float64 at this point are left
# untouched, exactly as before.
for col in df.columns.values:
    if df[col].dtype == 'float64' and df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Re-check after imputation: columns that still contain NaN values.
null = df.isnull().sum().to_frame(name='NA')
still_missing = null['NA'] > 0
null[still_missing]

# For better insight and efficiency, let's find the correlation between the features and "outcome", which is our target variable

In [None]:

import seaborn as sns
# Bar chart of how many rows fall in each outcome class
# (the trailing ';' suppresses the notebook's echo of the Axes object).
sns.countplot(data=df, x="outcome");
# The same class counts as numbers.
print(df.outcome.value_counts())

In [None]:
# Explore relations between features and the target variable (outcome) visually.
# Outcome counts split by the degree of pain recorded for the horse.
sns.countplot(data=df, x='outcome', hue = 'pain')


In [None]:
# Outcome counts split by the age group of the horse.

sns.countplot(data=df, x='outcome', hue = 'Age')

## Data Processing for analysis

In [None]:
# Encode the label columns as integer category codes for numeric analysis.
# Rows without a recorded outcome are removed first: cat.codes maps NaN to -1,
# which would otherwise enter the target as a spurious extra class. The test
# frame likewise has its NaN rows dropped before it is encoded.
df.dropna(subset=['outcome'], inplace=True)
for col in ('outcome', 'Age', 'pain'):
    # NOTE: any NaN left in a feature column (e.g. 'pain') still becomes -1.
    df[col] = df[col].astype('category').cat.codes

In [None]:
# Pairwise correlation matrix of the columns of df; the 'outcome' column of
# this matrix is read by the following cells.
corr= df.corr()

In [None]:
# Absolute correlation of every column with the target, strongest first.
signed_desc = corr['outcome'].sort_values(ascending=False)
core = signed_desc.abs()
core.sort_values(ascending=False)

In [None]:
# Heatmap of the correlation matrix; the colour scale is capped at 0.85.
sns.heatmap(corr, vmax=0.85)

## This shows that features like pulse, surgical lesion and packed cell volume have strong relations with our target variable, which implies that these features play the most crucial role in classifying the target variable compared to the other features.

In [None]:
# Drop the features judged to have insignificant impact on the target
# variable "outcome" (corr<0.1), all in one call.
weak_features = ['path', '3', 'rectal temperature', 'pain', 'mucous membranes']
df.drop(columns=weak_features, inplace=True)

In [None]:
# Cleaned and processed train dataset: first rows, then (rows, columns).
display(df.head())
display(df.shape)

## Importing Test Dataset

In [None]:
# The 28 column labels applied to the header-less, space-separated test file.
columns = [
    'surgery',
    'Age',
    'Hospital ID',
    'rectal temperature',
    'pulse',
    'respiratory rate',
    'temperature of extremities',
    'peripheral pulse',
    "mucous membranes",
    'capillary refill time',
    'pain',
    'peristalsis',
    'abdominal distension',
    'nasogastric tube',
    'nasogastric reflux',
    'nasogastric reflux PH',
    'rectal examination - feces',
    'abdomen',
    ' packed cell volume',
    'total protein',
    'abdominocentesis appearance',
    'abdomcentesis total protein',
    'outcome',
    'surgical lesion?',
    '1',
    '2',
    '3',
    'path',
]
# '?' is read as NaN so missing entries are recognised by pandas.
dfs = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.test',
    sep=' ',
    header=None,
    na_values='?',
    names=columns,
)

In [None]:
# Quick look at the raw test frame: first rows, column count,
# dtype/non-null summary and shape.
display(dfs.head())
display(len(dfs.columns))
# info() prints its own report and returns None, so it is called directly;
# wrapping it in display() only added a stray "None" to the output.
dfs.info()
display(dfs.shape)

## Cleaning Test dataset

In [None]:
# Visualise the boolean missing-value mask of the test frame with the
# 'Pastel1' colour map (colour bar hidden).
sns.heatmap(dfs.isnull(),cbar=False, cmap='Pastel1')

## In the heatmap above, the pink area is the dataframe itself and the white gaps mark its missing values

In [None]:
# Bar graph of the missing-value count per column of the test frame.
null = dfs.isnull().sum()

from matplotlib import pyplot as plt

plt.figure(figsize=(20,10))
plt.bar(range(len(null)), null)
plt.xlabel('Features')
plt.ylabel('missing')
# Label every bar with the name of the column it counts.
plt.xticks(list(range(len(dfs.columns))), list(dfs.columns.values), rotation='vertical')
# show must be *called*; the bare `plt.show` reference did nothing.
plt.show()

print(null)

In [None]:
# Remove, in a single call, the columns with a significant number of
# missing values.
sparse_columns = [
    'nasogastric reflux PH',
    'abdomcentesis total protein',
    'abdominocentesis appearance',
    'nasogastric tube',
    'abdomen',
    'rectal examination - feces',
]
dfs.drop(columns=sparse_columns, inplace=True)

Removing columns which have an insignificant impact on our target variable "outcome" (corr<0.1), as seen from the Train dataset


In [None]:
# Drop from the test frame the same five features that were removed from
# the training frame.
weak_features = ['path', '3', 'rectal temperature', 'pain', 'mucous membranes']
dfs.drop(columns=weak_features, inplace=True)

# Processing Test dataset before analysis

In [None]:
# Decode integer category codes into readable labels.
# Each result is assigned back to the frame: calling replace(inplace=True) on
# the Series returned by dfs[...] is chained assignment, which pandas has
# deprecated and which no longer updates dfs once copy-on-write is active.
# 'Age': 1 is an adult; both 2 and the raw code 9 are treated as young.
dfs['Age'] = dfs['Age'].replace({1: 'Adult', 2: 'Young', 9: 'Young'})
dfs['outcome'] = dfs['outcome'].replace({1: 'lived', 2: 'died', 3: 'euthenized'})


In [None]:
# One-column table ('NA') of NaN counts in the test frame, shown only for
# columns that have any.
null = dfs.isnull().sum().to_frame(name='NA')
null[null['NA'] > 0]

In [None]:
# Print test-frame columns that look continuous (more than 5 distinct
# non-null values) and still contain missing entries.
for name in dfs.columns:
    series = dfs[name]
    if series.nunique() > 5 and series.isnull().any():
        print(name)

In [None]:
# Impute missing values in the test frame.
# Every result is assigned back to dfs: fillna(inplace=True) on the Series
# returned by dfs[...] is chained assignment, which pandas has deprecated and
# which stops updating dfs once copy-on-write is active.

# 'surgery' is a coded category, so it gets its most frequent value.
dfs['surgery'] = dfs['surgery'].fillna(dfs['surgery'].mode()[0])

# Continuous measurements are filled with THEIR OWN column mean.
# (Previously 'respiratory rate' and ' packed cell volume' were filled with
# the mean of 'pulse', a copy-paste slip.)
for col in ('pulse', 'respiratory rate', ' packed cell volume', 'total protein'):
    dfs[col] = dfs[col].fillna(dfs[col].mean())

# Remaining float-coded categorical columns are filled with a mode.
# NOTE(review): the fill value is taken from the TRAINING frame `df`, not
# from `dfs`; kept exactly as in the original - confirm this is intended.
for col in dfs.columns.values:
    if dfs[col].dtype == 'float64' and dfs[col].isnull().any():
        dfs[col] = dfs[col].fillna(df[col].mode()[0])

In [None]:
# Re-check after imputation: test-frame columns that still contain NaN values.
null = dfs.isnull().sum().to_frame(name='NA')
still_missing = null['NA'] > 0
null[still_missing]

In [None]:
# Drop every test row that still contains a NaN in any column.
dfs.dropna(axis=0, inplace =True)

In [None]:
# Encode the label columns of the test frame as integer category codes.
for label_col in ('outcome', 'Age'):
    as_category = dfs[label_col].astype('category')
    dfs[label_col] = as_category.cat.codes

In [None]:
# Split each frame into its target vector ('outcome') and the feature matrix
# made of all remaining columns.
ytrain = df["outcome"]
xtrain = df.drop(columns="outcome")
ytest = dfs["outcome"]
xtest = dfs.drop(columns="outcome")

In [None]:
# Preview the first rows of the training feature matrix.
xtrain.head()

# Data Analysis

In [None]:
#RandomForestClassifier Model
from sklearn.ensemble import RandomForestClassifier
# Fixed seed for repeatable results; n_jobs=-1 lets it use all available cores.
rf = RandomForestClassifier(random_state=14 , n_jobs= -1)
from pprint import pprint
# Show the parameter settings this estimator currently carries.
pprint(rf.get_params())

In [None]:
#RandomizedSearchCV for best hyperparameter range
from sklearn.model_selection import RandomizedSearchCV
# Candidate values sampled by the randomized hyper-parameter search.
n_est = [100, 140, 180, 220]
bootstrap = [True, False]
msl = [2, 3, 4]
# min_samples_split must be an integer >= 2 (or a fraction); scikit-learn
# rejects 1, so every sampled combination containing it failed to fit.
mss = [2, 3, 4]
# 'auto' was an alias of 'sqrt' for classifiers and has been removed from
# scikit-learn; 'sqrt' selects the same behaviour on every version.
mf = [.5, 1, 'sqrt']
criterion = ['gini', 'entropy']
maxd = [10, 15, 20, 30, 40, None]


In [None]:
# Map each RandomForestClassifier parameter name to its list of candidates.
random_grid = dict(
    n_estimators=n_est,
    max_features=mf,
    max_depth=maxd,
    min_samples_split=mss,
    min_samples_leaf=msl,
    bootstrap=bootstrap,
    criterion=criterion,
)

In [None]:
# Pretty-print the search space for inspection.
pprint(random_grid)

In [None]:
# Randomized search: 100 sampled parameter combinations, each scored with
# 5-fold cross-validation, seeded for repeatability, run on all cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=14,
    n_jobs=-1,
)

In [None]:
# Run the randomized search on the training data.
rf_random.fit(xtrain, ytrain)

In [None]:
# Parameter combination that scored best in the randomized search.
rf_random.best_params_

In [None]:
#GridSearchCV for finding better hyperparameters 
from sklearn.model_selection import GridSearchCV
# Narrower grid for the exhaustive search that follows the randomized search.
# 'auto' was an alias of 'sqrt' for classifiers and has been removed from
# scikit-learn; 'sqrt' selects the same behaviour on every version.
grid_param = {
    'n_estimators': [150, 180, 200, 300, 500],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_features': [0.5, 'sqrt'],
    'max_depth': [10, 15, 20],
    'criterion': ['entropy'],
    'bootstrap': [True],
}

In [None]:
# Fresh estimator for the grid search, seeded like the first forest so the
# search result is repeatable between runs.
rf = RandomForestClassifier(random_state=14)

In [None]:
# Exhaustive search over grid_param, scored with 3-fold cross-validation
# and run on all cores.
grid_search =  GridSearchCV(estimator = rf, param_grid = grid_param, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Run the grid search on the training data.
grid_search.fit(xtrain, ytrain)

In [None]:
# Parameter combination that scored best in the grid search.
grid_search.best_params_

In [None]:
#Importing Confusion Matrix to describe the performance of a classification model 
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
#To calculate F-Measure
from sklearn.metrics import f1_score

In [None]:
# Final Random Forest model with hand-set hyper-parameters.
# random_state is fixed (same seed as the first forest) so that the reported
# scores and confusion matrix are repeatable between runs.
rf = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    min_samples_leaf=2,
    min_samples_split=4,
    max_depth=10,
    random_state=14,
)

In [None]:
# Train the forest, then report its accuracy on the same training data
# (a fit check, not a held-out estimate).
rf.fit(xtrain, ytrain)
rf.score(xtrain, ytrain)

In [None]:
# Predict the test set with the forest.
pred = rf.predict(xtest)
#Confusion Matrix for Random Forest
# `actual`, `predicted` and `result` are read by the reporting cell below.
actual = ytest
predicted = pred
result = confusion_matrix(actual,predicted)

In [None]:
# Report the Random Forest results on the test set: confusion matrix,
# accuracy and the per-class classification report.
print ('Confusion Matrix :')
print(result) 
print('Accuracy Score :',accuracy_score(actual, predicted)) 
print ('Report : ')
print (classification_report(actual, predicted))

In [None]:
#Decision tree Classifier Model
from sklearn.tree import DecisionTreeClassifier

# A tree limited to depth 3, trained on the same training data as the forest.
dt = DecisionTreeClassifier(max_depth = 3)
dt.fit(xtrain, ytrain)

In [None]:
# Predict the test set with the decision tree.
prediction = dt.predict(xtest)
#Confusion Matrix for Decision Tree
# Rebinds `actual`, `predicted` and `result`, replacing the forest's values.
actual = ytest
predicted = prediction
result = confusion_matrix(actual,predicted)

In [None]:
# Report the Decision Tree results on the test set: confusion matrix,
# accuracy and the per-class classification report.
print ('Confusion Matrix :')
print(result) 
print('Accuracy Score :',accuracy_score(actual, predicted)) 
print ('Report : ')
print (classification_report(actual, predicted) )

In [None]:
# Accuracy of both models on the TRAINING data, as a percentage rounded to
# two decimals (not a measure of performance on unseen data).
accuracy_dt= round(dt.score(xtrain, ytrain) * 100, 2)
accuracy_rf= round(rf.score(xtrain, ytrain) * 100, 2)
print("Train accuracy of Random Forest Classifier is ",accuracy_rf,"%")
print("Train accuracy of Decision Tree Classifier is ",accuracy_dt,"%")

#### If this notebook helped you in learning, please drop an upvote! :)