In [None]:
import numpy as np 
import pandas as pd 

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings  
warnings.filterwarnings('ignore')

Step 1 - Loading datasets

In [None]:
# List the contents of the dataset directory to confirm which files are available
import os

dataset_dir = "../input/horse-colic-dataset"
os.listdir(dataset_dir)

In [None]:
# Read the two CSV files of the horse-colic dataset into DataFrames
train_csv = "../input/horse-colic-dataset/horse.csv"
test_csv = "../input/horse-colic-dataset/horseTest.csv"

train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

Step 2 - Exploratory Data Analysis

In [None]:
# Report how many columns (variables) and rows (entries) each dataset has
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape

print("--- Dataset Treino ---")
print("Variáveis:\t{}\nEntradas:\t{}\n".format(train_cols, train_rows))

print("--- Dataset Teste ---")
print("Variáveis:\t{}\nEntradas:\t{}".format(test_cols, test_rows))

In [None]:
# Inspect the dtype of every column, how many columns share each dtype,
# and the first rows of the training set
column_types = train.dtypes
display(column_types, column_types.value_counts(), train.head())

In [None]:
# Summary statistics of the training set
train_summary = train.describe()
train_summary

In [None]:
# Count the missing values per column, print the counts and draw them as a bar chart
qtde_nulos = train.isna().sum()
print(qtde_nulos)

tick_positions = list(range(len(train.columns)))
tick_labels = list(train.columns.values)

fig, ax = plt.subplots(figsize=(18, 10))
ax.bar(range(len(qtde_nulos)), qtde_nulos)
ax.set_title('Missing Values x Features')
ax.set_xlabel('features')
ax.set_ylabel('missing')
# One tick per column, labelled with the column name and rotated to stay readable
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels, rotation='vertical')
plt.show()

In [None]:
# Draw one histogram per column of the training set that DataFrame.hist can plot
train.hist(figsize=(18, 15))
plt.show()

In [None]:
# Distribution of the target variable: bar chart plus the exact class counts
sns.countplot(x='outcome', data=train)
print(train['outcome'].value_counts())


In [None]:
# Outcome counts split by the recorded pain level
sns.countplot(x='outcome', hue='pain', data=train)
plt.show()

# Author's reading of the chart:
# - Horses that suffered extreme or severe pain: the original note stops here
#   (presumably it refers to the horses that died) -- TODO confirm against the plot.
# - Most of the euthanized horses had experienced severe pain or were depressed.

In [None]:
# Pulse histograms, one panel per outcome
g = sns.FacetGrid(
    data=train,
    col='outcome',
    height=6,
    margin_titles=True,
)
g.map(plt.hist, 'pulse')

# Leave room above the panels for the figure-level title
plt.subplots_adjust(top=0.8)
g.fig.suptitle('Outcome por Pulso')

# Author's observation: most of the horses that died had a pulse of roughly 80-100 bpm.

In [None]:
# Peripheral-pulse counts, one panel per outcome
g = sns.catplot(x='peripheral_pulse', col='outcome', kind='count', data=train)

# Leave room above the panels for the figure-level title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Outcome por Pulso Periférico');

# Author's observation: more than half of the horses that died or were euthanized
# had a reduced peripheral pulse.

In [None]:
# Keep only the horses that did not survive and whose peripheral pulse was reduced or absent
did_not_survive = train['outcome'].isin(['died', 'euthanized'])
weak_pulse = train['peripheral_pulse'].isin(['reduced', 'absent'])
reduced_absent_pulse = train[did_not_survive & weak_pulse]

# Capillary refill time counts for that subgroup, one panel per outcome
g = sns.catplot(x='capillary_refill_time', col='outcome', kind='count', data=reduced_absent_pulse)
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Outcome por Tempo de Preenchimento Capilar');

# Author's observation: among the horses that died or were euthanized and had a
# reduced or absent peripheral pulse, the majority had a capillary refill time of
# more than 3 seconds, which is a sign of poor circulation.

Step 3 - Data Preprocessing

In [None]:
# Stack train and test so that cleaning and encoding are applied to both at once

# Remember how many rows each set has (train_idx is the position where the test rows start)
train_idx = len(train)
test_idx = len(test)

# Concatenate row-wise and renumber the index from 0
df_merged = pd.concat([train, test], axis=0, ignore_index=True)

# Shapes before the join
print("train.shape: ({} x {})".format(*train.shape))
print("test.shape: ({} x {})\n".format(*test.shape))

# Shape after the join
print("df_merged.shape: ({} x {})".format(*df_merged.shape))


RULE >>> Removing and filling in missing values in numerical and categorical data

For columns with more than 50% NaN values: remove the column

For columns with 50% or fewer NaN values:

For numeric data: replace the NaN values with the median of that column
For categorical data: replace the NaN values with the mode of that column

In [None]:
# Clean missing values column by column:
#   * columns with more than 50% missing entries are dropped;
#   * remaining gaps are filled with the median (numeric columns) or the
#     mode (all other columns).
# The fill values are learned from the training rows only (the first
# `train_idx` rows of df_merged) so that no statistic of the test rows leaks
# into the training data. The target column is skipped: labels must never be
# imputed or dropped by a feature-cleaning rule.
TARGET_COL = "outcome"
MAX_MISSING_RATIO = 0.5

for col in list(df_merged.columns):

    if col == TARGET_COL:
        continue

    n_missing = df_merged[col].isna().sum()
    if n_missing == 0:
        continue

    if n_missing > MAX_MISSING_RATIO * len(df_merged):
        print(col, "removido")
        df_merged = df_merged.drop(columns=[col])
        continue

    # Observed training values; fall back to the whole column only when the
    # training rows hold no observed value at all (median/mode would be undefined).
    reference = df_merged[col].iloc[:train_idx].dropna()
    if reference.empty:
        reference = df_merged[col].dropna()

    # A dtype-based numeric check keeps working when text columns are not
    # stored with the legacy 'object' dtype.
    if pd.api.types.is_numeric_dtype(df_merged[col]):
        fill_value = reference.median()
    else:
        fill_value = reference.mode()[0]

    df_merged[col] = df_merged[col].fillna(fill_value)

print(df_merged.shape)
print(df_merged.isna().sum())

In [None]:
# Replace the textual outcome labels by the integer codes of their categories
outcome_as_category = df_merged["outcome"].astype('category')
df_merged["outcome"] = outcome_as_category.cat.codes

df_merged.head()

Inspecting the correlation between features and results

The correlation shows how strongly the attributes are related to each other. We will check the correlation of each column with the outcome.

If the correlation value is positive, the feature is positively correlated to the result. If the correlation value is negative, the feature is negatively correlated to the result.

If the correlation value is 0, the two attributes are not correlated.

     |value| > 0.7: strongly correlated
     0.3 < |value| <= 0.7: moderately correlated
     0 <= |value| <= 0.3: weakly correlated or not correlated

In [None]:
# Pairwise correlation between the numeric columns only. df_merged still holds
# text columns at this point, and DataFrame.corr() raises on non-numeric data
# with pandas >= 2.0, so they are excluded explicitly.
df_merged_corr = df_merged.select_dtypes(include="number").corr()

# Rank the features by the absolute strength of their correlation with the target
corr_values = df_merged_corr["outcome"].abs().sort_values(ascending=False)

print("Correlated")
print(corr_values)

In [None]:
# Drop the features whose correlation with the outcome is practically nonexistent
weakly_correlated = ['hospital_number', 'respiratory_rate', 'lesion_3', 'rectal_temp']
df_merged = df_merged.drop(columns=weakly_correlated)

df_merged.head()

In [None]:
# One-hot encode the categorical columns so that every feature is numeric
df_merged = pd.get_dummies(data=df_merged)
df_merged.head(n=10)

Step 4 - Applying the ML Models

In [None]:
# Split the merged frame back into its train and test parts by row position:
# the first train_idx rows are the training set, the rest is the test set
train, test = df_merged.iloc[:train_idx], df_merged.iloc[train_idx:]

# Report how many columns (variables) and rows (entries) each part has
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape

print("--- Dataset Train ---")
print("Variáveis:\t{}\nEntradas:\t{}\n".format(train_cols, train_rows))

print("--- Dataset Test ---")
print("Variáveis:\t{}\nEntradas:\t{}".format(test_cols, test_rows))

In [None]:
# Separate the target column from the features for both sets:
# X_* are plain arrays of feature values, Y_* keep the outcome codes as Series
Y_train, Y_test = train["outcome"], test["outcome"]

train_features = train.drop(columns=["outcome"])
test_features = test.drop(columns=["outcome"])

X_train = train_features.values
X_test = test_features.values

In [None]:
# Random Forest
# random_state pins the random sampling done while growing the trees, so that
# rerunning the notebook yields the same model and the same scores.
random_forest = RandomForestClassifier(n_estimators=150, min_samples_leaf=3, max_features=0.5,
                                       n_jobs=-1, random_state=42)
random_forest.fit(X_train, Y_train)

# Accuracy on the rows the model was fitted on. This is an optimistic figure;
# it is kept under its original name because the final ranking cell reads it.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print('Acurácia do modelo RandomForestClassifier:',acc_random_forest,"\n")

# Accuracy on the held-out test set, the figure that reflects generalisation
acc_random_forest_test = round(random_forest.score(X_test, Y_test) * 100, 2)
print('Acurácia em teste do modelo RandomForestClassifier:', acc_random_forest_test, "\n")

Y_pred1 = random_forest.predict(X_test)

# Confusion matrix on the test set (rows: true class, columns: predicted class)
print(pd.crosstab(Y_test,Y_pred1,
                  rownames=["Real"], 
                  colnames=["Predict"], 
                  margins=True))

In [None]:
# Decision Tree
# random_state makes the tree construction reproducible across reruns.
decision_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
decision_tree.fit(X_train, Y_train)

Y_pred2 = decision_tree.predict(X_test)

# Accuracy on the rows the model was fitted on. This is an optimistic figure;
# it is kept under its original name because the final ranking cell reads it.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
print('Acurácia do modelo DecisionTreeClassifier:',acc_decision_tree, "\n")

# Accuracy on the held-out test set, the figure that reflects generalisation
acc_decision_tree_test = round(decision_tree.score(X_test, Y_test) * 100, 2)
print('Acurácia em teste do modelo DecisionTreeClassifier:', acc_decision_tree_test, "\n")

# Confusion matrix on the test set (rows: true class, columns: predicted class)
print(pd.crosstab(Y_test,Y_pred2,
                  rownames=["Real"], 
                  colnames=["Predict"], 
                  margins=True))

In [None]:
# KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)

Y_pred3 = knn.predict(X_test)

# Accuracy on the rows the model was fitted on. This is an optimistic figure;
# it is kept under its original name because the final ranking cell reads it.
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
print('Acurácia do modelo KNeighborsClassifier:',acc_knn, "\n")

# Accuracy on the held-out test set, the figure that reflects generalisation
acc_knn_test = round(knn.score(X_test, Y_test) * 100, 2)
print('Acurácia em teste do modelo KNeighborsClassifier:', acc_knn_test, "\n")

# Confusion matrix on the test set (rows: true class, columns: predicted class)
print(pd.crosstab(Y_test,Y_pred3,
                  rownames=["Real"], 
                  colnames=["Predict"], 
                  margins=True))

In [None]:
# Logistic Regression
# NOTE(review): warnings are silenced at the top of the notebook, so a
# convergence warning from the default solver settings would go unnoticed --
# consider inspecting logreg.n_iter_ after fitting.
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

# Accuracy on the rows the model was fitted on. This is an optimistic figure;
# it is kept under its original name because the final ranking cell reads it.
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
print('Acurácia do modelo LogisticRegression:',acc_log, "\n")

# Accuracy on the held-out test set, the figure that reflects generalisation
acc_log_test = round(logreg.score(X_test, Y_test) * 100, 2)
print('Acurácia em teste do modelo LogisticRegression:', acc_log_test, "\n")

Y_pred4 = logreg.predict(X_test)

# Confusion matrix on the test set (rows: true class, columns: predicted class)
print(pd.crosstab(Y_test,Y_pred4,
                  rownames=["Real"], 
                  colnames=["Predict"], 
                  margins=True))


In [None]:
# Final ranking: models ordered from the highest to the lowest score.
# The acc_* values are accuracies measured on the training data by the model cells.
model_names = ['Random Forest', 'Logistic Regression', 'KNN', 'Decision Tree']
model_scores = [acc_random_forest, acc_log, acc_knn, acc_decision_tree]
results = pd.DataFrame({'Model': model_names, 'Score': model_scores})

# Sort by score and use the score itself as the row label of the displayed table
result_df = results.sort_values(by='Score', ascending=False).set_index('Score')
result_df.head(9)