# Task for Today  

***

## Horse Survival Prediction  

Given *medical data about horses*, let's try to predict whether a given horse will **survive** or not.  
  
We will use a decision tree classifier and a random forest classifier to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the horse colic CSV into a DataFrame (relative path; adjust it when
# running outside the environment this notebook was written for).
data = pd.read_csv('../input/horse-colic/horse.csv')

In [None]:
# Display the raw DataFrame as the cell output.
data

In [None]:
# Print each column's dtype and non-null count to see which columns have
# missing values and which are categorical (object dtype).
data.info()

# Preprocessing

In [None]:
# Inspect the raw 'mucous_membrane' column. This reads from `data` because `X`
# does not exist until preprocess_inputs() is called further down, so
# referencing `X` here raises NameError when the notebook runs top to bottom.
data['mucous_membrane']

In [None]:
def binary_encode(df, columns, positive_values):
    """Return a copy of ``df`` with the given columns turned into 0/1 flags.

    ``columns`` and ``positive_values`` are walked in lockstep: in each listed
    column, entries equal to the paired positive value become 1 and everything
    else (including missing entries) becomes 0. The input frame is not
    modified.
    """
    encoded = df.copy()
    for name, target in zip(columns, positive_values):
        def to_flag(value, target=target):
            # `target` is bound as a default so the helper always compares
            # against the positive value of the column being processed.
            if value == target:
                return 1
            return 0

        encoded[name] = encoded[name].apply(to_flag)
    return encoded

def ordinal_encode(df, columns, orderings):
    """Return a copy of ``df`` with ordered categories replaced by their rank.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : sequence of column labels
        Columns to encode.
    orderings : sequence of sequences
        For each column (paired positionally with ``columns``), the category
        values listed from lowest to highest. A value is encoded as its
        zero-based position in that list.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose listed columns hold integer ranks.

    Raises
    ------
    ValueError
        If a column contains a value (including a missing value) that does not
        appear in its ordering. The message names the column and the offending
        values so the ordering can be corrected.
    """
    df = df.copy()
    for column, ordering in zip(columns, orderings):
        # Build the value -> rank table once per column instead of scanning
        # the ordering for every row. setdefault keeps the first position of
        # a repeated category, matching list.index semantics.
        ranks = {}
        for position, category in enumerate(ordering):
            ranks.setdefault(category, position)

        encoded = df[column].map(ranks)

        # Anything that did not map is either missing or not in the ordering.
        unknown = df.loc[encoded.isna(), column]
        if len(unknown) > 0:
            raise ValueError(
                f"Column {column!r} contains values outside its ordering "
                f"{list(ordering)!r}: {unknown.unique().tolist()!r}"
            )

        df[column] = encoded.astype('int64')
    return df

def onehot_encode(df, columns, prefixes):
    """Return a copy of ``df`` with the given columns one-hot encoded.

    ``columns`` and ``prefixes`` are walked in lockstep. Each listed column is
    expanded with ``pd.get_dummies`` using its paired prefix, the indicator
    columns are appended on the right, and the source column is removed. The
    input frame is not modified.
    """
    result = df.copy()
    for name, tag in zip(columns, prefixes):
        indicator_columns = pd.get_dummies(result[name], prefix=tag)
        # Append the indicators first, then remove the source column.
        result = pd.concat([result, indicator_columns], axis=1).drop(name, axis=1)
    return result

In [None]:
def preprocess_inputs(df):
    """Impute, encode and scale the raw horse data and split it into X and y.

    Steps, in order:

    1. Fill missing values: numeric columns with their mean, non-nominal
       object columns with their mode. Object-dtype nominal columns are left
       as-is so that a missing entry becomes an all-zero row of indicators
       when one-hot encoded.
    2. Encode binary, ordinal and nominal features with ``binary_encode``,
       ``ordinal_encode`` and ``onehot_encode``.
    3. Map the ``outcome`` label to integers (lived=0, died=1, euthanized=2).
    4. Standardize every feature column with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the feature columns named below and an
        ``outcome`` column. It is not modified.

    Returns
    -------
    X : pandas.DataFrame
        Scaled feature matrix, indexed like ``df``.
    y : pandas.Series
        Integer-encoded outcome, indexed like ``df``.

    Notes
    -----
    Imputation statistics and the scaler are fitted on every row passed in.
    If the result is split into train and test sets afterwards, the test rows
    have influenced those statistics.
    """
    df = df.copy()

    # Categorical features grouped by how they are encoded
    binary_features = [
        'surgery',
        'age',
        'surgical_lesion',
        'cp_data'
    ]
    positive_values = [
        'yes',
        'adult',
        'yes',
        'yes'
    ]

    ordinal_features = [
        'temp_of_extremities',
        'peripheral_pulse',
        'capillary_refill_time',
        'pain',
        'peristalsis',
        'abdominal_distention',
        'nasogastric_tube',
        'nasogastric_reflux',
        'rectal_exam_feces'
    ]
    orderings = [
        ['cold', 'cool', 'normal', 'warm'],
        ['absent', 'reduced', 'normal', 'increased'],
        ['less_3_sec', '3', 'more_3_sec'],
        ['alert', 'depressed', 'mild_pain', 'severe_pain', 'extreme_pain'],
        ['absent', 'hypomotile', 'normal', 'hypermotile'],
        ['none', 'slight', 'moderate', 'severe'],
        ['none', 'slight', 'significant'],
        ['none', 'less_1_liter', 'more_1_liter'],
        ['absent', 'decreased', 'normal', 'increased']
    ]

    # NOTE(review): 'hospital_number' looks like an identifier; one-hot
    # encoding it adds one column per distinct value -- confirm this is intended.
    nominal_features = [
        'hospital_number',
        'mucous_membrane',
        'abdomen',
        'abdomo_appearance'
    ]
    prefixes = [
        'HN',
        'MM',
        'AB',
        'AA'
    ]

    # Fill missing values. The object-column lookup is computed once rather
    # than on every loop iteration.
    object_columns = set(df.select_dtypes('object').columns)
    for column in df.columns:
        if column in object_columns:
            # Nominal object columns keep their NaNs; get_dummies skips them.
            if column not in nominal_features:
                df[column] = df[column].fillna(df[column].mode()[0])
        else:
            df[column] = df[column].fillna(df[column].mean())

    # Encode categorical feature columns
    df = binary_encode(df, columns=binary_features, positive_values=positive_values)
    df = ordinal_encode(df, columns=ordinal_features, orderings=orderings)
    df = onehot_encode(df, columns=nominal_features, prefixes=prefixes)

    # Encode labels. map() yields a numeric column directly; replace() on an
    # object column depends on silent downcasting, which pandas has deprecated.
    label_mapping = {'lived': 0, 'died': 1, 'euthanized': 2}
    df['outcome'] = df['outcome'].map(label_mapping)

    # Split df into X and y
    y = df['outcome'].copy()
    X = df.drop('outcome', axis=1).copy()

    # Scale X with a standard scaler. The original index is carried over so X
    # stays row-aligned with y even when df does not use a default RangeIndex.
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

    return X, y

In [None]:
# Build the scaled feature matrix X and the integer-encoded outcome vector y
# from the raw data.
X, y = preprocess_inputs(data)

In [None]:
# Map each object-dtype column of X to the list of its unique values.
# NOTE(review): preprocess_inputs rebuilds X from StandardScaler output, so X
# appears to have no object columns at this point and this likely evaluates to
# {} -- confirm whether this cell was meant to inspect the raw `data` instead.
{column: list(X[column].unique()) for column in X.select_dtypes('object').columns}

In [None]:
# Display the preprocessed (encoded and scaled) feature matrix.
X

In [None]:
# Count rows per outcome class (0 = lived, 1 = died, 2 = euthanized) to see
# how balanced the classes are.
y.value_counts()

# Training

In [None]:
# Use 70% of the rows for training and hold out the rest for evaluation;
# random_state fixes the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)

In [None]:
# Fit a single decision tree. The seed is fixed (same value as the train/test
# split) so the reported accuracy is reproducible between runs; an unseeded
# tree can break ties between equally good splits differently each time.
model = DecisionTreeClassifier(random_state=123)
model.fit(X_train, y_train)

# score() returns mean accuracy on the held-out rows as a fraction in [0, 1].
print(f"Decision Tree Accuracy: {model.score(X_test, y_test) * 100:.2f}%")

In [None]:
# Fit a random forest. The seed is fixed (same value as the train/test split)
# so the bootstrap samples and feature subsets, and therefore the reported
# accuracy, are reproducible between runs.
ensemble_model = RandomForestClassifier(random_state=123)
ensemble_model.fit(X_train, y_train)

# score() returns mean accuracy on the held-out rows as a fraction in [0, 1].
print(f"Random Forest Accuracy: {ensemble_model.score(X_test, y_test) * 100:.2f}%")

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/oXUDU101e2c