In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


from sklearn.model_selection import train_test_split
# Path of the training CSV inside the Kaggle input directory.
train_csv_path = "/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data.csv"
data = pd.read_csv(train_csv_path)
# A bare expression on the last line of a cell is rendered as the cell's output.
data

## Our Task
Our goal is to predict how long a patient will stay in hospital, based on some information collected on admission.


## Different Types of Machine Learning
We are using historical data which is labelled with the outcome we're trying to predict; this is called Supervised Machine Learning. There are other types of machine learning for building models without these labels.

## Training Data
We will build a model using training data which includes both the information collected on admission and the
stay length which we would like to be able to predict. We will use some of our data for building the model and some for evaluating its performance.


In [None]:
# Hold back part of the data so the model can be evaluated on rows it never saw while fitting.
# A fixed random_state makes the split, and therefore every error figure computed from it,
# identical on each run of the notebook.
training_data, testing_data = train_test_split(data, random_state=42)
# Take explicit copies: later cells assign new columns to both frames, and independent copies
# keep those writes from being treated as writes into a slice of `data`.
training_data = training_data.copy()
testing_data = testing_data.copy()


## Our Training Data

## Making Values Numeric
I convert stay, age and severity to numeric values to make it possible to calculate correlations.




In [None]:
# 
def stayToNumber(stay):
    """Translate a "Stay" bucket label into a representative number of days.

    The ten-day buckets '0-10' ... '91-100' map to 5, 15, ..., 95 and
    'More than 100 Days' maps to 101. Any other label raises ``KeyError``.
    """
    bucket_labels = ('0-10', '11-20', '21-30', '31-40', '41-50',
                     '51-60', '61-70', '71-80', '81-90', '91-100')
    # Pair each bucket with 5, 15, ..., 95 in order.
    days_by_label = dict(zip(bucket_labels, range(5, 100, 10)))
    days_by_label['More than 100 Days'] = 101
    return days_by_label[stay]

def ageToNumber(age):
    """Translate an "Age" band label into a representative age in years.

    The bands '0-10' ... '91-100' map to 5, 15, ..., 95. Any other label
    raises ``KeyError``.
    """
    age_bands = ['0-10', '11-20', '21-30', '31-40', '41-50',
                 '51-60', '61-70', '71-80', '81-90', '91-100']
    # The band at position k (0-based) is represented by 10 * k + 5.
    years_by_band = {band: 10 * position + 5 for position, band in enumerate(age_bands)}
    return years_by_band[age]

def severityToNumber(severity):
    """Translate a severity label into an ordinal code.

    'Minor' -> 1, 'Moderate' -> 2, 'Extreme' -> 3. Any other label raises
    ``KeyError``.
    """
    levels_in_order = ('Minor', 'Moderate', 'Extreme')
    code_by_level = {level: code for code, level in enumerate(levels_in_order, start=1)}
    return code_by_level[severity]

# Apply the same three encodings to the training frame first, then the testing frame.
# "Stay" and "Age" are overwritten in place, while the severity code goes into a new
# "Severity" column next to the original "Severity of Illness" text.
for frame in (training_data, testing_data):
    frame["Stay"] = frame["Stay"].apply(stayToNumber)
    frame["Age"] = frame["Age"].apply(ageToNumber)
    frame["Severity"] = frame["Severity of Illness"].apply(severityToNumber)

In [None]:
# Drop a fixed set of columns from the training frame to keep this example small.
ignored_columns = [
    'case_id', 'Ward_Type', 'Admission_Deposit', 'City_Code_Hospital',
    'Hospital_code', 'Hospital_type_code', 'Hospital_region_code',
    'Ward_Facility_Code', 'City_Code_Patient', 'Visitors with Patient',
    'Bed Grade', 'Available Extra Rooms in Hospital',
]
training_data = training_data.drop(columns=ignored_columns)

## Data Exploration
I'm going to draw some histograms to get a better idea of the data we're working with.

It's important that your training data is representative of the real world. For example we could not build a model which would work for all ages if we only have data for 20 year olds. Collecting an appropriate data set for training can be one of the biggest challenges when building models.

In [None]:
# Leave the patient identifier out of the histograms, then draw one histogram per remaining
# numeric column.
histogram_columns = training_data.drop(columns=['patientid'])
histogram_columns.hist()

## Feature Selection
We want to choose which 'features' of the data we will use to train the model. To help, I've made a chart of how strongly each one correlates with stay length.

Patient ID has a very low correlation with stay length because it's random.







In [None]:
from matplotlib.pyplot import bar
import matplotlib.pyplot as plt

# Correlation of every numeric column with "Stay". numeric_only=True is needed because the frame
# still holds text columns (e.g. "Severity of Illness"); without it DataFrame.corr() raises on
# pandas >= 2.0 instead of silently skipping them.
trainingDataCorrelation = training_data.corr(numeric_only=True)["Stay"]
# Remove the target's trivial correlation with itself and sort once for plotting.
featureCorrelation = trainingDataCorrelation.drop(["Stay"]).sort_values()

axes = plt.axes()
# NOTE: the y-range starts at 0, so any negative correlation is not visible on this chart.
axes.set_ylim([0, 1])
axes.set_ylabel('Correlation with Stay')
bar(featureCorrelation.index, featureCorrelation)


## Let's take a closer look at this correlation
I graph Age vs Stay Length, adding a little jitter so that overlapping points are easier to see.


In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import scatter

# Age and Stay were encoded to a handful of distinct values, so points pile up on top of each
# other. Add Gaussian jitter (standard deviation 0.5) to both axes. Drawing all the noise for a
# column in one call keeps the result a plain float Series (a per-row np.random.randn(1) would
# produce a Series of 1-element arrays) and avoids one Python call per row.
jitter_rng = np.random.default_rng(0)  # seeded so the figure is the same on every run
jittered_age = training_data["Age"] + jitter_rng.standard_normal(len(training_data)) * 0.5
jittered_stay = training_data["Stay"] + jitter_rng.standard_normal(len(training_data)) * 0.5
scatter(jittered_age, jittered_stay, alpha=0.1)
plt.xlabel("Age (jittered)")
plt.ylabel("Stay (jittered)")

![I don't trust linear regressions when it's harder to guess the direction of the correlation from the scatter plot than to find new constellations on it.](https://imgs.xkcd.com/comics/linear_regression.png )

# Linear Regression
Linear regression is one type of machine learning model: we fit a line to our data and can then make predictions using that line.

Here, using the results of a survey asking people to rate apples and apple pies on a scale of 0-50, we can build a model to predict how much someone likes apple pies based on how much they like apples.
![](https://i.ibb.co/nQ4KmZB/Apple-Pies.png)

## Building Models to Evaluate
We're going to build two models, one which uses Age to predict stay length and the other which uses Severity

In [None]:
from sklearn.linear_model import LinearRegression

# A linear regression 'learns' the straight-line relationship between an input column and the
# stay length, and then uses that line to make predictions.
# The full training frame is used here; `training_data_sample` is only another name for it.
training_data_sample = training_data
# Fit one single-feature model per column, Age first and Severity second. Both inputs and
# target are passed as 2-D arrays (double-bracket selection followed by .values).
predictByAge, predictBySeverity = (
    LinearRegression().fit(
        training_data_sample[[feature]].values,
        training_data_sample[["Stay"]].values,
    )
    for feature in ("Age", "Severity")
)

# The Models
Here I've drawn a chart of the inputs and outputs for each model.

In [None]:
from matplotlib.pyplot import plot


import numpy as np
import matplotlib.pyplot as plt

# Input values at which each single-feature model is evaluated.
ages_to_plot = [20, 30, 40, 50, 60, 70, 80, 90]
severities_to_plot = [1, 2, 3]

# One figure with two side-by-side panels sharing a y-axis, so the predicted stay lengths of
# the two models can be compared directly. No second plt.figure() is created here: it would
# only add an empty extra figure to the cell output.
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
fig = f  # `fig` stays bound, as it was by the earlier version of this cell

# Left panel: prediction of the Age-only model. predict() expects one row per sample, hence
# the reshape to a single column; flatten() turns the (n, 1) predictions into a flat array.
ax1.set_title('Prediction by Age')
ax1.set_xlabel('Age')
ax1.set_ylabel('Predicted Stay Length (Days)')
ax1.scatter(ages_to_plot, predictByAge.predict(np.array(ages_to_plot).reshape(-1, 1)).flatten())

# Right panel: prediction of the Severity-only model.
ax2.set_title('Prediction by Severity')
ax2.set_xlabel('Severity')
ax2.set_ylabel('Predicted Stay Length (Days)')
ax2.scatter(severities_to_plot, predictBySeverity.predict(np.array(severities_to_plot).reshape(-1, 1)).flatten())


## Calculating Error
I'm going to test each model using the testing data, the data we didn't use to train our model.

There are different ways to calculate the error of our model. For this example I'm going to calculate the average of the absolute difference between the actual stay and the stay our model predicted.

In [None]:
def calculateError(classifier, classifierInput, actual=None):
    """Summarise the absolute prediction error of a fitted model.

    Parameters
    ----------
    classifier : object with a ``predict`` method
        The fitted model to evaluate.
    classifierInput : DataFrame or array-like
        Feature rows to predict for, one row per sample.
    actual : array-like, optional
        True target values, matched to the predictions by position. When
        omitted, the module-level ``testing_data["Stay"]`` is used.

    Returns
    -------
    str
        A sentence giving the mean and the sample standard deviation of
        ``|prediction - actual|``, each rounded to two decimals.
    """
    if actual is None:
        actual = testing_data["Stay"]
    # Hand the model a bare array: the models in this notebook are fitted on ``.values``, and
    # passing a DataFrame to predict() makes scikit-learn warn about unexpected feature names.
    predictions = np.asarray(classifier.predict(np.asarray(classifierInput))).ravel()
    # Compare by position, not by index label.
    absolute_error = pd.Series(np.abs(predictions - np.asarray(actual, dtype=float)))
    mean_error = round(absolute_error.mean(), 2)
    std_error = round(absolute_error.std(), 2)  # sample standard deviation (ddof=1)
    return "Mean Error is " + str(mean_error) + " with standard deviation of " + str(std_error)

# Evaluate each single-feature model on the held-out testing rows and show the summaries.
singleFeatureErrors = {}
singleFeatureErrors["Age Classifer Error"] = calculateError(predictByAge, testing_data[["Age"]])
singleFeatureErrors["Severity Error"] = calculateError(predictBySeverity, testing_data[["Severity"]])
singleFeatureErrors

# What Next?
We can experiment with different approaches to try to make a better model.

## Improving the model
Now we're going to try to improve the model. When developing models it's best to try different approaches, evaluate them, and improve iteratively.

We build a model that uses both Age and Severity as an input

As there are now 3 dimensions, I make a scatter plot of Age against Severity, using color to show the stay length the model predicts.



In [None]:
# Fit a single linear model on both encoded features at once. As with the single-feature
# models, inputs and target are passed as 2-D arrays.
two_feature_inputs = training_data_sample[["Age", "Severity"]].values
stay_target = training_data_sample[["Stay"]].values
predictByAgeAndSeverity = LinearRegression().fit(two_feature_inputs, stay_target)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from itertools import product

# Evaluate the two-feature model on a grid of ages and severity codes. Severity runs over 1-3,
# the codes produced by severityToNumber (Minor=1, Moderate=2, Extreme=3); 0 is not a code the
# model was ever trained on.
df = pd.DataFrame(list(product([20,30,40,50,60,70,80,90], [1,2,3])), columns=["Age", "Severity"])
# One predict call for the whole grid. The model was fitted on a 2-D target, so predictions
# come back with shape (n, 1); take the single column.
df["Prediction"] = predictByAgeAndSeverity.predict(df[["Age", "Severity"]].values)[:, 0]

fig = plt.figure()
ax = fig.add_subplot(111)
points = ax.scatter(df["Age"], df["Severity"], c=df["Prediction"], cmap='YlOrRd')
ax.set_xlabel('Age')
ax.set_ylabel('Severity')
ax.set_yticks([1, 2, 3])
# Without a colorbar the point colors cannot be read back as predicted stay lengths.
fig.colorbar(points, ax=ax, label='Predicted Stay Length (Days)')



## Comparing the classifiers
We reduced the mean error by about 0.1! (The exact figure depends on how the data was split into training and testing sets.)

In [None]:
# Score all three models on the same held-out testing rows so their errors can be compared.
errorComparison = {}
errorComparison["Age Classifer Error"] = calculateError(predictByAge, testing_data[["Age"]])
errorComparison["Severity Error"] = calculateError(predictBySeverity, testing_data[["Severity"]])
errorComparison["Age & Severity Error"] = calculateError(predictByAgeAndSeverity, testing_data[["Age", "Severity"]])
errorComparison


![](https://media.giphy.com/media/3KC2jD2QcBOSc/giphy.gif)

# Further Reading
Hands on Machine Learning
https://library-search.imperial.ac.uk/permalink/f/tlmp2g/44IMP_ALMA_DS51118410830001591


