In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import cufflinks as cf


In [None]:
# Load the training split from the Kaggle input directory and preview its first rows.
train = pd.read_csv('../input/av-healthcare-analytics-ii/healthcare/train_data.csv')
train.head()

In [None]:
# Summarise column dtypes and non-null counts.
train.info()
# Observed by the author: 318438 rows
# Observed by the author: 18 columns

DATA CLEANING

In [None]:
# Plot the boolean null mask of the training frame to locate missing values.
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')
# The City_Code_Patient column contains null values; it is dropped further down instead of being imputed.

Information about the attributes in the dataset
0   case_id	                 Case_ID registered in Hospital
1	Hospital_code	         Unique code for the Hospital
2	Hospital_type_code	     Unique code for the type of Hospital
3	City_Code_Hospital	     City Code of the Hospital
4	Hospital_region_code     Region Code of the Hospital
5	Available Extra Rooms in Hospital	Number of Extra rooms available in the Hospital
6	Department	             Department overlooking the case
7	Ward_Type	             Code for the Ward type
8	Ward_Facility_Code	     Code for the Ward Facility
9	Bed Grade	             Condition of Bed in the Ward
10	patientid	             Unique Patient Id
11	City_Code_Patient	     City Code for the patient
12	Type of Admission	     Admission Type registered by the Hospital
13	Severity of Illness	     Severity of the illness recorded at the time of admission
14	Visitors with Patient	 Number of Visitors with the patient
15	Age	                     Age of the patient
16	Admission_Deposit	     Deposit at the Admission Time
17	Stay	                 Stay Days by the patient

DATA VISUALIZATION

In [None]:
# List the distinct ward-type codes present in the training data.
train['Ward_Type'].unique()

In [None]:
def change(ch):
    """Encode a ward-type letter as an integer code.

    Mapping: 'R' -> 0, 'S' -> 1, 'Q' -> 2, 'P' -> 3, 'T' -> 4, 'U' -> 5.
    Any value outside this set yields None.
    """
    ward_codes = {'R': 0, 'S': 1, 'Q': 2, 'P': 3, 'T': 4, 'U': 5}
    return ward_codes.get(ch)

In [None]:
# Replace the ward-type letters with the integer codes returned by change().
train['Ward_Type']=train['Ward_Type'].apply(change)
# Only numerical data will be accepted by the machine learning model

In [None]:
# Remove the column that contains null values (rebinding instead of mutating in place).
train = train.drop(columns='City_Code_Patient')

In [None]:
# Re-plot the null mask after dropping City_Code_Patient.
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')
# Author's observation: no null values remain.

In [None]:
# Mean bed grade per patient, highest first (top 5).
train.groupby('patientid')['Bed Grade'].mean().sort_values(ascending=False).head()

The highest average bed grade per patient was 4 out of 5.

In [None]:
# Number of rows per bed grade, most frequent first.
train.groupby('Bed Grade')['Bed Grade'].count().sort_values(ascending=False).head()

Even though grade-4 beds exist, most admissions used grade-2 beds (a possible reason is affordability, e.g. unemployment).

In [None]:
# Count admissions per admission type, split by department.
# The columns are passed by keyword with data=train: recent seaborn releases treat the
# first positional argument of countplot as `data`, so passing the Series positionally
# no longer selects the x variable.
sns.countplot(data=train,x='Type of Admission',hue='Department',palette='rainbow')

Most of the patients admitted in the Emergency, Trauma or Urgent category were assigned to the gynecology department.
There were very few admissions for surgery.

In [None]:
# Distribution of the Age column.
train['Age'].hist(bins=20,figsize=(10,4))

The largest cluster of patients falls in the 31-40 age interval.
Very few patients (outliers) aged 91-100 were admitted.


In [None]:
def change1(ch):
    """Encode a severity-of-illness label as an integer code.

    Mapping: 'Extreme' -> 0, 'Minor' -> 1, 'Moderate' -> 2.
    Note that the codes do not follow the order of severity.
    Any value outside this set yields None.
    """
    severity_codes = {'Extreme': 0, 'Minor': 1, 'Moderate': 2}
    return severity_codes.get(ch)

In [None]:
# Replace the severity labels with the integer codes returned by change1().
train['Severity of Illness'] = train['Severity of Illness'].apply(change1)

In [None]:
# Count cases per encoded severity level, split by department.
# The columns are passed by keyword with data=train: recent seaborn releases treat the
# first positional argument of countplot as `data`, so passing the Series positionally
# no longer selects the x variable.
sns.countplot(data=train,x='Severity of Illness',hue='Department',palette='rainbow')

There were a good number of serious cases in the radiotherapy department.
Even though the gynecology department had the most admissions, most of them were moderate cases and only a few were extreme or minor.

In [None]:
# Distribution of the admission deposit, using 100 bins.
train['Admission_Deposit'].hist(bins=100,figsize=(10,4))

The distribution looks approximately normal, with most deposit amounts between 2000 and 8000.
Only a few patients had to deposit an amount of 100000.

In [None]:
# Mean admission deposit per encoded severity level, highest first.
# The index shows the codes assigned by change1(): 0 = Extreme, 1 = Minor, 2 = Moderate.
train.groupby('Severity of Illness')['Admission_Deposit'].mean().sort_values(ascending=False)

Average admission deposit by severity of illness, using the encoded values:
0 denotes Extreme,
1 denotes Minor,
2 denotes Moderate.
For minor cases the average deposit is higher than for the other severity levels.



In [None]:
# Distribution of the Stay column (not yet integer-encoded at this point in the notebook).
train['Stay'].hist(bins=20,figsize=(10,4))

Many patients stayed for about 21-30 days.
Only a few patients stayed for more than 100 days.

In [None]:
# Discard the ward-facility, hospital-region and hospital-type code columns.
unused_code_columns = ['Ward_Facility_Code','Hospital_region_code','Hospital_type_code']
train = train.drop(columns=unused_code_columns)

In [None]:
# Discard the Age column; it is not used as a model feature.
train = train.drop(columns='Age')

In [None]:
def change3(ch):
    """Encode a department name as an integer code.

    Mapping: 'radiotherapy' -> 0, 'anesthesia' -> 1, 'gynecology' -> 2,
    'TB & Chest disease' -> 3, 'surgery' -> 4.
    Any value outside this set yields None.
    """
    department_codes = {
        'radiotherapy': 0,
        'anesthesia': 1,
        'gynecology': 2,
        'TB & Chest disease': 3,
        'surgery': 4,
    }
    return department_codes.get(ch)

In [None]:
# Replace the department names with the integer codes returned by change3().
train['Department'] = train['Department'].apply(change3)

In [None]:
def change4(ch):
    """Encode an admission-type label as an integer code.

    Mapping: 'Emergency' -> 0, 'Trauma' -> 1, 'Urgent' -> 2.
    Any value outside this set yields None.
    """
    admission_codes = {'Emergency': 0, 'Trauma': 1, 'Urgent': 2}
    return admission_codes.get(ch)

In [None]:
# Replace the admission-type labels with the integer codes returned by change4().
train['Type of Admission'] = train['Type of Admission'].apply(change4)

In [None]:
# Convert the Stay labels to integer category codes.
# NOTE(review): the author states the codes span 0-9; if the data also has a
# "more than 100 days" bucket there would be 11 codes (0-10) — confirm.
train['Stay']=train.Stay.astype("category").cat.codes
# Codes are assigned in the sorted order of the category labels.

In [None]:
# List the distinct encoded Stay values.
train['Stay'].unique()
# Author's observation: ten different categories in total (TODO confirm against the output).

PREDICTION OF STAY OF PATIENTS IN THE HOSPITAL

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Discard the hospital city code and the visitor count before modelling.
train = train.drop(columns=['City_Code_Hospital','Visitors with Patient'])

In [None]:
# Discard the Bed Grade column before modelling.
train = train.drop(columns='Bed Grade')

In [None]:
#Train data and test data(to predict number of stay)
X = train.drop('Stay',axis=1,inplace=False)
Y = train['Stay']


In [None]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so the split is reproducible.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=42)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier using the 2 closest training rows.
# NOTE(review): no feature scaling appears in the visible cells, so columns with
# large magnitudes will dominate the distance — confirm this is acceptable.
kn = KNeighborsClassifier(n_neighbors=2)

In [None]:
# Fit the classifier on the 70% training split.
kn.fit(X_train,Y_train)

In [None]:
# Accuracy (in percent) on the rows the model was fitted on; this is an optimistic figure.
print(kn.score(X_train,Y_train)*100)
# Accuracy (in percent) on the held-out split, which was created above but never evaluated.
# This is the number that estimates performance on unseen data.
print(kn.score(X_test,Y_test)*100)

Accuracy on the training data is about 61%.

TESTING THE TRAINED MODEL FOR TEST DATA

In [None]:
# Load the unlabeled test split from the Kaggle input directory.
test = pd.read_csv('../input/av-healthcare-analytics-ii/healthcare/test_data.csv')

In [None]:
# Display the raw test frame.
test

In [None]:
# Plot the boolean null mask of the test frame to locate missing values.
sns.heatmap(test.isnull(),yticklabels=False,cbar=False,cmap='viridis')
# City_Code_Patient contains null values, so it is removed in the next cell.

In [None]:
# Remove from the test frame the same columns that were dropped from the training frame.
dropped_test_columns = [
    'Hospital_type_code',
    'City_Code_Hospital',
    'Hospital_region_code',
    'Ward_Facility_Code',
    'Bed Grade',
    'City_Code_Patient',
    'Visitors with Patient',
    'Age',
]
test = test.drop(columns=dropped_test_columns)

In [None]:
# Re-plot the null mask of the test frame after the column drop.
sns.heatmap(test.isnull(),yticklabels=False,cbar=False,cmap='viridis')

The test data is processed so that unnecessary columns are removed, and the column containing null values, which might affect accuracy, is dropped as well.

In [None]:
# Apply to the test frame the same label encoders that were used on the training frame.
for column_name, encoder in (
    ('Ward_Type', change),
    ('Severity of Illness', change1),
    ('Department', change3),
    ('Type of Admission', change4),
):
    test[column_name] = test[column_name].apply(encoder)

In [None]:
# Display the encoded test frame.
test
# Author's observation: all columns are numeric, so the frame is ready to be fed into the algorithm.

In [None]:
#Prediction of values for the stay
predict1 = kn.predict(test)

In [None]:
# Store the predicted codes in a new 'predict' column of the test frame.
test['predict'] = predict1

In [None]:
# Map the integer codes predicted by the model back to stay-range labels.
# Fix: the original literal used key 8 twice, so '81-90' was silently overwritten
# by '91-100' and code 9 had no label at all.
key_value={
0:'0-10',
1:'11-20',
2:'21-30',
3:'31-40',
4:'41-50',
5:'51-60',
6:'61-70',
7:'71-80',
8:'81-90',
9:'91-100',
# NOTE(review): assumes the training data has an 11th bucket labelled exactly
# 'More than 100 Days' (category code 10) — confirm against the raw Stay values.
10:'More than 100 Days'
}
# Bracket access avoids any clash between the 'predict' column and a DataFrame attribute.
test['value'] = test['predict'].replace(key_value)

In [None]:
# Extract the case ids and the decoded stay labels as NumPy arrays.
values_arr = np.array(test['case_id'])
predict = np.array(test['value'])

In [None]:
#Final DataFrame
df = pd.DataFrame(data=[values_arr,predict],index=['case_id','Stay'])

THE FINAL DATAFRAME, which consists of the case_id and the predicted length of stay of each patient.
The model reaches about 61% accuracy on the training data.

In [None]:
# Display the final prediction frame.
df

Feel free to leave a comment if you have any feedback!