In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statistics

## Loading Data

In [2]:
# Read the training CSV into a DataFrame, then preview its first rows
train_csv_path = 'data/train_data.csv'
train_data = pd.read_csv(train_csv_path)
train_data.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


## Data Pre-Processing

In [3]:
#Data Types of Different Columns
# Displays the dtype pandas assigned to every column of train_data.
# Columns listed as `object` are the ones handled as categorical later on.
train_data.dtypes

case_id                                int64
Hospital_code                          int64
Hospital_type_code                    object
City_Code_Hospital                     int64
Hospital_region_code                  object
Available Extra Rooms in Hospital      int64
Department                            object
Ward_Type                             object
Ward_Facility_Code                    object
Bed Grade                            float64
patientid                              int64
City_Code_Patient                    float64
Type of Admission                     object
Severity of Illness                   object
Visitors with Patient                  int64
Age                                   object
Admission_Deposit                    float64
Stay                                  object
dtype: object

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# A count lower than the other columns' counts signals missing values in that column.
train_data.describe()

Unnamed: 0,case_id,Hospital_code,City_Code_Hospital,Available Extra Rooms in Hospital,Bed Grade,patientid,City_Code_Patient,Visitors with Patient,Admission_Deposit
count,318438.0,318438.0,318438.0,318438.0,318325.0,318438.0,313906.0,318438.0,318438.0
mean,159219.5,18.318841,4.771717,3.197627,2.625807,65747.579472,7.251859,3.284099,4880.749392
std,91925.276848,8.633755,3.102535,1.168171,0.873146,37979.93644,4.745266,1.764061,1086.776254
min,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1800.0
25%,79610.25,11.0,2.0,2.0,2.0,32847.0,4.0,2.0,4186.0
50%,159219.5,19.0,5.0,3.0,3.0,65724.5,8.0,3.0,4741.0
75%,238828.75,26.0,7.0,4.0,3.0,98470.0,8.0,4.0,5409.0
max,318438.0,32.0,13.0,24.0,4.0,131624.0,38.0,32.0,11008.0


In [5]:
# Count the missing entries in each column
train_data.isnull().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

In [6]:
# Replace missing values with 0 in the two columns that contain nulls
for code_column in ('Bed Grade', 'City_Code_Patient'):
    train_data[code_column] = train_data[code_column].fillna(0)

In [7]:
# Repeat the per-column missing-value count; every entry should now be 0
train_data.isnull().sum()

case_id                              0
Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
patientid                            0
City_Code_Patient                    0
Type of Admission                    0
Severity of Illness                  0
Visitors with Patient                0
Age                                  0
Admission_Deposit                    0
Stay                                 0
dtype: int64

In [8]:
#Dropping irrelevant columns
# Rebinding the name (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution is a no-op rather than a KeyError.
train_data = train_data.drop(columns=['case_id', 'patientid'], errors='ignore')

In [9]:
# Frequency of each category in the Stay column
train_data.Stay.value_counts()

21-30                 87491
11-20                 78139
31-40                 55159
51-60                 35018
0-10                  23604
41-50                 11743
71-80                 10254
More than 100 Days     6683
81-90                  4838
91-100                 2765
61-70                  2744
Name: Stay, dtype: int64

In [10]:
#Replace the 'More than 100 Days' category with a shorter label that matches the other buckets
# Assign the result back to the column: replace(..., inplace=True) on train_data['Stay'] acts on an
# intermediate Series, which pandas warns about and which leaves the frame unchanged under Copy-on-Write.
train_data['Stay'] = train_data['Stay'].replace('More than 100 Days', '100+')

In [11]:
# Re-check the category counts: the 'More than 100 Days' rows should now be listed as '100+'
train_data['Stay'].value_counts()

21-30     87491
11-20     78139
31-40     55159
51-60     35018
0-10      23604
41-50     11743
71-80     10254
100+       6683
81-90      4838
91-100     2765
61-70      2744
Name: Stay, dtype: int64

## Feature Engineering

In [12]:
# Split the columns by dtype: 'object' columns are categorical, everything else numerical
categorical_columns, numerical_columns = [], []

for column in train_data.columns:
    if train_data[column].dtype == 'object':
        categorical_columns.append(column)
    else:
        numerical_columns.append(column)

print(categorical_columns)
print(numerical_columns)

['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age', 'Stay']
['Hospital_code', 'City_Code_Hospital', 'Available Extra Rooms in Hospital', 'Bed Grade', 'City_Code_Patient', 'Visitors with Patient', 'Admission_Deposit']


In [13]:
#Some columns in the numerical columns list are in reality more like categorical variables
for code_column in ['Bed Grade', 'Hospital_code', 'City_Code_Hospital', 'City_Code_Patient']:
    # The membership check keeps the cell idempotent: re-running it must not add duplicates
    if code_column not in categorical_columns:
        categorical_columns.append(code_column)

In [14]:
#Remove the above columns from the numerical columns list
# Filtering (rather than list.remove) keeps the cell re-runnable: list.remove raises
# ValueError once a name has already been taken out.
reclassified_columns = {'Bed Grade', 'Hospital_code', 'City_Code_Hospital', 'City_Code_Patient'}
numerical_columns = [name for name in numerical_columns if name not in reclassified_columns]
numerical_columns

['Available Extra Rooms in Hospital',
 'Visitors with Patient',
 'Admission_Deposit']

In [15]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column in place.
# One fitted encoder is kept per column so any column (including the 'Stay' target) can be
# decoded later with label_encoders[column].inverse_transform(...). Refitting a single shared
# encoder would discard every mapping except the last one.
# LabelEncoder assigns codes by the sorted order of the labels, so treat the codes as
# identifiers rather than as an ordinal scale.
label_encoders = {}

for column in categorical_columns:
    label_encoder = LabelEncoder()
    train_data[column] = label_encoder.fit_transform(train_data[column])
    label_encoders[column] = label_encoder

In [16]:
# Display the encoded categorical columns (all rows, selected columns)
train_data.loc[:, categorical_columns]

Unnamed: 0,Hospital_type_code,Hospital_region_code,Department,Ward_Type,Ward_Facility_Code,Type of Admission,Severity of Illness,Age,Stay,Bed Grade,Hospital_code,City_Code_Hospital,City_Code_Patient
0,2,2,3,2,5,0,0,5,0,2,7,2,7
1,2,2,3,3,5,1,0,5,5,2,1,4,7
2,4,0,1,3,4,1,0,5,4,2,9,0,7
3,1,1,3,2,3,1,0,5,5,2,25,1,7
4,1,1,3,3,3,1,0,5,5,2,25,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
318433,0,0,3,1,5,0,2,4,2,4,5,5,22
318434,0,0,1,1,4,2,2,8,4,4,23,0,8
318435,0,0,2,2,5,0,1,7,2,4,6,3,10
318436,1,1,1,1,3,1,1,1,2,3,10,1,8


In [17]:
# Standardise the numerical feature columns with a StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

numeric_values = train_data[numerical_columns].to_numpy()
train_data[numerical_columns] = scaler.fit_transform(numeric_values)

In [18]:
# Preview the first rows after encoding and scaling
train_data.head()

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,7,2,2,2,-0.169177,3,2,5,2,7,0,0,-0.727923,5,0.027835,0
1,1,2,4,2,-1.025217,3,3,5,2,7,1,0,-0.727923,5,0.987556,5
2,9,4,0,0,-1.025217,1,3,4,2,7,1,0,-0.727923,5,-0.12491,4
3,25,1,1,1,-1.025217,3,2,3,2,7,1,0,-0.727923,5,2.200319,5
4,25,1,1,1,-1.025217,3,3,3,2,7,1,0,-0.727923,5,0.623175,5


In [19]:
# Separate the target column from the feature matrix
target_column = 'Stay'
X = train_data.drop(columns=[target_column])
y = train_data[target_column]

## Logistic Regression Model

In [20]:
from sklearn.linear_model import LogisticRegression
# Build an unfitted logistic-regression classifier. max_iter is set to 5000,
# presumably to give the solver room to converge on this dataset — TODO confirm.
lr_classifier = LogisticRegression(max_iter=5000)
# Bare expression so the notebook displays the estimator's repr
lr_classifier

LogisticRegression(max_iter=5000)

In [21]:
# Fit the classifier on the feature matrix X and target y
# (fit returns the estimator, which is what the cell displays)
lr_classifier.fit(X, y)

LogisticRegression(max_iter=5000)

In [22]:
# Mean accuracy (scikit-learn's classifier `score`) on the same X, y the model was fitted on,
# so this is training-set performance and not an estimate of accuracy on unseen data.
lr_classifier.score(X, y)

0.378689101175111