In [10]:
#Import Libraries
import numpy as np
import pandas as pd
import os
import math

In [11]:
#Load the four input CSV tables from the sibling 'data' directory
input_dir = os.path.join(os.pardir, 'data')


def _read_input_csv(file_name):
    """Read one CSV file from input_dir and return it with a fresh 0..n-1 index."""
    csv_path = os.path.join(input_dir, file_name)
    frame = pd.read_csv(csv_path, nrows=None)
    return frame.reset_index(drop=True)


#HCC code -> score lookup
HCC_score_df = _read_input_csv('HCC_Code_Score.csv')
#Diagnosis code -> HCC code lookup
Diagnosis_code_HCC = _read_input_csv('Diagnosis_code_HCC.csv')
#Patient-level records
data = _read_input_csv('data.csv')
#(Age bucket, Gender) -> score lookup
Age_Gender_Score = _read_input_csv('Age_Gender_Score.csv')

In [12]:
#Preview the first five rows of the loaded patient records
data.head(n=5)

Unnamed: 0,Age,Gender,Height (cms),Weight (Kgs),Service Code,Revenue Code,Diagnosis Code,Smoking,Alcohol,Prescribed Drugs,Geography,Family History,Total No. of Readmission
0,38,Female,161,79,99202,,E11.62,Yes,Yes,Metformin,Massachusets,Yes,0
1,37,Female,155,81,S9131,421.0,E11.9,Yes,Yes,Metformin,Massachusets,No,2
2,61,Male,182,105,99213,,E11.8,No,No,Metformin,Massachusets,Yes,0
3,74,Male,177,81,99213,,E11.63,Yes,Yes,Metformin,Massachusets,No,1
4,44,Male,174,76,92226,761.0,E11.3513,Yes,Yes,Metformin,Massachusets,No,0


In [13]:
#Create 'Age_bucket' variable in data based on 'Age' provided.
#The labels must match the 'Age_bucket' values in Age_Gender_Score, because the
#two frames are later joined on ['Age_bucket', 'Gender'].
#Range buckets are lower-exclusive / upper-inclusive; ages 65-69 each get their own bucket.
#Rows whose 'Age' matches no rule (e.g. missing age) keep a NaN 'Age_bucket'.
data.loc[data['Age']<=34,'Age_bucket'] = '0-34 Years'
data.loc[((data['Age']>34) & (data['Age']<=44)),'Age_bucket'] = '35-44 Years'
data.loc[((data['Age']>44) & (data['Age']<=54)),'Age_bucket'] = '45-54 Years'
data.loc[((data['Age']>54) & (data['Age']<=59)),'Age_bucket'] = '55-59 Years'
data.loc[((data['Age']>59) & (data['Age']<=64)),'Age_bucket'] = '60-64 Years'
data.loc[(data['Age'] == 65),'Age_bucket'] = '65 Years'
data.loc[(data['Age'] == 66),'Age_bucket'] = '66 Years'
data.loc[(data['Age'] == 67),'Age_bucket'] = '67 Years'
data.loc[(data['Age'] == 68),'Age_bucket'] = '68 Years'
data.loc[(data['Age'] == 69),'Age_bucket'] = '69 Years'
data.loc[((data['Age']>69) & (data['Age']<=74)),'Age_bucket'] = '70-74 Years'
data.loc[((data['Age']>74) & (data['Age']<=79)),'Age_bucket'] = '75-79 Years'
data.loc[((data['Age']>79) & (data['Age']<=84)),'Age_bucket'] = '80-84 Years'
data.loc[((data['Age']>84) & (data['Age']<=89)),'Age_bucket'] = '85-89 Years'
data.loc[((data['Age']>89) & (data['Age']<=94)),'Age_bucket'] = '90-94 Years'
#Open-ended top bucket starts right after 94 so that age 95 itself is included
#(the previous '> 95' test left 95-year-olds without any bucket).
data.loc[(data['Age']>94),'Age_bucket'] = '95 Years or Over'

In [14]:
#Creating variable 'BMI', where BMI = weight (kg) / height^2 (m^2).
#'Height (cms)' is in centimetres, so it is converted to metres (*0.01) inline;
#no temporary 'Height' column is left behind on data.
data['BMI'] = (data['Weight (Kgs)'])/((data['Height (cms)'] *0.01)**2)
data['BMI'] = data['BMI'].round(1)
#Create 'BMI Status' variable in data based on the rounded 'BMI'.
#The three bands are contiguous: < 25, [25, 30), >= 30. The middle band is
#lower-inclusive so that a BMI of exactly 25.0 is classified instead of being
#left without a status (and then lost in the merge below).
#NOTE(review): every BMI below 25 is labelled 'Underweight' (no separate normal
#band) and the overweight label is spelled with a leading digit zero. Both
#strings are kept unchanged because other cells may compare against them.
data.loc[data['BMI']>=30,'BMI Status'] = 'Obese'
data.loc[data['BMI']< 25,'BMI Status'] = 'Underweight'
data.loc[((data['BMI']>=25) & (data['BMI']< 30)),'BMI Status'] = '0verweight'
#Creating variable HCC_Code with BMI to find HCC Score on the basis of BMI.
#Only the obese and overweight bands receive a code; other rows stay NaN.
data.loc[(data['BMI Status'] == 'Obese'),'HCC_Code with BMI'] = 'HCC22'
data.loc[(data['BMI Status'] == '0verweight'),'HCC_Code with BMI'] = 'HCC23'
#Fetch 'HCC Score' corresponding to 'HCC Code' Respective to BMI status.
#NOTE(review): how='inner' keeps only rows whose 'HCC_Code with BMI' matches an
#'HCC_Code' in HCC_score_df, so rows without a code (BMI < 25) are presumably
#dropped from data here - confirm that this exclusion is intended.
data = data.merge(HCC_score_df, how='inner', right_on = 'HCC_Code', left_on = 'HCC_Code with BMI')
#rename the 'HCC_Score' column with 'HCC_Score With BMI Status'
data =data.rename(columns={'HCC_Score':'HCC_Score With BMI Status'})
#delete the temporary columns 'HCC_code' and 'HCC_Code with BMI'
del data['HCC_Code']
del data['HCC_Code with BMI']

In [15]:
#Fetch Risk Score corresponding to 'Age' and 'Gender'
#(inner join: rows without a matching ('Age_bucket', 'Gender') pair are dropped)
age_score_df = data.merge(Age_Gender_Score, how='inner', on = ['Age_bucket','Gender'])

#Fetch Risk Score corresponding to 'Diagnosis code':
#first map diagnosis code -> HCC code -> HCC score, then attach it to each patient row
Diagnosis_score_df = Diagnosis_code_HCC.merge(HCC_score_df, how='inner', on = ['HCC_Code'])
final_df = age_score_df.merge(Diagnosis_score_df, how='inner', right_on = 'Diagnosis_Code', left_on = 'Diagnosis Code')

#Encode 'Smoking', 'Alcohol' and 'Family History' variables for final risk score computation:
#'Yes' -> 1, anything else (including missing values) -> 0
final_df['Smoking'] = (final_df['Smoking'] == 'Yes').astype('int64')
final_df['Alcohol'] = (final_df['Alcohol'] == 'Yes').astype('int64')
final_df['Family History'] = (final_df['Family History'] == 'Yes').astype('int64')

#For Risk_Score computation we are considering following factors:
#'Smoking','Alcohol','Family History','Total No. of Readmission','Age_Gender_Score','HCC_Score','HCC_Score With BMI Status'
#'Age','Gender','Diagnosis Code','Height (cms)','Weight (Kgs)' and 'BMI' are kept for reference only;
#they do not enter the formula directly.
#.copy() makes final_df an independent frame so the column assignments below
#do not raise SettingWithCopyWarning.
final_df = final_df[['Age','Gender','Diagnosis Code','Smoking','Alcohol','Family History','Total No. of Readmission','Height (cms)','Weight (Kgs)','BMI','Age_Gender_Score','HCC_Score','HCC_Score With BMI Status']].copy()
#calculating the new dimension 'Risk_Score' as a weighted sum of the factors
final_df['Risk_Score'] = (10*final_df['Smoking']) + (10*final_df['Alcohol']) + (20*final_df['Family History']) + (20*final_df['Total No. of Readmission']) + (20*final_df['Age_Gender_Score']) + (10*final_df['HCC_Score']) + (10*final_df['HCC_Score With BMI Status'])
#Normalizing the 'Risk Score' Column to [0, 1] (min-max scaling)
risk_score_min = final_df['Risk_Score'].min()
risk_score_range = final_df['Risk_Score'].max() - risk_score_min
if risk_score_range == 0:
    #All rows share the same raw score (e.g. a single row): dividing would give 0/0 = NaN,
    #so place every row at the bottom of the scale instead.
    final_df['Risk_Score'] = 0.0
else:
    final_df['Risk_Score'] = (final_df['Risk_Score'] - risk_score_min) / risk_score_range
#Categorizing the 'Risk_Score' to low, medium,high.
#Bands are upper-inclusive and contiguous: low <= 0.3 < medium <= 0.65 < high,
#so a score of exactly 0.65 is 'medium' rather than being left uncategorized.
final_df.loc[final_df['Risk_Score'] <= 0.3, 'Risk_Score_Category'] = 'low'
final_df.loc[((final_df['Risk_Score'] > 0.3) & (final_df['Risk_Score'] <= 0.65)) , 'Risk_Score_Category'] = 'medium'
final_df.loc[final_df['Risk_Score'] > 0.65, 'Risk_Score_Category'] = 'high'
print(final_df.sort_values('Risk_Score').reset_index(drop = True))


   Age  Gender Diagnosis Code  Smoking  Alcohol  Family History  \
0   52    Male          E11.8        0        0               0   
1   44    Male       E11.3513        1        1               0   
2   61    Male          E11.8        0        0               1   
3   79    Male          E11.5        0        0               1   
4   59    Male          E11.9        0        1               0   
5   49    Male          E11.9        1        1               0   
6   74    Male         E11.63        1        1               0   
7   38  Female         E11.62        1        1               1   
8   37  Female          E11.9        1        1               0   
9   72    Male         E11.63        1        1               1   

   Total No. of Readmission  Height (cms)  Weight (Kgs)   BMI  \
0                         0           169            89  31.2   
1                         0           174            76  25.1   
2                         0           182           105  31.7   
3 

In [33]:
#Order the rows from lowest to highest 'Risk_Score' ...
final_df = final_df.sort_values(by='Risk_Score', ascending=True)
#... and renumber them 0..n-1, discarding the pre-sort index
final_df = final_df.reset_index(drop=True)

In [34]:
#Display the final table (one row per patient, with 'Risk_Score' and 'Risk_Score_Category')
final_df

Unnamed: 0,Age,Gender,Diagnosis Code,Smoking,Alcohol,Family History,Total No. of Readmission,Height (cms),Weight (Kgs),BMI,Age_Gender_Score,HCC_Score,HCC_Score With BMI Status,Risk_Score,Risk_Score_Category
0,52,Male,E11.8,0,0,0,0,169,89,31.2,0.834,0.318,0.273,0.0,low
1,44,Male,E11.3513,1,1,0,0,174,76,25.1,0.665,0.318,0.228,0.163598,low
2,61,Male,E11.8,0,0,1,0,182,105,31.7,0.923,0.318,0.273,0.220356,low
3,79,Male,E11.5,0,0,1,0,177,102,32.6,1.04,0.318,0.273,0.244031,low
4,59,Male,E11.9,0,1,0,1,183,98,29.3,0.889,0.104,0.228,0.288446,low
5,49,Male,E11.9,1,1,0,1,171,74,25.3,0.834,0.104,0.228,0.37849,medium
6,74,Male,E11.63,1,1,0,1,177,81,25.9,0.776,0.318,0.228,0.388406,medium
7,38,Female,E11.62,1,1,1,0,161,79,30.5,0.936,0.318,0.273,0.425334,medium
8,37,Female,E11.9,1,1,0,2,155,81,33.7,0.936,0.104,0.273,0.60603,medium
9,72,Male,E11.63,1,1,1,3,176,97,31.3,0.776,0.318,0.273,1.0,high
