### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Reading Files

In [None]:
# Load the four input tables; the names on the left line up, in order, with
# the file stems on the right.
df_bill_amt, df_demographics, df_bill_id, df_clinical_data = (
    pd.read_csv(f"../input/sukouz-equnix/{stem}.csv")
    for stem in ("bill_amount", "demographics", "bill_id", "clinical_data")
)

## Quick look at the files

In [None]:
# Preview the first rows of the bill-amount table.
df_bill_amt.head()

In [None]:
# Preview the first rows of the demographics table.
df_demographics.head()

In [None]:
# Preview the first rows of the bill-id table.
df_bill_id.head()

In [None]:
# Preview the first rows of the clinical-data table.
df_clinical_data.head()

## Quick look at the target 

In [None]:
# Histogram of the raw bill amounts (the target); the author notes it is skewed.
df_bill_amt["amount"].hist(bins=25, figsize=(10, 8), color='orange', edgecolor='black')

In [None]:
# Non-missing value count for every column of the bill-id table.
df_bill_id.count()

In [None]:
# Number of distinct patients that appear in the bill-id table.
df_bill_id["patient_id"].nunique()

In [None]:
# Another look at the first rows of the clinical-data table.
df_clinical_data.head()

In [None]:
# Number of distinct values in the clinical table's "id" column.
df_clinical_data["id"].nunique()

In [None]:
# (rows, columns) of the clinical table, for comparison with the distinct-id count.
df_clinical_data.shape

## Checking that all IDs are present in all files

In [None]:
# True when the clinical table and the bill-id table hold exactly the same set of patient ids.
clinical_ids = set(df_clinical_data["id"])
billed_patient_ids = set(df_bill_id["patient_id"])
clinical_ids == billed_patient_ids

In [None]:
# True when the bill-id table and the bill-amount table hold exactly the same set of bill ids.
indexed_bill_ids = set(df_bill_id["bill_id"])
priced_bill_ids = set(df_bill_amt["bill_id"])
indexed_bill_ids == priced_bill_ids

In [None]:
# True when the demographics table and the bill-id table hold exactly the same set of patient ids.
demographic_ids = set(df_demographics["patient_id"])
billed_ids = set(df_bill_id["patient_id"])
demographic_ids == billed_ids

## Renaming "id"  to "patient_id"

In [None]:
# Give the clinical table the same patient key name as the other tables so the
# merges can join on it.
df_clinical_data = df_clinical_data.rename(columns={'id': 'patient_id'})

## Preparing flat file by merging the files

In [None]:
# Build the flat table with three inner joins:
#   1. bills      + clinical data  on (patient_id, date_of_admission)
#   2. the result + bill amounts   on bill_id
#   3. the result + demographics   on patient_id
# An inner join keeps only rows that find a partner on both sides.
admission_key = ['patient_id', 'date_of_admission']

df_bill_clinical = df_bill_id.merge(df_clinical_data, on=admission_key, how='inner')

df_bill_cinical_amt = df_bill_clinical.merge(df_bill_amt, on='bill_id', how='inner')

df_final = df_bill_cinical_amt.merge(df_demographics, on='patient_id', how='inner')

### Converting date time columns

In [None]:
# Parse the three date columns as datetimes using the year-month-day format.
for date_col in ("date_of_admission", "date_of_discharge", "date_of_birth"):
    df_final[date_col] = pd.to_datetime(df_final[date_col], format="%Y-%m-%d")

In [None]:
# Non-missing value count for every column of the merged table.
df_final.count()   

In [None]:
# Missing-value count per column, showing only the columns that have at least one.
na_counts = df_final.isna().sum()
na_counts[na_counts != 0]

## Columns Having Missing Values

#### Looking at the counts above, we can identify the columns that have missing values

In [None]:
# Columns with missing values; they need to be looked at separately.
list_missing_data_columns = [
    "medical_history_2",
    "medical_history_5",
]

In [None]:
%%javascript
// Replace the output area's _should_scroll hook with one that always answers false.
// NOTE(review): presumably this stops long cell outputs (the many plots below)
// from being collapsed into a scroll box; it appears to rely on a private
// notebook front-end API, so confirm it still applies to the front end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

## Quick EDA

In [None]:
# Plot every column after the first two:
#   * fewer than 20 distinct values -> bar chart of each value's share of rows (%)
#   * otherwise, non-object dtype    -> 25-bin histogram
#   * otherwise (object dtype)       -> not plotted, name collected below
# Re-running this cell after the cleaning steps plots the cleaned df_final.

list_sepearte_columns = []  # object columns with 20+ distinct values (left unplotted)

count = 0  # how many columns were plotted

for col in df_final.columns[2:]:
    series = df_final[col]

    if series.nunique() < 20:
        shares = series.value_counts() * 100 / len(series)
        shares.plot(kind='bar')
        plt.title(col + " (Count Plot)")
        plt.ylabel("Frequency (%)")
        plt.xlabel("Unique Values")
        plt.show()
        count += 1
        continue

    if series.dtype == 'O':
        list_sepearte_columns.append(col)
        continue

    series.hist(color='orange', edgecolor='black', figsize=(10, 8), bins=25)
    plt.ylabel("Frequency (count)")
    plt.xlabel(col)
    plt.title("Histogram")
    plt.show()
    count += 1

            

In [None]:
# Columns the EDA loop could not plot; an empty list means every column was plotted.
list_sepearte_columns  # Thus we see we plotted all columns

# Data Cleaning

### A look into the columns having missing values

#### medical_history_2 and medical_history_5 have missing values

In [None]:
# Fraction of rows in which medical_history_2 is not missing.
history_2 = df_final["medical_history_2"]
history_2.count() / len(history_2)


In [None]:
# Fraction of rows in which medical_history_5 is not missing. Numerator and
# denominator are both taken from medical_history_5 itself.
history_5 = df_final["medical_history_5"]
history_5.count() / len(history_5)

In [None]:
# Number of rows where medical_history_5 is missing.
df_final["medical_history_5"].isna().sum()

In [None]:
# Number of rows where medical_history_2 is missing.
df_final["medical_history_2"].isna().sum()

### In the above columns, around 7-9% of the rows are missing

### The above two variables can either be imputed or be given a third category, "missing". Instead of imputing, I will assign "missing" as another category. One-hot encoding can then be used to get the feature representation for the model; we will do this during model building.

In [None]:
# Treat a missing answer as its own category instead of imputing a value.
for history_col in ("medical_history_2", "medical_history_5"):
    df_final[history_col] = df_final[history_col].fillna("missing")

### Treatment for "medical_history_3" column

In [None]:
# Inspect the raw labels used in medical_history_3 before recoding them.
df_final.medical_history_3.value_counts()

### Looking at the distribution and counts, "No" has a count comparable to 0 and "Yes" has a count comparable to 1. Using common sense, we therefore replace "Yes" with 1 and "No" with 0, although it is better to confirm this with the business.

In [None]:
 # Rewrite "Yes"/"No" as the digit strings '1'/'0', then cast the whole column to int8.
 df_final["medical_history_3"] = (
     df_final["medical_history_3"].replace({"Yes": '1', "No": '0'}).astype('int8')
 )

In [None]:
 # Re-check the value counts of medical_history_3 after the recoding.
 df_final.medical_history_3.value_counts()

### Correcting the gender, race and resident_status columns

In [None]:
# Inspect the raw gender labels before recoding them.
df_final.gender.value_counts()

In [None]:
 # Both spellings of each gender map to one code (male -> 1, female -> 0);
 # the column is then stored as int8.
 gender_codes = {"m": 1, "f": 0, 'Male': 1, 'Female': 0}
 df_final["gender"] = df_final["gender"].replace(gender_codes).astype('int8')

In [None]:
# Re-check the gender values after the recoding (corrected).
df_final.gender.value_counts()  ## corrected

In [None]:
# Inspect the raw race labels before normalising their spelling.
df_final.race.value_counts()

In [None]:
# Spelling variants of each race label -> the single spelling to keep.
mapper = {
    # Chinese
    'Chinese': 'Chinese',
    'chinese': 'Chinese',
    # Indian
    'Indian': 'Indian',
    'India': 'Indian',
    # labels that already have one spelling
    'Malay': 'Malay',
    'Others': 'Others',
}

In [None]:
# replace() rewrites only the labels listed in `mapper` and leaves every other
# value untouched, whereas map() turns any label absent from the dict into NaN
# (which would blank the whole column if `mapper` held another column's labels).
# This also matches how resident_status is normalised.
df_final["race"] = df_final["race"].replace(mapper)

In [None]:
# Re-check the race labels after normalising them (corrected).
df_final.race.value_counts()  ## corrected

In [None]:
# Inspect the raw resident_status labels before normalising them.
df_final.resident_status.value_counts()

In [None]:
# Resident-status labels -> the single label to keep for each status.
mapper = {
    # two labels used for citizens
    'Singaporean': 'Singaporean',
    'Singapore citizen': 'Singaporean',
    # labels that already have one spelling
    'PR': 'PR',
    'Foreigner': 'Foreigner',
}

In [None]:
# Rewrite the labels listed in `mapper`; values not listed are left as they are.
df_final["resident_status"] = df_final["resident_status"].replace(mapper)

In [None]:
# Re-check the resident_status labels after normalising them (corrected).
df_final.resident_status.value_counts() ## corrected

# Feature Engineering

In [None]:
# Make sure the date columns are datetimes before deriving features from them,
# so this section can be run on its own.
date_format = "%Y-%m-%d"
for column_name in ("date_of_admission", "date_of_discharge", "date_of_birth"):
    df_final[column_name] = pd.to_datetime(df_final[column_name], format=date_format)

### Getting the age of the patients

In [None]:
# Age at admission in years: whole days between birth and admission divided by
# 365 (leap days are not accounted for, so this is a close approximation).
days_since_birth = (df_final["date_of_admission"] - df_final["date_of_birth"]).dt.days
df_final['Age'] = days_since_birth / 365

In [None]:
# Distribution of the derived age; the author judged it to look okay.
age_values = df_final['Age']
age_values.hist(bins=25, figsize=(10, 8), color='orange', edgecolor='black')

### Number of days the patient has stayed

In [None]:
# Length of stay in whole days, from admission to discharge.
length_of_stay = df_final["date_of_discharge"] - df_final["date_of_admission"]
df_final['Stay_days'] = length_of_stay.dt.days

In [None]:
# Distribution of the length of stay.
stay_values = df_final['Stay_days']
stay_values.hist(bins=25, figsize=(10, 8), color='orange', edgecolor='black')

### BMI

In [None]:
# Body-mass index: weight divided by the square of (height / 100).
# NOTE(review): assumes weight is in kilograms and height in centimetres; confirm.
height_scaled = df_final["height"] / 100
df_final['BMI'] = df_final["weight"] / height_scaled ** 2

In [None]:
# Distribution of the derived BMI.
bmi_values = df_final['BMI']
bmi_values.hist(bins=25, figsize=(10, 8), color='orange', edgecolor='black')

### Seasonal Features

In [None]:
# Calendar features taken from the discharge date.
df_final['month'] = df_final['date_of_discharge'].dt.month
# Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is its replacement and yields the same ISO week number.
# isocalendar() returns a nullable UInt32 column, so cast back to a plain
# integer dtype as weekofyear produced.
# NOTE(review): the cast assumes date_of_discharge has no missing values.
df_final['weekofyear'] = df_final['date_of_discharge'].dt.isocalendar().week.astype('int64')

In [None]:
# Distribution of the discharge month.
month_values = df_final['month']
month_values.hist(bins=25, figsize=(10, 8), color='orange', edgecolor='black')

In [None]:
# Distribution of the discharge week of the year.
week_values = df_final['weekofyear']
week_values.hist(bins=25, figsize=(10, 8), color='orange', edgecolor='black')

In [None]:
# List all columns now present, as a reference for grouping the features below.
df_final.columns

In [None]:
# Categorical features: the patient key, the numbered history / medication /
# symptom flags, the demographic labels and the two calendar features.
cat_features = [
    'patient_id',
    *[f'medical_history_{k}' for k in range(1, 8)],
    *[f'preop_medication_{k}' for k in range(1, 7)],
    *[f'symptom_{k}' for k in range(1, 6)],
    'gender', 'race', 'resident_status',
    'month', 'weekofyear',
]

In [None]:
# Numerical features: the three lab results, the body measurements, the derived
# columns and, last, the target ('amount').
numerical_features = (
    [f'lab_result_{k}' for k in range(1, 4)]
    + ['weight', 'height', 'Stay_days', 'BMI', 'Age', 'amount']
)

In [None]:
# Number of entries in the categorical feature list.
len(cat_features)  # categorical features count

In [None]:
# Number of entries in the numerical feature list (the target is one of them).
len(numerical_features)  # numerical features count

###  Checking statistics of numerical columns

In [None]:
# Summary statistics for the numerical columns.
df_final[numerical_features].describe()

### Checking the correlation b/w numerical columns and target

In [None]:
# Pairwise correlations between the numerical columns; the 'amount' row/column
# shows each feature's correlation with the target.
df_final[numerical_features].corr()

### Plotting Heatmap of Correlations

In [None]:
# Annotated heatmap of the pairwise correlations between the numerical columns.
corr_matrix = df_final[numerical_features].corr()

plt.figure(figsize=(12, 8))
ax = sns.heatmap(
    corr_matrix,
    annot=True,
    cmap=plt.cm.Blues,
    linecolor='black',
    linewidths=1,
)

# Push each y-limit outwards by half a cell.
# NOTE(review): this looks like the usual workaround for the matplotlib 3.1.1
# regression that clipped the first and last heatmap rows; on releases without
# that bug it would add an empty half-row at each end. Confirm it is still needed.
y_first, y_second = ax.get_ylim()
ax.set_ylim(y_first + 0.5, y_second - 0.5)

plt.tight_layout()
plt.show()

#### Looking at the above numbers, there appears to be no linear correlation between the target and the numerical features (except Age). Thus a nonlinear model might do a better job.

## As we saw in the EDA, the target distribution is extremely skewed. Thus a log transformation (or a Box-Cox transformation) is a good idea to ensure the train/test split is less biased. Here we are doing a log transform.

In [None]:
# log(1 + amount): the added 1 keeps the transform defined when the amount is zero.
raw_amount = df_final["amount"]
df_final['target_transformed'] = np.log1p(raw_amount)

In [None]:
# Distribution of the log-transformed target.
log_target = df_final['target_transformed']
log_target.hist(bins=25, figsize=(10, 8), color='orange', edgecolor='black')

#### Binning the target to ensure a stratified split of the data into train and test

In [None]:
# Cut the log-transformed target into 5 equal-width intervals and store the
# interval labels as plain objects; the bins are meant for the train/test split.
target_intervals = pd.cut(df_final['target_transformed'], bins=5)
df_final['target_bins'] = target_intervals.astype('object')

In [None]:
# Number of rows that fall into each target bin.
df_final['target_bins'].value_counts()

### Removing columns not relevant for modeling

In [None]:
# The raw date columns are dropped in place; Age, Stay_days, month and
# weekofyear were derived from them earlier.
raw_date_columns = ['date_of_admission', 'date_of_discharge', 'date_of_birth']
df_final.drop(columns=raw_date_columns, inplace=True)

### Saving the flat file for model building 

In [None]:
# Write the flat table to the working directory, without the index column.
df_final.to_csv(path_or_buf="created_flat_file.csv", index=False)

### Remarks: Looking at the number of categorical features, and from past experience, CatBoost will be appropriate for this data, although we will build a linear model as a baseline.

In [None]:
# Columns that remain in the flat table after the date columns were dropped.
df_final.columns

## Analysing patient_id

In [None]:
# Distinct patient_id values as a percentage of the number of rows.
patient_ids = df_final["patient_id"]
unique_pct = patient_ids.nunique() * 100 / len(patient_ids)
print(f"Unique Percentage of patinet_id : {unique_pct}")

#### Given the repeating nature of patients, patient_id can be used for model building. In production, two models can be used: one without patient_id and one with patient_id. When a new patient comes in, the prediction from the appropriate model can be used. In this case I am going to build a model with patient_id as a feature.

# More EDA on cleaned Data

## Exploring columns having binary value and its effect on target  

We are looking at the distribution of the target for the different values of the binary columns (i.e. 0 and 1). In the plots below, blue is the distribution of the target when the feature value is 1 and orange is the distribution of the target when the feature value is 0. Here we are using the log-transformed distribution of the target because it is very difficult to distinguish the distributions of the original target values.

### If there is a significant difference between the distributions, the feature will be important in the model

In [None]:
# Columns treated as 0/1 flags. There are 19 of them, so the last figure has
# only one populated panel.
binary_features = [
    'medical_history_1', 'medical_history_2',
    'medical_history_3', 'medical_history_4', 'medical_history_5',
    'medical_history_6', 'medical_history_7', 'preop_medication_1',
    'preop_medication_2', 'preop_medication_3', 'preop_medication_4',
    'preop_medication_5', 'preop_medication_6', 'symptom_1', 'symptom_2',
    'symptom_3', 'symptom_4', 'symptom_5', 'gender',
]


def _plot_target_by_flag(ax, feature):
    """Overlay the log-target histograms for rows where `feature` is 1 and 0.

    Draws on `ax`: first the rows where the column equals 1, then, half
    transparent, the rows where it equals 0. Rows holding any other value
    (for example the "missing" category) match neither filter and are left out.
    """
    flag = df_final[feature]
    df_final.loc[flag == 1, 'target_transformed'].hist(density=True, ax=ax)
    df_final.loc[flag == 0, 'target_transformed'].hist(density=True, alpha=.5, ax=ax)
    ax.set_title("Histograms of Target for " + '"' + feature + '"' + " : 0 and 1")
    ax.set_xlabel("Log transformed target Values")
    ax.set_ylabel("Normalized Frequency")


# Two features per figure, side by side.
for start in range(0, len(binary_features), 2):
    pair = binary_features[start:start + 2]

    fig, axarr = plt.subplots(1, 2, figsize=(16, 8))
    for ax, feature in zip(axarr, pair):
        _plot_target_by_flag(ax, feature)

    # An odd number of features leaves the final right-hand panel without data;
    # hide it rather than indexing past the end of the list.
    for unused_ax in axarr[len(pair):]:
        unused_ax.set_visible(False)

    # tight_layout is a Figure method (Axes has none). Show every figure
    # explicitly instead of relying on an exception handler to do it.
    fig.tight_layout()
    plt.show()