In [1]:
import os
import sys
import numpy as np
import pandas as pd
#import pickle
import dill as pickle # https://stackoverflow.com/questions/25348532/can-python-pickle-lambda-functions
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import pyperclip
import joblib
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from utils import variables_mapping
from custom_transformers.transformers import PreProcessingTransformer, DropColumnsTransformer, CustomImputer, variables_mapping

In [2]:
# Load the exported medical-encounter records (model outputs plus any true labels received).
raw_csv_path = os.path.join("data", "medicalencounter_202202142036.csv")
df = pd.read_csv(raw_csv_path)

In [3]:
# Column dtypes of the raw export; note that `true_label` is loaded as object rather than bool.
df.dtypes

id                               int64
proba                          float64
prediction                        bool
true_label                      object
admission_id                   float64
patient_id                     float64
race                            object
gender                          object
age                             object
weight                          object
admission_type_code            float64
discharge_disposition_code     float64
admission_source_code            int64
time_in_hospital                 int64
payer_code                      object
medical_specialty               object
has_prosthesis                    bool
complete_vaccination_status     object
num_lab_procedures             float64
num_procedures                   int64
num_medications                float64
number_outpatient                int64
number_emergency                 int64
number_inpatient                 int64
diag_1                          object
diag_2                   

### Remove data without a true label

In [4]:
# How many observations have no true label recorded?
df['true_label'].isnull().sum()

200

In [5]:
# Keep only the observations whose true label is known.
df = df.dropna(subset=['true_label'])

### Calculate Metrics

In [6]:
# Boolean predictions exactly as stored in the export
# (presumably what the deployed model returned — confirm against the serving code).
y_pred = df['prediction'].astype('bool')

In [7]:
# Re-derive predictions from the stored probabilities with a 0.38 cut-off
# (strictly greater than), for comparison with the stored `prediction` column.
y_pred2 = df['proba'].gt(0.38)

In [8]:
# Ground truth as booleans. Rows with a missing label were removed earlier, which
# matters here because NaN would be cast to True.
# NOTE(review): assumes the remaining object values are real booleans, not strings —
# any non-empty string would also cast to True. Confirm against the CSV.
y_true = df['true_label'].astype('bool')

###### Deployed threshold

In [9]:
# Accuracy of the stored predictions against the true labels.
accuracy_score(y_true=y_true, y_pred=y_pred)

0.6806404044659785

In [10]:
# Precision of the stored predictions (share of predicted positives that are true positives).
precision_score(y_true=y_true, y_pred=y_pred)

0.18131510416666666

In [11]:
# Recall of the stored predictions (share of actual positives that were flagged).
recall_score(y_true=y_true, y_pred=y_pred)

0.5186219739292365

In [12]:
# F1 (harmonic mean of precision and recall) of the stored predictions.
f1_score(y_true=y_true, y_pred=y_pred)

0.2686927158707188

###### Correct threshold

In [13]:
# Accuracy when predictions are re-derived from `proba` with the 0.38 cut-off.
accuracy_score(y_true=y_true, y_pred=y_pred2)

0.45807878660206447

In [14]:
# Precision when predictions are re-derived from `proba` with the 0.38 cut-off.
precision_score(y_true=y_true, y_pred=y_pred2)

0.1433327492553005

In [15]:
# Recall when predictions are re-derived from `proba` with the 0.38 cut-off.
recall_score(y_true=y_true, y_pred=y_pred2)

0.7616387337057728

In [16]:
# F1 when predictions are re-derived from `proba` with the 0.38 cut-off.
f1_score(y_true=y_true, y_pred=y_pred2)

0.2412623506857396

### Remove extra columns and create raw data

In [17]:
# Turn the labelled export back into "raw" training-style data:
#   * expose the ground truth under the name `readmitted` (appended as the last column),
#   * drop the bookkeeping columns `id`, `proba`, `prediction` and the original `true_label`.
# `assign` returns a new frame instead of writing a column into `df`, which at this point
# is the result of a boolean filter; writing into it directly triggers pandas'
# SettingWithCopyWarning. The resulting columns and their order are unchanged.
df = (
    df
    .assign(readmitted=df['true_label'])
    .drop(columns=['id', 'proba', 'prediction', 'true_label'])
)

In [18]:
# Preview the first five rows of the relabelled frame.
df.head(n=5)

Unnamed: 0,admission_id,patient_id,race,gender,age,weight,admission_type_code,discharge_disposition_code,admission_source_code,time_in_hospital,...,blood_type,hemoglobin_level,blood_transfusion,max_glu_serum,A1Cresult,diuretics,insulin,change,diabetesMed,readmitted
0,100706.0,12224646.0,Caucasian,Male,[40-50),?,2.0,1.0,1,3,...,A+,14.2,False,,>8,No,Yes,No,Yes,False
1,99518.0,1020438.0,African American,Male,[60-70),?,2.0,18.0,2,2,...,O-,14.1,False,,,No,Yes,No,Yes,False
2,89397.0,141934014.0,AfricanAmerican,Female,[60-70),?,3.0,1.0,1,5,...,A+,13.4,False,NONE,,No,Yes,No,Yes,False
3,89653.0,168821964.0,Caucasian,Male,[50-60),?,3.0,1.0,1,1,...,O-,15.3,False,,,Yes,Yes,Ch,Yes,False
4,83278.0,6485868.0,Hispanic,Male,[30-40),?,1.0,3.0,7,14,...,O+,14.6,True,,,No,Yes,Ch,Yes,False


In [19]:
#df.to_csv('data/week1_raw.csv')

In [20]:
# Create profile report
#ProfileReport(df, title="Week 1 raw data").to_file("reports/week1_raw.html")

### Preprocess data

In [21]:
# Fetch the per-column mapping definitions used for preprocessing.
# NOTE(review): `variables_mapping` is imported twice in the import cell (from `utils` and
# from `custom_transformers.transformers`); the later import is the one bound here.
# Confirm both modules expose the same function and drop one of the imports.
known_categories = variables_mapping()

In [22]:
#df_clean = df.copy()
#for column in known_categories.keys():
#    if column != 'readmitted':
#        df_clean[column] = (df[column]
#                            .apply(lambda value: known_categories[column]['mapping'](value))
#                            .astype(known_categories[column]['type'])
#                           )

In [23]:
# The preprocessed frame was previously saved with
#   df_clean.to_csv('data/week1_preprocessed.csv')
# and is reloaded from disk here instead of being recomputed.
# NOTE(review): the reloaded frame carries an extra `Unnamed: 0` column (see the dtypes
# listing further down), presumably the index written by `to_csv`. Pass `index_col=0`
# if that column is not wanted downstream.
preprocessed_csv_path = os.path.join("data", "week1_preprocessed.csv")
df_clean = pd.read_csv(preprocessed_csv_path)

In [24]:
#ProfileReport(df_clean, title="Week 1 - Preprocessed data").to_file("reports/week1_preprocessed.html")

In [29]:
# Column dtypes after reloading the preprocessed CSV (`age` and `weight` are numeric here).
# NOTE(review): this cell's execution count is out of sequence with its neighbours;
# re-run the notebook top to bottom on a fresh kernel before trusting the output.
df_clean.dtypes

Unnamed: 0                       int64
admission_id                   float64
patient_id                     float64
race                            object
gender                          object
age                            float64
weight                         float64
admission_type_code             object
discharge_disposition_code      object
admission_source_code           object
time_in_hospital                 int64
payer_code                      object
medical_specialty               object
has_prosthesis                    bool
complete_vaccination_status     object
num_lab_procedures             float64
num_procedures                   int64
num_medications                float64
number_outpatient                int64
number_emergency                 int64
number_inpatient                 int64
diag_1                          object
diag_2                          object
diag_3                          object
number_diagnoses                 int64
blood_type               

### Compare with old data

In [25]:
# Load the older preprocessed dataset to compare against this week's data.
original_csv_path = os.path.join("data", "preprocessed.csv")
df_original = pd.read_csv(original_csv_path)

In [26]:
# Admission ids present in the old dataset.
df_original['admission_id'].values

array([    0,     1,     2, ..., 81409, 81410, 81411])

In [27]:
# Admission ids shared between the old dataset and the new preprocessed data
# (an empty set means the two datasets do not overlap on this key).
original_admission_ids = set(df_original['admission_id'].values)
original_admission_ids.intersection(df_clean['admission_id'].values)

set()

In [31]:
# Distinct raw `medical_specialty` values in the new data (includes the '?' placeholder).
df['medical_specialty'].unique()

array(['Endocrinology', '?', 'Family/GeneralPractice', 'Radiologist',
       'Orthopedics', 'InternalMedicine', 'Emergency/Trauma',
       'Pulmonology', 'ObstetricsandGynecology', 'Surgeon', 'Cardiology',
       'Surgery-General', 'Pediatrics', 'Nephrology',
       'PhysicalMedicineandRehabilitation', 'Gastroenterology',
       'Surgery-Cardiovascular/Thoracic', 'Orthopedics-Reconstructive',
       'Hematology/Oncology', 'Surgery-Neuro', 'Psychiatry', 'Neurology',
       'Surgery-Vascular', 'Urology', 'Psychology', 'Hematology',
       'Surgery-Thoracic', 'Surgery-Cardiovascular',
       'Pediatrics-Endocrinology', 'Rheumatology', 'Oncology',
       'Pediatrics-Pulmonology', 'Otolaryngology', 'InfectiousDiseases',
       'DCPTEAM', 'Podiatry', 'Gynecology', 'Radiology', 'Hospitalist',
       'Surgery-Plastic', 'Surgery-PlasticwithinHeadandNeck',
       'Pediatrics-Neurology', 'SurgicalSpecialty', 'Osteopath',
       'Obstetrics', 'Anesthesiology', 'Ophthalmology',
       'Pediatrics-C