In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import pyperclip
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
#from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from utils import variables_mapping

In [2]:
# Load the raw training data from data/train_data.csv (path is relative to the working directory).
df = pd.read_csv(os.path.join("data", "train_data.csv"))

In [3]:
# Per-column specs from utils; the cells below index them as
# known_categories[column]['mapping'] (a callable) and known_categories[column]['type'].
known_categories = variables_mapping()



In [4]:
# Spot check: the blood_type mapping turns the raw value 'o+' into 'O+' (see the output below).
known_categories['blood_type']['mapping']('o+')

'O+'

## train/test split

In [5]:
# Feature columns, in the order the model expects them.
# The target column ('readmitted') is deliberately not part of this list.
ordered_columns = [
    'admission_id',
    'patient_id',
    'race',
    'gender',
    'age',
    'weight',
    'admission_type_code',
    'discharge_disposition_code',
    'admission_source_code',
    'time_in_hospital',
    'payer_code',
    'medical_specialty',
    'has_prosthesis',
    'complete_vaccination_status',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'diag_1',
    'diag_2',
    'diag_3',
    'number_diagnoses',
    'blood_type',
    'hemoglobin_level',
    'blood_transfusion',
    'max_glu_serum',
    'A1Cresult',
    'diuretics',
    'insulin',
    'change',
    'diabetesMed',
]

In [6]:
def create_target(df):
    """
    Split a raw dataframe into features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``ordered_columns`` plus a
        ``'readmitted'`` column.

    Returns
    -------
    (X, y)
        ``X`` holds the ``ordered_columns`` columns of ``df`` in that order.
        ``y`` is ``df['readmitted']`` passed value by value through the
        'readmitted' mapping from ``variables_mapping()`` and then cast to the
        dtype declared for it there.
    """
    mapping_table = variables_mapping()
    features = df[ordered_columns]

    # Map each raw value first, then cast the whole series in one go.
    mapped = df['readmitted'].apply(
        lambda raw: mapping_table['readmitted']['mapping'](raw)
    )
    target = mapped.astype(mapping_table['readmitted']['type'])

    return features, target

In [7]:
# Build features/target from the raw frame, then hold out 20% of the rows for evaluation.
# random_state is fixed so the split is reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the classes are imbalanced.
X, y = create_target(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Modelling

In [8]:
class PreProcessingTransformer(TransformerMixin, BaseEstimator):
    """
    Applies the preprocessing logic to all columns.

    For every column described by ``variables_mapping()`` (except the target,
    'readmitted'), each raw value is passed through that column's 'mapping'
    callable and the column is then cast to the column's declared 'type'.
    Per the mapping functions in utils, unknown/unexpected values are
    converted into None.

    The transformer is stateless: ``fit`` learns nothing from the data.

    Inheriting from ``BaseEstimator`` (in addition to ``TransformerMixin``)
    provides ``get_params``/``set_params``, which ``sklearn.base.clone`` — and
    therefore cross-validation and grid-search helpers — require.
    """

    def __init__(self):
        self.known_categories = variables_mapping()
        # The target is handled separately (see create_target); it is not a feature.
        self.known_categories.pop('readmitted', None)

    def fit(self, *_):
        """Nothing to learn; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X, *_):
        """
        Return a cleaned copy of ``X``; ``X`` itself is left untouched.

        Raises
        ------
        KeyError
            If ``X`` lacks a column that has an entry in the mapping table.
        """
        X_clean = X.copy()
        for column, spec in self.known_categories.items():
            # Look the spec up once per column instead of once per value.
            convert = spec['mapping']
            X_clean[column] = X[column].apply(convert).astype(spec['type'])

        return X_clean

In [9]:
class DropColumnsTransformer(TransformerMixin, BaseEstimator):
    """
    Removes columns that won't be used by the predictive model.

    Parameters
    ----------
    columns : str or iterable of str, optional
        Column name(s) to drop. When omitted, ``DEFAULT_COLUMNS`` is used,
        which reproduces the original hard-coded behaviour.

    Inheriting from ``BaseEstimator`` provides ``get_params``/``set_params``
    so the step can be cloned and tuned (e.g. in cross-validation or a grid
    search over ``columns``).
    """

    # Identifiers and columns excluded from modelling.
    # 'change' and 'diabetesMed' were noted as further candidates for removal.
    DEFAULT_COLUMNS = (
        'admission_id',
        'patient_id',
        'weight',
        'max_glu_serum',
        'insulin',
    )

    def __init__(self, columns=None):
        # Stored unmodified, as scikit-learn expects of constructor parameters.
        self.columns = columns

    def fit(self, *_):
        """Nothing to learn; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X, *_):
        """
        Return a new dataframe without the configured columns.

        ``DataFrame.drop`` already returns a new object, so ``X`` is not modified.

        Raises
        ------
        KeyError
            If any of the columns to drop is missing from ``X``.
        """
        # TODO: drop training rows that have a given 'discharge_disposition_code'?
        selected = self.DEFAULT_COLUMNS if self.columns is None else self.columns
        to_drop = [selected] if isinstance(selected, str) else list(selected)

        return X.drop(to_drop, axis=1)

In [10]:
class CustomImputer(TransformerMixin, BaseEstimator):
    """
    Mean-imputes the numeric columns of a dataframe, one ``SimpleImputer`` per column.

    ``fit`` selects the columns whose dtype is "Int64" or "float64" and fits a
    mean imputer on each. ``transform`` applies those imputers to the same
    columns and leaves every other column untouched. Categorical columns are
    not imputed (a 'most_frequent' strategy was considered but is not enabled).

    Inheriting from ``BaseEstimator`` provides ``get_params``/``set_params``,
    which ``sklearn.base.clone`` requires.

    Attributes
    ----------
    dict_ : dict
        Maps column name -> fitted ``SimpleImputer``. Empty until ``fit`` is
        called, in which case ``transform`` returns an unchanged copy.
    """

    def __init__(self):
        self.dict_ = {}

    def fit(self, X, *_):
        """Fit one mean imputer per numeric column of ``X`` and return ``self``."""
        # Build a fresh mapping so that refitting on data with different
        # columns does not keep imputers from a previous fit around.
        fitted = {}
        for numeric_column in X.select_dtypes(include=["Int64", "float64"]).columns:
            fitted[numeric_column] = SimpleImputer(strategy='mean').fit(X[[numeric_column]])
        self.dict_ = fitted

        return self

    def transform(self, X, *_):
        """Return a copy of ``X`` with the fitted columns imputed; ``X`` is not modified."""
        X_ = X.copy()
        for column, imputer in self.dict_.items():
            X_[column] = imputer.transform(X[[column]])

        return X_
            

In [22]:
# End-to-end model: clean raw values -> drop unused columns -> mean-impute numeric columns
# -> one-hot encode -> random forest (seeded for reproducibility).
# NOTE(review): OneHotEncoder is applied to every column that reaches it, including the
# numeric ones that were just imputed — confirm this is intended, or restrict the encoder
# to the categorical columns.
pipeline = make_pipeline(
    PreProcessingTransformer(),
    DropColumnsTransformer(),
    CustomImputer(),
    OneHotEncoder(handle_unknown='ignore'),
    RandomForestClassifier(random_state=42)
    #LogisticRegression(random_state=42)
)

In [23]:
# Fit every step of the pipeline on the training split only.
pipeline.fit(X_train, y_train)


Pipeline(steps=[('preprocessingtransformer',
                 <__main__.PreProcessingTransformer object at 0x124aecb50>),
                ('dropcolumnstransformer',
                 <__main__.DropColumnsTransformer object at 0x124af0d10>),
                ('customimputer',
                 <__main__.CustomImputer object at 0x124af0d50>),
                ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=42))])

In [13]:
#result = pipeline.transform(X_train)

In [None]:
# Class labels of the fitted classifier; the columns of predict_proba follow this order.
pipeline.classes_

In [37]:
# Predicted probability of the second class for each test row
# (pipeline.classes_ is [False, True], so column 1 is the True class).
# Note that y_pred holds probabilities, not hard labels.
y_pred = pipeline.predict_proba(X_test)[:,1]

In [38]:
# Confirm the class order so that column 1 of predict_proba can be read as P(True).
pipeline.classes_

array([False,  True])

In [39]:
# Turn the probabilities into hard labels with a 0.5 cutoff.
y_pred > 0.5

array([False, False, False, ..., False, False, False])

In [40]:
# Precision on the test split at the 0.5 cutoff.
precision_score(y_test, y_pred > 0.5)

0.8333333333333334

In [41]:
# Recall on the test split at the 0.5 cutoff; the value below (~0.003) shows that
# almost no positive cases are caught at this threshold.
recall_score(y_test, y_pred > 0.5)

0.002733734281027884

In [None]:
# TODO:
# 1. Balance the data so that it is 50-50 between readmitted and not readmitted (drop the corrupted rows, plus whatever else is needed)
# 2. Write code to find the best threshold
# 3. Check for discrimination; adapt the function below
# 4. Try several different models and pick the best one


In [None]:
def verify_no_discrimination(X_test, y_true, y_pred, sensitive_column='SubjectRaceCode', max_diff=0.05,
                             group_column='Department Name'):
    """
    Verifies that, within every group, precision does not differ between the
    classes of a sensitive attribute by more than ``max_diff``.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Must contain ``sensitive_column`` and ``group_column``.
    y_true, y_pred
        True and predicted labels, indexable by a boolean mask built from ``X_test``.
    sensitive_column : str
        Column holding the protected attribute.
    max_diff : float
        Largest tolerated gap between the best and worst per-class precision
        inside one group.
    group_column : str
        Column whose distinct values define the groups that are checked
        independently (defaults to the original 'Department Name').

    Returns
    -------
    (is_satisfied, problematic_groups, good_groups)
        ``is_satisfied`` is False as soon as one group exceeds ``max_diff``.
        Both lists hold ``(group, diff, precisions)`` tuples, where
        ``precisions`` maps sensitive class -> precision.

    Groups for which no sensitive class has any row are skipped. This happens
    for a missing (NaN) group value, which never compares equal to itself, and
    previously raised ``ValueError`` when taking the max of an empty list.
    """
    groups = X_test[group_column].unique()
    sensitive_classes = X_test[sensitive_column].unique()

    is_satisfied = True
    problematic_groups = []
    good_groups = []
    for group in groups:
        # Group membership does not depend on the sensitive class: compute it once.
        in_group = X_test[group_column] == group

        precisions = {}
        for sensitive_class in sensitive_classes:
            mask = (X_test[sensitive_column] == sensitive_class) & in_group
            if mask.sum():
                precisions[sensitive_class] = precision_score(y_true[mask], y_pred[mask], pos_label=1)

        if not precisions:
            # Nothing to compare for this group.
            continue

        diff = max(precisions.values()) - min(precisions.values())
        if diff > max_diff:
            is_satisfied = False
            problematic_groups.append((group, diff, precisions))
        else:
            good_groups.append((group, diff, precisions))

    return is_satisfied, problematic_groups, good_groups
