In [1]:
#libraries needed
import pandas as pd
import numpy as np

#scikit
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn import preprocessing
from sklearn import metrics

In [2]:
df = pd.read_csv("cleaned_data_v5.csv", encoding="Latin-1")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99492 entries, 0 to 99491
Data columns (total 38 columns):
Unnamed: 0                       99492 non-null int64
race                             99492 non-null object
gender                           99492 non-null object
age                              99492 non-null object
days_in_hospital                 99492 non-null int64
num_lab_procedures               99492 non-null int64
num_not_lab_procedures           99492 non-null int64
num_current_medications          99492 non-null int64
num_outpatient_appointments      99492 non-null int64
num_previous_emergencies         99492 non-null int64
num_inpatient_overnight_stays    99492 non-null int64
number_diagnoses                 99492 non-null int64
metformin                        99492 non-null object
repaglinide                      99492 non-null object
nateglinide                      99492 non-null object
chlorpropamide                   99492 non-null object
glimepiride         

In [4]:
df = df.drop(['Unnamed: 0'], axis=1)

In [5]:
#renaming 'age'
#[70-80) -> 70
#[60-70) -> 60
#[50-60) -> 90
#[80-90) -> 80
#[40-50) -> 40
#[30-40) -> 30
#[90-100) -> 90
#[20-30) -> 20
#[10-20) -> 10
#[0-10)-> 0
age_mapping = {'[70-80)': 70, '[60-70)': 60, '[50-60)':90, '[80-90)':80, '[40-50)': 40, '[30-40)': 30,'[90-100)': 90, '[20-30)': 20, '[10-20)': 10, '[0-10)': 0}
df.age = df.age.replace(age_mapping)

In [6]:
list_of_meds = ['metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone']

# renaming values in medicines
# change in dosage('dosage_increased', 'dosage_decreased') -> 1
# no change or not prescribed 'not_prescribed', no_change_dosage -> 0
new_values_meds = {"dosage_increased":3, "dosage_decreased":1 ,"no_change_dosage":2, "not_prescribed":0}
for medicine in list_of_meds:
    df[medicine] = df[medicine].replace(new_values_meds)

In [7]:
# renaming 'readmitted'
# readmitted ('>30', '<30') -> 1
# not readmitted  ('NO') -> 0
df.readmitted = df.readmitted.replace({'NO': 0, '>30': 2, '<30':1})

In [8]:
# renaming 'diabetes_med_prescribed'
# 'Yes' -> 1 
# 'No' -> 0
df = df.rename(columns={'diabates_med_prescribed': 'diabetes_med_prescribed'})
df.diabetes_med_prescribed = df.diabetes_med_prescribed.replace({'Yes': 1, 'No':0})

In [9]:
# renaming 'change_in_meds'
# 'Ch' -> 1
# 'No' -> 0
df.change_in_meds = df.change_in_meds.replace({'Ch': 1, 'No': 0})

Using Label Encoder for gender and race

In [10]:
#before
print(df.race.value_counts())
print(df.gender.value_counts())

Caucasian          76099
AfricanAmerican    19210
Hispanic            2037
Other               1505
Asian                641
Name: race, dtype: int64
Female    53575
Male      45917
Name: gender, dtype: int64


In [11]:
x = df.columns.values
le_gender = preprocessing.LabelEncoder()
le_race = preprocessing.LabelEncoder()
x_gender = le_gender.fit_transform(df.gender.values)
x_race = le_race.fit_transform(df.race.values)
df['gender'] = x_gender
df['race'] = x_race

In [12]:
def value_counter(dataset):
    for index in dataset.columns:
        print("---- Index: " + index + "----")
        print(dataset[index].value_counts(ascending=True))
value_counter(df)

---- Index: race----
1      641
4     1505
3     2037
0    19210
2    76099
Name: race, dtype: int64
---- Index: gender----
1    45917
0    53575
Name: gender, dtype: int64
---- Index: age----
0       160
10      682
20     1611
30     3699
40     9465
80    16800
90    19619
60    21988
70    25468
Name: age, dtype: int64
---- Index: days_in_hospital----
14     1018
13     1189
12     1427
11     1816
10     2291
9      2940
8      4297
7      5720
6      7366
5      9763
4     13605
1     13877
2     16841
3     17342
Name: days_in_hospital, dtype: int64
---- Index: num_lab_procedures----
121       1
120       1
129       1
118       1
107       1
132       1
126       1
114       2
104       3
113       3
111       3
108       4
109       4
106       5
105       6
103       6
102       8
99        9
100      13
101      13
98       25
96       28
97       31
95       44
94       45
92       46
93       56
91       61
90       65
89       71
       ... 
63     1418
60     1561
59    

In [13]:
#discretising number_diagnoses
df['number_diagnoses'] = np.where(df['number_diagnoses'].between(0,4), 0, df['number_diagnoses'])
df['number_diagnoses'] = np.where(df['number_diagnoses'].between(5,8), 1, df['number_diagnoses'])
df['number_diagnoses'] = np.where(df['number_diagnoses'].between(9,12), 2, df['number_diagnoses'])
df['number_diagnoses'] = np.where(df['number_diagnoses'] > 12, 3, df['number_diagnoses'])

#Source for all masking:
# https://stackoverflow.com/questions/46168450/replace-a-specific-range-of-values-in-a-pandas-dataframe

In [14]:
#discretising num_lab_procedures
df['num_lab_procedures'] = np.where(df['num_lab_procedures'].between(0,20), 0, df['num_lab_procedures'])
df['num_lab_procedures'] = np.where(df['num_lab_procedures'].between(21,40), 1, df['num_lab_procedures'])
df['num_lab_procedures'] = np.where(df['num_lab_procedures'].between(41,60), 2, df['num_lab_procedures'])
df['num_lab_procedures'] = np.where(df['num_lab_procedures'].between(61,80), 3, df['num_lab_procedures'])
df['num_lab_procedures'] = np.where(df['num_lab_procedures'] > 80, 4, df['num_lab_procedures'])

In [15]:
# discretising days_in_hospital
df['days_in_hospital'] = np.where(df['days_in_hospital'].between(0,10), 0, df['days_in_hospital'])
df['days_in_hospital'] = np.where(df['days_in_hospital'] > 10, 1, df['days_in_hospital'])

In [16]:
# discretising num_current_medications
df['num_current_medications'] = np.where(df['num_current_medications'].between(0,20), 0, df['num_current_medications'])
df['num_current_medications'] = np.where(df['num_current_medications'].between(21,40), 1, df['num_current_medications'])
df['num_current_medications'] = np.where(df['num_current_medications'].between(41,60), 2, df['num_current_medications'])
df['num_current_medications'] = np.where(df['num_current_medications'] > 60, 3, df['days_in_hospital'])

In [17]:
# discretising num_inpatient_overnight_stays
df['num_inpatient_overnight_stays'] = np.where(df['num_inpatient_overnight_stays'].between(0,10), 0, df['num_inpatient_overnight_stays'])
df['num_inpatient_overnight_stays'] = np.where(df['num_inpatient_overnight_stays'].between(11,20), 1, df['num_inpatient_overnight_stays'])
df['num_inpatient_overnight_stays'] = np.where(df['num_inpatient_overnight_stays'] > 20, 2, df['num_inpatient_overnight_stays'])

In [18]:
# discretising num_previous_emergencies
df['num_previous_emergencies'] = np.where(df['num_previous_emergencies'].between(0,20), 0, df['num_previous_emergencies'])
df['num_previous_emergencies'] = np.where(df['num_previous_emergencies'].between(21,40), 1, df['num_previous_emergencies'])
df['num_previous_emergencies'] = np.where(df['num_previous_emergencies'].between(41,60), 2, df['num_previous_emergencies'])
df['num_previous_emergencies'] = np.where(df['num_previous_emergencies'] > 60, 3, df['num_previous_emergencies'])

In [19]:
# discretising num_outpatient_appointments
df['num_outpatient_appointments'] = np.where(df['num_outpatient_appointments'].between(0,20), 0, df['num_outpatient_appointments'])
df['num_outpatient_appointments'] = np.where(df['num_outpatient_appointments'].between(21,40), 1, df['num_outpatient_appointments'])
df['num_outpatient_appointments'] = np.where(df['num_outpatient_appointments'] > 40, 2, df['num_outpatient_appointments'])

In [20]:
value_counter(df)

---- Index: race----
1      641
4     1505
3     2037
0    19210
2    76099
Name: race, dtype: int64
---- Index: gender----
1    45917
0    53575
Name: gender, dtype: int64
---- Index: age----
0       160
10      682
20     1611
30     3699
40     9465
80    16800
90    19619
60    21988
70    25468
Name: age, dtype: int64
---- Index: days_in_hospital----
1     5450
0    94042
Name: days_in_hospital, dtype: int64
---- Index: num_lab_procedures----
4     1907
0    14277
3    17084
1    27092
2    39132
Name: num_lab_procedures, dtype: int64
---- Index: num_not_lab_procedures----
5     2984
4     4098
6     4825
3     9269
2    12488
1    20328
0    45500
Name: num_not_lab_procedures, dtype: int64
---- Index: num_current_medications----
3       95
1     5411
0    93986
Name: num_current_medications, dtype: int64
---- Index: num_outpatient_appointments----
2        1
1       38
0    99453
Name: num_outpatient_appointments, dtype: int64
---- Index: num_previous_emergencies----
2        3
3

In [21]:
new_df_1 = pd.get_dummies(df)

In [23]:
#after
print(df.gender.value_counts())
print(df.race.value_counts())

0    53575
1    45917
Name: gender, dtype: int64
2    76099
0    19210
3     2037
4     1505
1      641
Name: race, dtype: int64


In [24]:
new_df_1.to_csv("cleaned_data_BN_v2.csv")

In [25]:
new_df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99492 entries, 0 to 99491
Data columns (total 37 columns):
race                             99492 non-null int64
gender                           99492 non-null int64
age                              99492 non-null int64
days_in_hospital                 99492 non-null int64
num_lab_procedures               99492 non-null int64
num_not_lab_procedures           99492 non-null int64
num_current_medications          99492 non-null int64
num_outpatient_appointments      99492 non-null int64
num_previous_emergencies         99492 non-null int64
num_inpatient_overnight_stays    99492 non-null int64
number_diagnoses                 99492 non-null int64
metformin                        99492 non-null int64
repaglinide                      99492 non-null int64
nateglinide                      99492 non-null int64
chlorpropamide                   99492 non-null int64
glimepiride                      99492 non-null int64
acetohexamide              

Sources used:
- Sebastian Raschka, 2015. Python Machine Learning.
- Dmitry Zinoviev, 2016. Data Science Essentials in Python.
- Validation and Model Selection notebook from week 7
- https://www.kaggle.com/mayu0116/hyper-parameters-tuning-of-dtree-rf-svm-knn
- https://www.ritchieng.com/machine-learning-efficiently-search-tuning-param/
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html#sklearn.preprocessing.KBinsDiscretizer
- https://stackabuse.com/cross-validation-and-grid-search-for-model-selection-in-python/
- https://medium.com/30-days-of-machine-learning/day-3-k-nearest-neighbors-and-bias-variance-tradeoff-75f84d515bdb
- https://stackoverflow.com/questions/46168450/replace-a-specific-range-of-values-in-a-pandas-dataframe
