# Data Cleaning

In [89]:
# import pandas and math

import pandas as pd
import math

#to remove NaN entries
nan = float('nan')

#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)

In [90]:
# install the ucimlrepo
%pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [91]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
diabetes_data = fetch_ucirepo(id=296) 
  
# features and ids as pandas DataFrames
data_ids = diabetes_data.data.ids
data_features = diabetes_data.data.features
data_target = diabetes_data.data.targets


  df = pd.read_csv(data_url)


In [92]:
# add a dummy column to merge
data_ids.loc[:, 'merge_temp'] = data_ids.index
data_features.loc[:, 'merge_temp'] = data_features.index
data_target.loc[:, 'merge_temp'] = data_target.index

# merge into one DataFrame
data_temp = data_ids.merge(data_features, on='merge_temp')

data_raw = data_temp.merge(data_target, on='merge_temp')

del data_temp

data_raw = data_raw.drop(columns = 'merge_temp')

print(data_raw.columns, '\n')

print(data_raw.shape)


Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code',
       'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
       'diabetesMed', 'readmitted'],
      dtype='object') 

(101766, 50)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_ids.loc[:, 'merge_temp'] = data_ids.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_features.loc[:, 'merge_temp'] = data_features.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_target.loc[:, 'merge_temp'] = data_target.index


In [93]:
# remove missing/unnecesary features
dropped_columns = ['weight', 'payer_code', 'medical_specialty']

data_raw = data_raw.drop(columns = dropped_columns)

print(data_raw.shape)

(101766, 47)


In [94]:
# replace NaNs
data_raw = data_raw.replace(nan,-1)

# drop data points with missing race
data_raw = data_raw.drop(data_raw.loc[data_raw.race == -1].index)

# replace missing diagnoses, A1Cresults, and max_glu_serum values with a string
data_raw[['diag_1', 'diag_2', 'diag_3','A1Cresult','max_glu_serum']] = data_raw[['diag_1', 'diag_2', 'diag_3','A1Cresult','max_glu_serum']].replace(-1,'No')

print(data_raw.shape)

(99493, 47)


In [95]:
# remove data points for expired patients and patients discharged to hospices
# see ./data/IDS_mapping/csv for codes
data_raw = data_raw.drop(data_raw.loc[data_raw.discharge_disposition_id.isin([11, #Expired
                                                                              13, #Hospice/home
                                                                              14, #Hospice/medical facility
                                                                              19, #Expired at home. Medicaid only, hospice
                                                                              20, #Expired in a medical facility. Medicaid only, hospice
                                                                              21  #Expired, place unknown. Medicaid only, hospice
                                                                              ])].index)

print(data_raw.shape)

(97109, 47)


In [96]:
# write cleaned (no missing values) dataset to file
data_raw.to_csv('./data/dataset_clean.csv', index=False)