In [19]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.io import arff

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Yeast Dataset

In [4]:
# Load the yeast ARFF file. loadarff returns a (data, meta) pair; only the data
# records are needed to build the DataFrame.
yeast_records, _yeast_meta = arff.loadarff('../dataset/yeast.arff')
df_yeast = pd.DataFrame(yeast_records)

# Convert the trailing 14 columns to a numeric dtype. pd.to_numeric is applied to
# each whole column (one vectorized call per column) instead of being invoked once
# per cell through Series.apply, which yields the same values but far more slowly.
for col in df_yeast.columns[-14:]:
    df_yeast[col] = pd.to_numeric(df_yeast[col])

In [5]:
# Preview the first rows of the yeast frame to sanity-check the load and the numeric conversion.
df_yeast.head()

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.004168,-0.170975,-0.156748,-0.142151,0.058781,0.026851,0.197719,0.04185,0.066938,-0.056617,...,0,0,1,1,0,0,0,1,1,0
1,-0.103956,0.011879,-0.098986,-0.054501,-0.00797,0.049113,-0.03058,-0.077933,-0.080529,-0.016267,...,0,0,0,0,0,0,0,0,0,0
2,0.509949,0.401709,0.293799,0.087714,0.011686,-0.006411,-0.006255,0.013646,-0.040666,-0.024447,...,0,0,0,0,0,0,0,1,1,0
3,0.119092,0.004412,-0.002262,0.072254,0.044512,-0.051467,0.074686,-0.00767,0.079438,0.062184,...,0,0,0,0,0,0,0,0,0,0
4,0.042037,0.007054,-0.069483,0.081015,-0.048207,0.089446,-0.004947,0.064456,-0.133387,0.068878,...,1,1,0,0,0,0,0,0,0,0


In [6]:
# Partition the column names in a single pass: names starting with 'Class' are
# label columns, every other column is a feature column.
cols_Y, cols_X = [], []
for column_name in df_yeast.columns:
    if column_name.startswith('Class'):
        cols_Y.append(column_name)
    else:
        cols_X.append(column_name)

# Plain arrays for scikit-learn: feature matrix and label matrix.
X = df_yeast.loc[:, cols_X].values
y = df_yeast.loc[:, cols_Y].values

In [7]:
# Split rows into two partitions: 70% go to the *_bb arrays and 30% (test_size=0.3)
# to the *_2e arrays; random_state=0 makes the shuffle reproducible.
# NOTE(review): 'bb' / '2e' presumably stand for "black box" and "to explain" -- confirm.
X_bb, X_2e, y_bb, y_2e = train_test_split(X, y, test_size=0.3, random_state=0)

In [8]:
# Re-attach features and labels of each partition so they can be saved as one table.
# The column labels are built from cols_X + cols_Y, i.e. exactly the order in which
# X and y were sliced and are concatenated here. Using df_yeast.columns instead would
# only be correct when every label column sits after every feature column, and would
# silently mislabel the data otherwise.
# Note: np.concatenate returns a single-dtype array, so features and labels end up
# sharing one common dtype in the resulting frames.
partition_columns = list(cols_X) + list(cols_Y)
df_bb = pd.DataFrame(data=np.concatenate((X_bb, y_bb), axis=1), columns=partition_columns)
df_2e = pd.DataFrame(data=np.concatenate((X_2e, y_2e), axis=1), columns=partition_columns)

In [9]:
# Persist both yeast partitions as comma-separated files without the row index.
for partition, csv_path in (
    (df_bb, '../dataset/yeast_bb.csv'),
    (df_2e, '../dataset/yeast_2e.csv'),
):
    partition.to_csv(csv_path, sep=',', index=False)

# Diabetes Dataset

In [114]:
# Load the raw diabetes table. '?' entries are parsed as missing values (na_values)
# and whitespace following each delimiter is ignored (skipinitialspace).
df_diabete = pd.read_csv('../dataset/diabetic_data.csv', sep=',', skipinitialspace=True, na_values='?')

  interactivity=interactivity, compiler=compiler, result=result)


In [115]:
# Row count of the raw diabetes table.
len(df_diabete)

101766

In [116]:
# Per-column count of missing values.
mv = df_diabete.isnull().sum(axis=0)

# Report only the columns that have at least one missing value:
# column name, missing count, and missing fraction of all rows (two decimals).
n_rows = len(df_diabete)
for column_name, n_missing in zip(mv.index, mv.values):
    if not v_is_missing(n_missing) if False else n_missing <= 0:
        continue
    print(column_name, n_missing, '%.2f' % (n_missing/n_rows))

race 2273 0.02
weight 98569 0.97
payer_code 40256 0.40
medical_specialty 49949 0.49
diag_1 21 0.00
diag_2 358 0.00
diag_3 1423 0.01


In [117]:
# Distribution of the raw target: the distinct 'readmitted' values together with their counts.
np.unique(df_diabete['readmitted'], return_counts=True)

(array(['<30', '>30', 'NO'], dtype=object), array([11357, 35545, 54864]))

In [118]:
# Expand the three-valued 'readmitted' column into one 0/1 indicator column per value.
# Pairs are (new indicator column, raw value it flags); a tuple keeps the insertion order fixed.
readmitted_indicators = (
    ('readmitted-<30', '<30'),
    ('readmitted->30', '>30'),
    ('readmitted-NO', 'NO'),
)
for indicator_col, outcome in readmitted_indicators:
    df_diabete[indicator_col] = (df_diabete['readmitted'] == outcome).astype('int64')

In [119]:
# Impute the sparsely missing categorical columns with their most frequent value.
for col in ['race', 'diag_1', 'diag_2', 'diag_3']:
    # mode() returns the most frequent value(s) as a Series; take the first one.
    most_frequent = df_diabete[col].mode().iloc[0]
    df_diabete[col] = df_diabete[col].fillna(most_frequent)

In [120]:
# Columns removed before modelling:
#  - weight, payer_code, medical_specialty: the columns with the largest share of
#    missing values in the missing-value report above
#  - encounter_id, patient_nbr: presumably record identifiers -- verify
#  - readmitted: the raw target, already expanded into the readmitted-* indicator columns
columns2drop = [
    'weight',
    'payer_code',
    'medical_specialty',
    'encounter_id',
    'patient_nbr',
    'readmitted',
]
df_diabete = df_diabete.drop(columns2drop, axis=1)

In [121]:
# Renumber the rows 0..n-1 and discard the previous index instead of keeping it as a column.
df_diabete = df_diabete.reset_index(drop=True)

In [122]:
# Preview the cleaned diabetes frame (indicator targets added, unused columns dropped).
df_diabete.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted-<30,readmitted->30,readmitted-NO
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,0,0,1
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,No,No,No,No,Ch,Yes,0,1,0
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,No,Yes,0,0,1
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,No,No,No,No,Ch,Yes,0,0,1
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,No,No,No,No,Ch,Yes,0,0,1


In [123]:
def binarize_features(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each column whose dtype kind is ``'O'`` is replaced by the integer codes
    produced by a fresh ``LabelEncoder``. Columns of any other dtype are left
    untouched. Despite the name, no one-hot / binary expansion happens here:
    the function only reports which encoded columns would need it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    tuple
        ``(df, features2binarize)`` where ``df`` is the same (mutated) frame
        that was passed in and ``features2binarize`` is the list of positional
        column indices (positions in ``df.columns``) of the encoded columns
        whose encoder found more than two distinct classes.
    """
    multiclass_positions = []
    for position, column_name in enumerate(df.columns):
        # Only object-dtype columns are encoded; numeric columns are skipped.
        if df[column_name].dtype.kind != 'O':
            continue
        encoder = LabelEncoder()
        df[column_name] = encoder.fit_transform(df[column_name])
        # More than two classes: a single 0/1 column cannot represent the feature.
        if len(encoder.classes_) > 2:
            multiclass_positions.append(position)
    return df, multiclass_positions

In [124]:
# Label-encode the object-dtype columns of the diabetes frame; f2b receives the
# positional indices of the encoded columns that have more than two classes.
df_diabete, f2b = binarize_features(df_diabete)

In [126]:
# Partition the column names in a single pass: names starting with 'readmitted'
# are the target indicator columns, every other column is a feature column.
cols_Y, cols_X = [], []
for column_name in df_diabete.columns:
    if column_name.startswith('readmitted'):
        cols_Y.append(column_name)
    else:
        cols_X.append(column_name)

# Plain arrays for scikit-learn: feature matrix and label matrix.
X = df_diabete.loc[:, cols_X].values
y = df_diabete.loc[:, cols_Y].values

In [127]:
# Same 70% / 30% split as for the yeast data (test_size=0.3), with a fixed
# random_state so the partitions are reproducible.
X_bb, X_2e, y_bb, y_2e = train_test_split(X, y, test_size=0.3, random_state=0)

In [129]:
# Re-attach features and labels of each partition so they can be saved as one table.
# The column labels are built from cols_X + cols_Y, i.e. exactly the order in which
# X and y were sliced and are concatenated here. Using df_diabete.columns instead would
# only be correct when every target column sits after every feature column, and would
# silently mislabel the data otherwise.
# Note: np.concatenate returns a single-dtype array, so features and labels end up
# sharing one common dtype in the resulting frames.
partition_columns = list(cols_X) + list(cols_Y)
df_bb = pd.DataFrame(data=np.concatenate((X_bb, y_bb), axis=1), columns=partition_columns)
df_2e = pd.DataFrame(data=np.concatenate((X_2e, y_2e), axis=1), columns=partition_columns)

In [130]:
# Persist both diabetes partitions as comma-separated files without the row index.
for partition, csv_path in (
    (df_bb, '../dataset/diabete_bb.csv'),
    (df_2e, '../dataset/diabete_2e.csv'),
):
    partition.to_csv(csv_path, sep=',', index=False)

# Women's Health Care Dataset

In [131]:
# Load the women's health care feature table (one row per 'id').
df_hc = pd.read_csv('../dataset/women_health_care.csv', sep=',')

  interactivity=interactivity, compiler=compiler, result=result)


In [132]:
# Preview the raw feature table; many columns are mostly empty.
df_hc.head()

Unnamed: 0,id,release,n_0000,n_0001,n_0002,n_0003,n_0004,n_0005,n_0006,n_0007,...,c_1368,c_1369,c_1370,c_1371,c_1372,c_1373,c_1374,c_1375,c_1376,c_1377
0,11193,a,,,0.025449,,,0.368421,,,...,,,,,a,,q,,,
1,11382,a,,,0.031297,,,0.315789,,,...,,,a,,a,,,,,
2,16531,a,,,0.024475,,,0.342105,,,...,,,a,,a,,b,,,
3,1896,a,,,0.041694,,,0.447368,,,...,,,,,a,,,,,
4,18262,c,,,0.03812,,,0.315789,,,...,,,b,,a,,a,,,


In [133]:
# Row count of the raw feature table.
len(df_hc)

14644

In [134]:
# Per-column count of missing values.
mv = df_hc.isnull().sum(axis=0)

# Every column that contains at least one missing value is scheduled for removal.
columns2drop = [
    column_name
    for column_name, n_missing in zip(mv.index, mv.values)
    if n_missing != 0.0
]

In [135]:
# Keep only the fully populated columns.
df_hc = df_hc.drop(columns2drop, axis=1)

In [136]:
# Preview the feature table after removing every column that had missing values.
df_hc.head()

Unnamed: 0,id,release,n_0047,n_0050,n_0052,n_0061,n_0067,n_0075,n_0078,n_0091,...,c_0838,c_0870,c_0980,c_1145,c_1158,c_1189,c_1223,c_1227,c_1244,c_1259
0,11193,a,1,1,1,1,0.928571,1,0.8,1,...,a,b,c,b,g,b,c,a,d,n
1,11382,a,1,1,1,1,0.928571,1,0.666667,1,...,a,b,c,b,g,b,a,a,d,e
2,16531,a,1,1,1,1,0.428571,1,0.833333,1,...,a,b,c,b,j,b,c,a,d,w
3,1896,a,1,1,1,1,0.571429,1,0.566667,1,...,b,b,c,b,e,b,c,a,d,e
4,18262,c,1,1,1,1,0.928571,1,0.6,1,...,a,a,c,c,b,b,c,a,d,e


In [137]:
# Load the label table: one row per 'id' with the service_* target columns.
df_hc_label = pd.read_csv('../dataset/women_health_care_labels.csv', sep=',')

In [138]:
# Preview the label table.
df_hc_label.head()

Unnamed: 0,id,service_a,service_b,service_c,service_d,service_e,service_f,service_g,service_h,service_i,service_j,service_k,service_l,service_m,service_n
0,11193,1,1,0,0,0,0,0,0,0,1,1,0,0,0
1,11382,0,0,0,0,0,0,0,0,0,1,1,0,0,0
2,16531,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,1896,0,0,0,1,0,0,0,0,0,1,0,1,0,0
4,18262,0,0,0,1,1,0,0,0,0,0,1,1,1,0


In [139]:
# Row count of the label table, for comparison with the feature table's row count.
len(df_hc_label)

14644

In [140]:
# Align features and labels on the shared 'id' key; the inner join keeps only
# ids that are present in both tables.
hc_features_by_id = df_hc.set_index('id')
hc_labels_by_id = df_hc_label.set_index('id')
df_hc = hc_features_by_id.join(hc_labels_by_id, how='inner')

In [141]:
# Preview the joined table: features followed by the service_* label columns, indexed by id.
df_hc.head()

Unnamed: 0_level_0,release,n_0047,n_0050,n_0052,n_0061,n_0067,n_0075,n_0078,n_0091,n_0108,...,service_e,service_f,service_g,service_h,service_i,service_j,service_k,service_l,service_m,service_n
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11193,a,1,1,1,1,0.928571,1,0.8,1,0.8,...,0,0,0,0,0,1,1,0,0,0
11382,a,1,1,1,1,0.928571,1,0.666667,1,0.666667,...,0,0,0,0,0,1,1,0,0,0
16531,a,1,1,1,1,0.428571,1,0.833333,1,0.833333,...,0,0,0,0,0,1,1,0,0,0
1896,a,1,1,1,1,0.571429,1,0.566667,1,0.566667,...,0,0,0,0,0,1,0,1,0,0
18262,c,1,1,1,1,0.928571,1,0.6,1,0.6,...,1,0,0,0,0,0,1,1,1,0


In [142]:
# Label-encode the object-dtype columns of the health care frame; f2b receives the
# positional indices of the encoded columns that have more than two classes.
df_hc, f2b = binarize_features(df_hc)

In [143]:
# Preview the encoded table; former string columns now hold integer codes.
df_hc.head()

Unnamed: 0_level_0,release,n_0047,n_0050,n_0052,n_0061,n_0067,n_0075,n_0078,n_0091,n_0108,...,service_e,service_f,service_g,service_h,service_i,service_j,service_k,service_l,service_m,service_n
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11193,0,1,1,1,1,0.928571,1,0.8,1,0.8,...,0,0,0,0,0,1,1,0,0,0
11382,0,1,1,1,1,0.928571,1,0.666667,1,0.666667,...,0,0,0,0,0,1,1,0,0,0
16531,0,1,1,1,1,0.428571,1,0.833333,1,0.833333,...,0,0,0,0,0,1,1,0,0,0
1896,0,1,1,1,1,0.571429,1,0.566667,1,0.566667,...,0,0,0,0,0,1,0,1,0,0
18262,2,1,1,1,1,0.928571,1,0.6,1,0.6,...,1,0,0,0,0,0,1,1,1,0


In [145]:
# Partition the column names in a single pass: names starting with 'service'
# are label columns, every other column is a feature column.
cols_Y, cols_X = [], []
for column_name in df_hc.columns:
    if column_name.startswith('service'):
        cols_Y.append(column_name)
    else:
        cols_X.append(column_name)

# Plain arrays for scikit-learn: feature matrix and label matrix.
X = df_hc.loc[:, cols_X].values
y = df_hc.loc[:, cols_Y].values

In [146]:
# Same 70% / 30% split as for the other datasets (test_size=0.3), with a fixed
# random_state so the partitions are reproducible.
X_bb, X_2e, y_bb, y_2e = train_test_split(X, y, test_size=0.3, random_state=0)

In [147]:
# Re-attach features and labels of each partition so they can be saved as one table.
# The column labels are built from cols_X + cols_Y, i.e. exactly the order in which
# X and y were sliced and are concatenated here. Using df_hc.columns instead would
# only be correct when every label column sits after every feature column, and would
# silently mislabel the data otherwise.
# Note: np.concatenate returns a single-dtype array, so features and labels end up
# sharing one common dtype in the resulting frames.
partition_columns = list(cols_X) + list(cols_Y)
df_bb = pd.DataFrame(data=np.concatenate((X_bb, y_bb), axis=1), columns=partition_columns)
df_2e = pd.DataFrame(data=np.concatenate((X_2e, y_2e), axis=1), columns=partition_columns)

In [148]:
# Persist both health care partitions as comma-separated files without the row index.
for partition, csv_path in (
    (df_bb, '../dataset/woman_bb.csv'),
    (df_2e, '../dataset/woman_2e.csv'),
):
    partition.to_csv(csv_path, sep=',', index=False)

In [85]:
#df_drug = pd.read_csv('../dataset/drug_consumption.csv', sep=',')
#df_drug.head()
#len(df_drug)