In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.io import arff
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

The variables of the diabetes dataset are described at https://www.hindawi.com/journals/bmri/2014/781670/tab1/

In [2]:
# Column names per dataset, keyed by dataset name.
# Each value is a two-element list: index 0 holds the continuous variables, index 1 the discrete ones.
columns_type_dataset = {
    # yeast: 103 continuous attributes named Att1 .. Att103, no discrete ones
    'yeast': [['Att%d' % att_number for att_number in range(1, 104)], []],
    'diabete': [
        ['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
         'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses'],
        ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
         'max_glu_serum', 'A1Cresult', 'diabetesMed', 'metformin', 'repaglinide', 'nateglinide',
         'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
         'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
         'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
         'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone',
         'change', 'readmitted', 'diag_1', 'diag_2', 'diag_3'],
    ],
    'woman': [
        ['n_0067', 'n_0078', 'n_0108', 'n_0109', 'o_0176', 'o_0264'],
        ['release', 'n_0047', 'n_0050', 'n_0052', 'n_0061', 'n_0075', 'n_0091',
         'c_0466', 'c_0500', 'c_0638', 'c_0699', 'c_0738', 'c_0761', 'c_0770', 'c_0838',
         'c_0870', 'c_0980', 'c_1145', 'c_1158', 'c_1189', 'c_1223', 'c_1227', 'c_1244', 'c_1259'],
    ],
}

# Yeast Dataset

In [None]:
# Read the raw yeast ARFF file; only the first element of loadarff's result (the records) is kept.
yeast_records = arff.loadarff('../dataset/dataset_raw/yeast.arff')[0]
df_yeast = pd.DataFrame(yeast_records)

# Convert the last 14 columns to numeric values, element by element.
# NOTE(review): presumably these are the Class* label columns — confirm against the file.
for trailing_col in df_yeast.columns[-14:]:
    df_yeast[trailing_col] = df_yeast[trailing_col].apply(pd.to_numeric)

In [None]:
# Preview the first rows of the yeast frame.
df_yeast.head()

In [None]:
# Columns whose name starts with 'Class' are the targets; every other column is a feature.
cols_Y = [name for name in df_yeast.columns if name.startswith('Class')]
cols_X = [name for name in df_yeast.columns if name not in cols_Y]

# Extract both parts as plain arrays.
X, y = df_yeast[cols_X].values, df_yeast[cols_Y].values

In [None]:
# Split features and targets 70/30 with a fixed seed; the 30% part gets the `_2e` suffix.
X_bb, X_2e, y_bb, y_2e = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Rebuild one DataFrame per split. The arrays are concatenated as [features | targets], so the
# header is spelled out in that same order (cols_X followed by cols_Y). Reusing df_yeast.columns
# would only be correct if every target column happened to come after every feature column.
split_columns = list(cols_X) + list(cols_Y)
df_bb = pd.DataFrame(data=np.concatenate((X_bb, y_bb), axis=1), columns=split_columns)
df_2e = pd.DataFrame(data=np.concatenate((X_2e, y_2e), axis=1), columns=split_columns)

In [None]:
# Write both yeast splits as comma-separated files without the index column.
for split_frame, csv_path in ((df_bb, '../dataset/yeast_bb.csv'), (df_2e, '../dataset/yeast_2e.csv')):
    split_frame.to_csv(csv_path, sep=',', index=False)

# Diabetes Dataset

Following the variable descriptions at https://www.hindawi.com/journals/bmri/2014/781670/tab1/, we drop the variables with a high percentage of missing values and those that carry no information for the classification (the patient id and the encounter id). The target variables are the three diagnosis fields: primary, secondary, and additional secondary diagnosis.

In [3]:
# Load the raw diabetes file; '?' entries are parsed as missing values (NaN).
# NOTE(review): the fragment printed under this cell looks like the tail of a pandas
# DtypeWarning (mixed-type columns) — confirm, and consider passing explicit dtypes.
df_diabete = pd.read_csv('../dataset/dataset_raw/diabetic_data.csv', sep=',', skipinitialspace=True, na_values='?')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# For every column with at least one missing value, print its name, the number of
# missing entries, and that number as a fraction of all rows (two decimals).
mv = df_diabete.isnull().sum(axis=0)
n_rows = len(df_diabete)
for k, v in mv[mv > 0].items():
    print(k, v, '%.2f' % (v / n_rows))

race 2273 0.02
weight 98569 0.97
payer_code 40256 0.40
medical_specialty 49949 0.49
diag_1 21 0.00
diag_2 358 0.00
diag_3 1423 0.01


In [5]:
# Drop the two identifier columns plus weight, payer_code and medical_specialty
# (the columns with the largest share of missing values in the report above).
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
df_diabete = df_diabete.drop(columns=['encounter_id','patient_nbr','weight','payer_code','medical_specialty'])

In [6]:
# Row count before and after dropping every row that still has a missing value.
len(df_diabete),len(df_diabete.dropna())

(101766, 98053)

In [7]:
# Remove rows with any missing value and renumber the rows from 0.
# reset_index(drop=True) discards the old index directly, instead of materialising it as an
# 'index' column and dropping it with a positional axis argument (rejected by pandas >= 2.0).
df_diabete = df_diabete.dropna().reset_index(drop=True)

In [8]:
# Keep a reproducible random sample of 14644 rows (fixed seed) and renumber the rows from 0.
# reset_index(drop=True) avoids the positional-axis drop('index', 1), rejected by pandas >= 2.0.
df_diabete = df_diabete.sample(n=14644, random_state=0).reset_index(drop=True)

In [9]:
def _frequent_codes(counts, percentile=75):
    """Return the index labels of `counts` whose value is strictly above the given percentile.

    counts: pandas Series mapping a code to its number of occurrences.
    percentile: percentile (0-100) of the count distribution used as the cut-off.
    Returns a NumPy array with the labels that pass the cut-off.
    """
    threshold = np.percentile(counts.values, percentile)
    return counts[counts.values > threshold].index.values


# Occurrences of each diagnosis code, and the codes above the 75th percentile of those
# counts, computed the same way for each of the three diagnosis columns.
diag_1_numerosity = df_diabete.groupby('diag_1').size()
frequent_diag1 = _frequent_codes(diag_1_numerosity)

diag_2_numerosity = df_diabete.groupby('diag_2').size()
frequent_diag2 = _frequent_codes(diag_2_numerosity)

diag_3_numerosity = df_diabete.groupby('diag_3').size()
frequent_diag3 = _frequent_codes(diag_3_numerosity)

In [13]:
# Keep only the rows whose three diagnosis codes are all among the frequent ones,
# then renumber the rows from 0.
# reset_index(drop=True) avoids the positional-axis drop('index', 1), rejected by pandas >= 2.0.
has_frequent_diagnoses = (
    df_diabete.diag_1.isin(frequent_diag1)
    & df_diabete.diag_2.isin(frequent_diag2)
    & df_diabete.diag_3.isin(frequent_diag3)
)
df_diabete = df_diabete[has_frequent_diagnoses].reset_index(drop=True)

In [14]:
# (rows, columns) left after filtering.
df_diabete.shape

(10670, 45)

In [19]:
# Preview the three diagnosis columns.
df_diabete[['diag_1','diag_2','diag_3']].head()

Unnamed: 0,diag_1,diag_2,diag_3
0,414.0,425,466.0
1,996.0,403,585.0
2,250.12,276,250.6
3,250.6,682,707.0
4,414.0,428,250.01


In [16]:
# age becomes a continuous variable: take the upper bound of a bracket such as '[70-80)'
# and subtract 5 (the midpoint, assuming brackets are 10 years wide).
df_diabete['age'] = df_diabete['age'].map(
    lambda bracket: float(bracket.strip('[').strip(')').split('-')[1]) - 5
)

In [17]:
# Sanity check: number of declared column names (discrete + continuous) vs. columns in the frame.
len(columns_type_dataset['diabete'][1]+columns_type_dataset['diabete'][0]),len(df_diabete.columns)

(45, 45)

In [20]:
# One-hot encode every discrete diabetes column. The encoder is refit per column and each
# indicator column is named '<column>=<category>'.
# OneHotEncoder comes from the import cell at the top of the notebook.
enc = OneHotEncoder(categories='auto')
encoded_dataframes = []
categorical_variables_diabetes = []

for col in columns_type_dataset['diabete'][1]:
    column_values = df_diabete[col].values.reshape(-1, 1)  # the encoder expects a 2-D input
    values_encoded = enc.fit_transform(column_values).toarray()
    categories_names = enc.categories_
    columns_names = [col+'='+str(name) for name in categories_names[0]]
    encoded_dataframes.append(pd.DataFrame(values_encoded, columns=columns_names))
    categorical_variables_diabetes.append(columns_names)

# Flat list of every indicator column name, in encoding order.
categorical_variables_diabetes_names = [item for sublist in categorical_variables_diabetes for item in sublist]

# Indicator columns first, then the continuous columns unchanged.
# axis is passed by keyword: pandas >= 2.0 rejects a positional axis in pd.concat.
diabete_encoded = pd.concat(
    [pd.concat(encoded_dataframes, axis=1), df_diabete[columns_type_dataset['diabete'][0]]],
    axis=1,
)
diabete_encoded.head()

Unnamed: 0,race=AfricanAmerican,race=Asian,race=Caucasian,race=Hispanic,race=Other,gender=Female,gender=Male,admission_type_id=1,admission_type_id=2,admission_type_id=3,...,diag_3=V58,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,75.0,3,66,1,18,0,0,1,9
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,75.0,2,29,2,15,1,0,3,9
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,45.0,4,56,0,11,0,0,1,7
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,65.0,3,27,0,15,0,0,0,8
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,75.0,3,75,0,15,0,0,0,6


In [32]:
# Show which diagnosis indicator columns are set for the first row.
# target_cols is built here so the cell runs on a fresh kernel without a later cell having
# been executed first; boolean masking replaces Series.nonzero(), removed in pandas 1.0.
target_cols = [col for col in diabete_encoded.columns if 'diag_' in col]
s = diabete_encoded[target_cols].loc[0]
s[s != 0]

diag_1=414    1.0
diag_2=425    1.0
diag_3=466    1.0
Name: 0, dtype: float64

In [33]:
# Sanity check: every row should have exactly three diagnosis indicators set
# (one each for diag_1, diag_2 and diag_3). Print the index of any row that does not.
# Counting is done for all rows at once; Series.nonzero() was removed in pandas 1.0.
target_cols = [col for col in diabete_encoded.columns if 'diag_' in col]
nonzero_per_row = (diabete_encoded[target_cols] != 0).sum(axis=1)
for i in nonzero_per_row.index[nonzero_per_row != 3]:
    print(i)

In [34]:
# Replace the discrete column names of the diabetes entry with the one-hot indicator names,
# leaving out every name that contains 'diag_' (the targets).
columns_type_dataset['diabete'][1] = [
    indicator_name
    for indicator_name in categorical_variables_diabetes_names
    if 'diag_' not in indicator_name
]

In [35]:
# Indicator columns whose name starts with 'diag_' are the targets; the rest are features.
cols_Y = [name for name in diabete_encoded.columns if name.startswith('diag_')]
cols_X = [name for name in diabete_encoded.columns if name not in cols_Y]

# Extract both parts as plain arrays.
X, y = diabete_encoded[cols_X].values, diabete_encoded[cols_Y].values

In [36]:
# Display the target matrix (one column per diagnosis indicator).
diabete_encoded[cols_Y].values

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
# Report the column bookkeeping: total columns, features + targets, features, targets.
column_counts = (
    len(diabete_encoded.columns),
    len(cols_X) + len(cols_Y),
    len(cols_X),
    len(cols_Y),
)
print('number of columns: %d \nsum of cols X + cols Y: %d \nnumber of features: %d \nnumber of target cols: %d' % column_counts)

number of columns: 511 
sum of cols X + cols Y: 511 
number of features: 134 
number of target cols: 377


In [42]:
# Split features and targets 70/30 with a fixed seed; the 30% part gets the `_2e` suffix.
X_bb, X_2e, y_bb, y_2e = train_test_split(X, y, test_size=0.3, random_state=0)

In [43]:
# Shapes of the feature and target arrays of the 70% split.
for split_array in (X_bb, y_bb):
    print(split_array.shape)

(7469, 134)
(7469, 377)


In [44]:
# Rebuild one DataFrame per split: feature columns followed by target columns.
# axis is passed by keyword: pandas >= 2.0 rejects a positional axis in pd.concat.
df_bb = pd.concat([pd.DataFrame(data=X_bb, columns=cols_X), pd.DataFrame(data=y_bb, columns=cols_Y)], axis=1)
df_2e = pd.concat([pd.DataFrame(data=X_2e, columns=cols_X), pd.DataFrame(data=y_2e, columns=cols_Y)], axis=1)

In [45]:
# Write both diabetes splits as comma-separated files without the index column.
for split_frame, csv_path in ((df_bb, '../dataset/diabete_bb.csv'), (df_2e, '../dataset/diabete_2e.csv')):
    split_frame.to_csv(csv_path, sep=',', index=False)

# Women's Health Care

In [None]:
# Load the raw women's health care feature file.
woman = pd.read_csv('../dataset/dataset_raw/women_health_care.csv', sep=',')

In [None]:
# Preview the first rows of the feature frame.
woman.head()

In [None]:
# Number of rows in the feature frame.
len(woman)

In [None]:
# Count missing values per column and collect the name of every column that has at least one.
mv = woman.isnull().sum(axis=0)
columns2drop = [name for name, n_missing in zip(mv.index, mv.values) if n_missing != 0.0]

In [None]:
# Remove, in place, every column that contains missing values.
woman.drop(columns=columns2drop, inplace=True)

In [None]:
# One-hot encode every discrete column of the woman dataset. The encoder is refit per column
# and each indicator column is named '<column><category>'.
# NOTE(review): the diabetes cell joins the two parts with '='; the name format is left
# untouched here because it ends up in the CSV headers and the pickled dictionary.
enc = OneHotEncoder(categories='auto')
encoded_dataframes = []
categorical_variables_woman = []

for col in columns_type_dataset['woman'][1]:
    column_values = woman[col].values.reshape(-1, 1)  # the encoder expects a 2-D input
    values_encoded = enc.fit_transform(column_values).toarray()
    categories_names = enc.categories_
    columns_names = [col+str(name) for name in categories_names[0]]
    encoded_dataframes.append(pd.DataFrame(values_encoded, columns=columns_names))
    categorical_variables_woman.append(columns_names)

# Flat list of every indicator column name, in encoding order.
categorical_variables_woman_names = [item for sublist in categorical_variables_woman for item in sublist]

# Indicator columns first, then every remaining (non-encoded) column of the original frame.
# axis/columns are passed by keyword: pandas >= 2.0 rejects a positional axis in
# pd.concat and DataFrame.drop.
woman_encoded = pd.concat(
    [pd.concat(encoded_dataframes, axis=1), woman.drop(columns=columns_type_dataset['woman'][1])],
    axis=1,
)
woman_encoded.head()

In [None]:
# Replace the discrete column names of the woman entry with the one-hot indicator names.
columns_type_dataset['woman'][1] = categorical_variables_woman_names

In [None]:
# Load the label file for the women's health care dataset.
df_hc_label = pd.read_csv('../dataset/dataset_raw/women_health_care_labels.csv', sep=',')

In [None]:
# Preview the first rows of the label frame.
df_hc_label.head()

In [None]:
# Number of rows in the label frame.
len(df_hc_label)

In [None]:
# Inner-join the encoded features with their labels on 'id', then discard the id.
# reset_index(drop=True) throws the 'id' index away directly, instead of turning it back into
# a column and dropping it with a positional axis argument (rejected by pandas >= 2.0).
df_hc = (
    woman_encoded.set_index('id')
    .join(df_hc_label.set_index('id'), how='inner')
    .reset_index(drop=True)
)

In [None]:
# Columns whose name starts with 'service' are the targets; every other column is a feature.
cols_Y = [name for name in df_hc.columns if name.startswith('service')]
cols_X = [name for name in df_hc.columns if name not in cols_Y]

# Extract both parts as plain arrays.
X, y = df_hc[cols_X].values, df_hc[cols_Y].values

In [None]:
# Split features and targets 70/30 with a fixed seed; the 30% part gets the `_2e` suffix.
X_bb, X_2e, y_bb, y_2e = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Rebuild one DataFrame per split. The arrays are concatenated as [features | targets], so the
# header is spelled out in that same order (cols_X followed by cols_Y). Reusing df_hc.columns
# would only be correct if every target column happened to come after every feature column.
split_columns = list(cols_X) + list(cols_Y)
df_bb = pd.DataFrame(data=np.concatenate((X_bb, y_bb), axis=1), columns=split_columns)
df_2e = pd.DataFrame(data=np.concatenate((X_2e, y_2e), axis=1), columns=split_columns)

In [None]:
# Write both woman splits as comma-separated files without the index column.
for split_frame, csv_path in ((df_bb, '../dataset/woman_bb.csv'), (df_2e, '../dataset/woman_2e.csv')):
    split_frame.to_csv(csv_path, sep=',', index=False)

#### Save the dictionary of variable names, divided into categorical and continuous ones, for later use

In [None]:
# Disabled on purpose: if enabled, this would remove the 'diabete' entry from the dictionary
# before it is pickled below.
#columns_type_dataset.pop('diabete')

In [None]:
# Persist the column-name dictionary with the highest pickle protocol available.
with open('../dataset/dict_names.pickle', 'wb') as pickle_file:
    pickle.dump(columns_type_dataset, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)