In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.datasets import load_breast_cancer

In [None]:
cancer = load_breast_cancer()

## Make data myself

In [None]:
# or
df = pd.read_csv('breast-cancer-wisconsin-data.csv', nrows=1)
columns = df.columns.tolist()
cols_to_use = columns[:len(columns)-1]
df = pd.read_csv('breast-cancer-wisconsin-data.csv', usecols=cols_to_use, delimiter=',')
df

In [None]:
df.set_index('id', inplace=True)

In [None]:
df['diagnosis'] = df['diagnosis'].astype('category').cat.codes
df

In [None]:
cat_values =df.columns[1:]

In [None]:
for cat in cat_values:
    # give each category a one-hot-encoded vector
    df = df.join(pd.get_dummies(df[cat], prefix=cat + "_"))
    df = df.drop(cat, axis = 1)

In [None]:
df.head()

In [None]:
y = df['diagnosis'].values
X = df.drop('diagnosis', axis=1).values

## -- automade

In [None]:
cancer.keys()

In [None]:
# Print full description by running:
# print(cancer['DESCR'])
# 569 data points with 30 features
cancer['data'].shape

In [None]:
X = cancer['data']
y = cancer['target']

In [None]:
import pandas as pd
pd.DataFrame(y)

In [10]:
# This stratify parameter makes a split so that the proportion of values in the sample produced will be the same as the proportion of values provided to parameter stratify.
# For example, if variable y is a binary categorical variable with values 0 and 1 and there are 25% of zeros and 75% of ones, stratify=y will make sure that your random split has 25% of 0's and 75% of 1's.

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [None]:
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(X_train)

In [None]:
# Now apply the transformations to the data:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30), verbose=1)

In [12]:
mlp.fit(X_train,y_train)

Iteration 1, loss = 0.23952156
Iteration 2, loss = 0.16437592
Iteration 3, loss = 0.14293296
Iteration 4, loss = 0.13218338
Iteration 5, loss = 0.12551630
Iteration 6, loss = 0.12082296
Iteration 7, loss = 0.11737090
Iteration 8, loss = 0.11455702
Iteration 9, loss = 0.11229796
Iteration 10, loss = 0.11023656
Iteration 11, loss = 0.10870136
Iteration 12, loss = 0.10716509
Iteration 13, loss = 0.10572778
Iteration 14, loss = 0.10488414
Iteration 15, loss = 0.10388366
Iteration 16, loss = 0.10323305
Iteration 17, loss = 0.10256144
Iteration 18, loss = 0.10144029
Iteration 19, loss = 0.10101789
Iteration 20, loss = 0.10047289
Iteration 21, loss = 0.09984385
Iteration 22, loss = 0.09951395
Iteration 23, loss = 0.09902306
Iteration 24, loss = 0.09850717
Iteration 25, loss = 0.09825270
Iteration 26, loss = 0.09776830
Iteration 27, loss = 0.09752156
Iteration 28, loss = 0.09708081
Iteration 29, loss = 0.09685794
Iteration 30, loss = 0.09667160
Iteration 31, loss = 0.09602946
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=1, warm_start=False)

In [13]:
predictions = mlp.predict(X_test)

In [14]:
print(confusion_matrix(y_test,predictions))

[[  9481   6677]
 [  4114 134249]]


In [15]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.70      0.59      0.64     16158
           1       0.95      0.97      0.96    138363

    accuracy                           0.93    154521
   macro avg       0.83      0.78      0.80    154521
weighted avg       0.93      0.93      0.93    154521



In [None]:
mglearn.plots.plot

In [2]:
import pandas as pd

In [3]:
# read in new cat values
df = pd.read_csv('mimic-iii-clinical-database-1.4/DIAG_ROWS.csv')
df_loc = pd.read_csv('mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz')
df_loc = df_loc[['HADM_ID', 'LOS', 'ICUSTAY_ID']]

df = pd.merge(df_loc, df, on='HADM_ID', how='left')
df[['DISEASES AND INJURIES', 'SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS']] = df[['DISEASES AND INJURIES', 'SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS']].astype('str')
df = df.drop('HADM_ID', axis=1)
df = df.dropna()

df

Unnamed: 0,LOS,ICUSTAY_ID,DISEASES AND INJURIES,"SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS",SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES,SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING
7,7.1314,219649,389.0,78552.0,V667,E8889
8,7.1314,219649,389.0,78552.0,V1582,E8889
9,7.1314,219649,51881.0,78552.0,V667,E8889
10,7.1314,219649,51881.0,78552.0,V1582,E8889
11,7.1314,219649,5849.0,78552.0,V667,E8889
12,7.1314,219649,5849.0,78552.0,V1582,E8889
13,7.1314,219649,2869.0,78552.0,V667,E8889
14,7.1314,219649,2869.0,78552.0,V1582,E8889
15,7.1314,219649,85221.0,78552.0,V667,E8889
16,7.1314,219649,85221.0,78552.0,V1582,E8889


In [4]:
cat_values = ['DISEASES AND INJURIES', 
             'SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS', 
             'SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING', 
             'SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES']

target = 'LOS'

In [None]:
# encoding
df = pd.read_csv('ENC.csv')
df_loc = pd.read_csv('mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz')
df_loc = df_loc[['HADM_ID', 'LOS', 'ICUSTAY_ID']]

df = pd.merge(df_loc, df, on='HADM_ID', how='left')
df = df.drop('HADM_ID', axis=1)
df = df.dropna()

df

In [5]:
# give every category a number
# df['ADMISSION_TYPE'] = df['ADMISSION_TYPE'].astype('category').cat.codes

for cat in cat_values:
    # give each category a one-hot-encoded vector
    df = df.join(pd.get_dummies(df[cat], prefix=cat + "_"))
    df = df.drop(cat, axis = 1)

In [6]:
# remove columns
#df = df.drop(cat_values, axis = 1)

df.set_index('ICUSTAY_ID', inplace=True)

In [7]:
df

Unnamed: 0_level_0,LOS,DISEASES AND INJURIES__1085.0,DISEASES AND INJURIES__1100.0,DISEASES AND INJURIES__1101.0,DISEASES AND INJURIES__1103.0,DISEASES AND INJURIES__1104.0,DISEASES AND INJURIES__1105.0,DISEASES AND INJURIES__1108.0,DISEASES AND INJURIES__1110.0,DISEASES AND INJURIES__1119.0,...,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V860,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V861,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V8709,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V872,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V8741,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V8801,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V8812,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V8821,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V902,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES__V9089
ICUSTAY_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
219649,7.1314,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219649,7.1314,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219649,7.1314,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219649,7.1314,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219649,7.1314,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219649,7.1314,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219649,7.1314,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219649,7.1314,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219649,7.1314,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219649,7.1314,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
def maplos(x):
    if x < 1.0: # short stay
        return 0
    elif x < 200.0: # long stay
        return 1

df['LOS'] = df['LOS'].map(lambda x: maplos(x))

In [None]:
# shuffle

## pandas can shuffle to by index, but sentdex says thats ugly
df = sklearn.utils.shuffle(df)

df.head()

In [9]:
y = df[target].values
X = df.drop(target, axis=1).values

In [None]:
pd.DataFrame(X)

In [None]:
%%time
# 17,38 min
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
y = y.astype('int')