In [None]:
import pandas as pd
import sklearn.utils
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Load scikit-learn's bundled breast-cancer dataset; later cells read it by
# key (cancer['data'], cancer['target']).
cancer = load_breast_cancer()

## Build the dataset manually from the CSV file

In [None]:
# Alternative data source: the CSV export instead of the scikit-learn loader.
# Only the header row is read first, so that the last column can be left out
# when the full file is loaded.
csv_path = 'breast-cancer-wisconsin-data.csv'
columns = pd.read_csv(csv_path, nrows=1).columns.tolist()
cols_to_use = columns[:-1]
df = pd.read_csv(csv_path, usecols=cols_to_use, delimiter=',')
df

In [None]:
# Promote the 'id' column to the row index (rebinding df rather than
# mutating it in place).
df = df.set_index('id')

In [None]:
# Replace the diagnosis labels with the integer codes pandas assigns when the
# column is treated as a categorical, then show the frame.
diagnosis_codes = df['diagnosis'].astype('category').cat.codes
df['diagnosis'] = diagnosis_codes
df

In [None]:
# Every column after the first is handed to the one-hot encoding loop in the
# next cell.
# NOTE(review): if these columns hold continuous measurements, one-hot
# encoding them yields one indicator column per distinct value — confirm
# this is intended.
cat_values =df.columns[1:]

In [None]:
# One-hot encode each listed column: append its indicator columns to the
# frame, then remove the source column.
for col in cat_values:
    indicators = pd.get_dummies(df[col], prefix=col + "_")
    df = df.join(indicators).drop(columns=col)

In [None]:
# Preview the first rows of the frame after encoding.
df.head()

In [None]:
# Separate the feature matrix from the target vector as plain arrays.
X = df.drop(columns='diagnosis').values
y = df['diagnosis'].values

## Use the dataset bundled with scikit-learn

In [None]:
# List the entries available on the loaded dataset object.
cancer.keys()

In [None]:
# The full dataset description can be printed with:
# print(cancer['DESCR'])
# Expected shape: 569 data points with 30 features.
cancer['data'].shape

In [None]:
# Take features and labels from the bundled dataset. This rebinds X and y,
# replacing whatever an earlier cell stored under those names.
X, y = cancer['data'], cancer['target']

In [None]:
import pandas as pd
# Wrap the label array in a DataFrame so it is displayed as a table.
pd.DataFrame(y)

In [None]:
# stratify=y keeps the class proportions of y in both splits: if y is 25%
# zeros and 75% ones, the train and test parts each keep that 25/75 ratio.
# random_state pins the shuffle so that Restart & Run All reproduces the
# same split (and therefore comparable metrics) on every run.

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# Learn the scaling statistics from the training split only; the test split
# is not looked at here.
scaler = StandardScaler().fit(X_train)
scaler

In [None]:
# Scale both splits with the scaler fitted on the training data.
# X_train / X_test are rebound to their scaled versions, so re-running this
# cell on its own would scale data that has already been scaled.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Three hidden layers of 30 units each; verbose=1 prints training progress.
# random_state fixes the weight initialisation and batch sampling so the
# trained model (and the reported metrics) are reproducible across runs.
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30), verbose=1, random_state=42)

In [None]:
# Train the network on the training split.
mlp.fit(X_train,y_train)

In [None]:
# Predict class labels for the held-out test split.
predictions = mlp.predict(X_test)

In [None]:
# Confusion matrix of true labels (first argument) against predictions
# (second argument) on the test split.
print(confusion_matrix(y_test,predictions))

In [None]:
# Text summary of the per-class classification metrics on the test split.
print(classification_report(y_test,predictions))

In [None]:
# NOTE(review): `mglearn` is not imported in any visible cell, and this line
# only references an attribute without calling anything — complete the
# plotting call or remove the cell.
mglearn.plots.plot

In [None]:
import pandas as pd

In [None]:
# Load the diagnosis rows and attach them to the ICU stays (left join on
# HADM_ID, so every ICU stay row is kept before dropna()).
# NOTE(review): the two text columns are cast to str before dropna(); missing
# values in them may become the literal text 'nan' and then no longer count
# as missing — confirm this is intended.
icu_columns = ['HADM_ID', 'LOS', 'ICUSTAY_ID']
text_columns = ['DISEASES AND INJURIES', 'SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS']

df = pd.read_csv('mimic-iii-clinical-database-1.4/DIAG_ROWS.csv')
df_loc = pd.read_csv('mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz')[icu_columns]

df = df_loc.merge(df, on='HADM_ID', how='left')
df[text_columns] = df[text_columns].astype('str')
df = df.drop(columns='HADM_ID').dropna()

df

In [None]:
# Column the model should predict.
target = 'LOS'

# Categorical columns that are one-hot encoded in a later cell.
cat_values = [
    'DISEASES AND INJURIES',
    'SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS',
    'SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING',
    'SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES',
]

In [None]:
# Load the rows from ENC.csv and attach them to the ICU stays (left join on
# HADM_ID); the join key is dropped afterwards, as is any row with a missing
# value.
icu_columns = ['HADM_ID', 'LOS', 'ICUSTAY_ID']

df = pd.read_csv('ENC.csv')
df_loc = pd.read_csv('mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz')[icu_columns]

df = df_loc.merge(df, on='HADM_ID', how='left')
df = df.drop(columns='HADM_ID').dropna()

df

In [None]:
# Turn every categorical column into one-hot indicator columns.
# (Plain integer category codes would be the alternative; one-hot is used.)
for col in cat_values:
    indicators = pd.get_dummies(df[col], prefix=col + "_")
    df = df.join(indicators).drop(columns=col)

In [None]:
# Use the ICU stay identifier as the row index (rebinding df rather than
# mutating it in place). The categorical source columns were already removed
# by the encoding loop, so no extra drop is needed here.
df = df.set_index('ICUSTAY_ID')

In [None]:
# Show the frame after encoding and re-indexing.
df

In [None]:
def maplos(x, short_stay_threshold=1.0):
    """Map a length-of-stay value to a binary class label.

    Parameters
    ----------
    x : float
        Length of stay for one row (same unit as the 'LOS' column).
    short_stay_threshold : float, optional
        Stays strictly below this value are labelled short. Defaults to 1.0.

    Returns
    -------
    int or None
        0 for a short stay, 1 for a long stay, None when ``x`` is NaN.
    """
    # NaN is the only value that is unequal to itself; keep it "missing"
    # instead of silently assigning it to a class.
    if x != x:
        return None
    if x < short_stay_threshold:
        return 0
    # Everything else is a long stay. There is deliberately no upper bound:
    # with the former `elif x < 200.0` branch, longer stays matched neither
    # branch and returned None, which put NaN into the label column.
    return 1

# Replace each LOS value with the class label returned by maplos.
df['LOS'] = df['LOS'].map(maplos)

In [None]:
# Shuffle the rows.
# pandas can also shuffle by index, but scikit-learn's helper was preferred
# here for readability. random_state pins the permutation so the row order
# is the same on every Restart & Run All.
df = sklearn.utils.shuffle(df, random_state=42)

df.head()

In [None]:
# Separate the feature matrix from the target column as plain arrays.
X = df.drop(columns=target).values
y = df[target].values

In [None]:
# Wrap the feature array in a DataFrame so it is displayed as a table.
pd.DataFrame(X)

In [None]:
%%time
# 17,38 min
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Cast the label array to integers.
# NOTE(review): this cell comes after the train_test_split cell above, so
# y_train / y_test were produced from y before this cast and are not
# affected by it — move the cast before the split if it is needed.
y = y.astype('int')