In [5]:
import os
import sys
sys.path.insert(0, "..")
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from scipy import stats
from IPython.display import display
from sklearn.model_selection import GridSearchCV
from evaluation import *
from sklearn.metrics import *


from paths import *

## Data
Here is the initial data, pulled from the original csv.

In [6]:
display(pd.read_csv(data_dir / "stroke_all.csv"))

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
43395,56196,Female,10.0,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,5450,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,28375,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,27973,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


## Split the data

We split the data into training and test sets. We save these in "stroke_train .csv" and "stroke_test.csv".

In [7]:
def split_train_test(data_file, train_frac=0.66, random_seed=1):
    # set the random seed
    np.random.seed(random_seed)

    # read the data
    data = pd.read_csv(data_file)

    # select instances for the train set
    n_all = data.shape[0]
    mask = np.random.rand(n_all) < train_frac

    # filter the data to train and test set
    train = data.iloc[mask]
    test = data.iloc[~mask]

    return train, test

In [8]:
def generate_train_test():
    data_file = data_dir / "stroke_all.csv"
    train_file = data_dir / "stroke_train.csv"
    test_file = data_dir / "stroke_test.csv"

    # split data into train and test
    train, test = split_train_test(data_file)

    # save the split data
    train.to_csv(train_file, index=False)
    test.to_csv(test_file, index=False)

In [9]:
generate_train_test()

## Imputation

Here we have code to impute a column (with nans) using the rest of the columns (with no nans) with the following methods:
#### Categorical
- mode
- logistic

#### Numerical
- linear 
- mean
- zero

## Normalization
We also have two normalization techniques that we used.
- mean-variance
- min-max


In [10]:
def impute(X, target, method="linear"):
    """
    Imputes target feature (with missing values) using all other features (no missing values) using the selected method

    :param X: (n x p-1 numpy array) features to use for imputation (no missing vals)
    :param target: (n x 1 numpy array) feature to impute
    :param method: (str) One of "linear", "logistic", "mean", "zero", "mode"
    :return: (n x 1 numpy array) target array with all values imputed
    """
    nan_values = pd.isnull(target)

    if method == "zero":
        target[nan_values] = 0

    elif method == "mean":
        target[nan_values] = np.mean(target[~nan_values])

    elif method == 'mode':
        target[nan_values] = stats.mode(target[~nan_values])[0]

    elif method == "linear":
        regression = LinearRegression()
        regression.fit(X[~nan_values], target[~nan_values])
        target[nan_values] = regression.predict(X[nan_values])

    elif method == "logistic":
        regression = LogisticRegression(multi_class='multinomial',solver='newton-cg')
        regression.fit(X[~nan_values], target[~nan_values])
        target[nan_values] = regression.predict(X[nan_values])

    else:
        raise ValueError('Please choose one of "linear", "logistic", "mean", "zero" for choice of method')

    return target


def normalize(X, method="meanvar"):
    """
    Normalizes the features of X. If "meanvar", makes mean 0 and variance 1. If "minmax" makes min 0 and max 1.

    :param X: (n x p numpy array) features to normalize (no missing vals)
    :param method: (str) One of "meanvar", "minmax"
    :return:  (n x p) numpy array with normalized features
    """

    if method == "meanvar":
        normalized_X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

    elif method == "minmax":
        normalized_X = (X - np.amin(X, axis=0))/np.amax(X, axis=0)

    else:
        raise ValueError('Please choose one of "meanvar", "minmax" for choice of method')

    return normalized_X

## Preprocessing
Here we preprocess the data. This includes the following steps:
- One-hot-encoding all the catageorical data with no nans
- Collecting the one-hot-encoded and numerical columns and using them to impute BMI (numerical) with linear regression and smoking staus (categorical) with logistic regression
- One-hot-encode smoking status
- Collecting all the columns and applying a mean-variance normalization

In [11]:
def preprocessing(X, normalization="meanvar", numerical_imputation='linear',
                      categorical_imputation='logistic'):
    """
    :param X: (n x p numpy array)
    :param normalization: (str) One of "meanvar", "minmax"
    :param numerical_imputation: (str) One of "linear", "mean", "zero"
    :param categorical_imputation: (str) One of "mode", "logistic"
    :return:
    """

    numerical_columns = ['age','hypertension','heart_disease','avg_glucose_level']

    X_to_enc = X.drop(numerical_columns + ['bmi','smoking_status'], axis=1)

    enc = OneHotEncoder(sparse=False)
    X_out = enc.fit_transform(X_to_enc)

    X_out = np.append(X_out, X[numerical_columns], axis=1)

    bmi = impute(X_out, X['bmi'], method=numerical_imputation).to_numpy().reshape(-1,1)

    smoking_raw = impute(X_out, X['smoking_status'], method=categorical_imputation).to_numpy().reshape(-1,1)

    enc_smoking = OneHotEncoder(sparse=False)
    smoking = enc_smoking.fit_transform(smoking_raw)

    X_out = np.append(X_out, bmi, axis=1)
    X_out = np.append(X_out, smoking, axis=1)

    return normalize(X_out, method=normalization)


def get_preprocessed_data(normalization="meanvar", numerical_imputation='linear',
                      categorical_imputation='logistic', write=False, datafile="stroke_all.csv"):
    """
    :param normalization: (str) One of "meanvar", "minmax"
    :param numerical_imputation: (str) One of "linear", "mean", "zero"
    :param categorical_imputation: (str) One of "mode", "logistic"
    :param write: (Bool) Whether to write to a file
    :param datafile: (str) file name to retrieve data
    :return:
    """

    data = pd.read_csv(data_dir / datafile)
    
    if 'id' in data.columns:
        data = data.drop('id', axis=1)

    X, y = data.iloc[:, 1:-1], data.iloc[:, -1].to_numpy().reshape(-1,1)
    X = preprocessing(X, normalization=normalization, numerical_imputation=numerical_imputation,
                      categorical_imputation=categorical_imputation)

    if write:
        outfile = str(datafile)[:-4] + '_preprocessed.csv'
        pd.DataFrame(np.append(X, y, axis=1)).to_csv(data_dir / outfile)

    return(X, y)

In [12]:
X, y = get_preprocessed_data(datafile="stroke_all.csv")
X_train, y_train = get_preprocessed_data(datafile="stroke_train.csv")
X_test, y_test = get_preprocessed_data(datafile="stroke_test.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

## Preprocessed Data
Here is a display of the full dataset after preprocessing.

In [13]:
display(pd.DataFrame(np.append(X,y,axis=1)))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1.344203,-1.344203,-0.378561,-0.063993,-1.156549,-0.430773,2.459681,1.002584,-1.002584,-1.741517,-0.321296,-0.223342,-0.217176,-1.390766,-0.467021,0.702212,-0.422056,0.0
1,-0.743935,0.743935,-0.378561,-0.063993,0.864641,-0.430773,-0.406557,-0.997423,0.997423,0.700823,3.112398,-0.223342,-0.383258,1.373681,-0.467021,0.702212,-0.422056,0.0
2,1.344203,-1.344203,-0.378561,-0.063993,0.864641,-0.430773,-0.406557,-0.997423,0.997423,-1.519486,-0.321296,-0.223342,0.148621,-1.442925,-0.467021,0.702212,-0.422056,0.0
3,-0.743935,0.743935,-0.378561,-0.063993,0.864641,-0.430773,-0.406557,1.002584,-1.002584,1.233697,-0.321296,-0.223342,-0.822123,0.943366,2.141230,-1.424071,-0.422056,0.0
4,1.344203,-1.344203,-0.378561,15.626828,-1.156549,-0.430773,-0.406557,1.002584,-1.002584,-1.253049,-0.321296,-0.223342,1.317458,-1.247328,-0.467021,0.702212,-0.422056,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43395,1.344203,-1.344203,-0.378561,-0.063993,-1.156549,-0.430773,2.459681,-0.997423,0.997423,-1.430674,-0.321296,-0.223342,-1.063359,-1.077810,-0.467021,0.702212,-0.422056,0.0
43396,-0.743935,0.743935,2.641579,-0.063993,-1.156549,-0.430773,-0.406557,-0.997423,0.997423,0.612011,-0.321296,-0.223342,2.531294,3.486136,2.141230,-1.424071,-0.422056,0.0
43397,-0.743935,0.743935,-0.378561,-0.063993,0.864641,-0.430773,-0.406557,-0.997423,0.997423,1.766571,3.112398,-0.223342,-0.290939,0.030577,2.141230,-1.424071,-0.422056,0.0
43398,-0.743935,0.743935,-0.378561,-0.063993,0.864641,-0.430773,-0.406557,-0.997423,0.997423,-0.098488,-0.321296,-0.223342,-0.123465,0.591291,-0.467021,0.702212,-0.422056,0.0


## Analysis

An analysis of how a Logistic Regression Classifier (with CV hyperparameter search) performs using each combination of preprocessing techniques. A chart with the resultant metrics can be found below.

In [14]:
def test_preprocessing():
    results = {}

    for normalization in ["meanvar", "minmax"]:
        for numerical_imputation in ['linear', 'mean', 'zero']:
            for categorical_imputation in ['mode', 'logistic']:
                X_train, y_train = get_preprocessed_data(normalization=normalization, numerical_imputation=numerical_imputation,
                                      categorical_imputation=categorical_imputation, datafile="stroke_train.csv")

                X_test, y_test = get_preprocessed_data(normalization=normalization, numerical_imputation=numerical_imputation,
                                      categorical_imputation=categorical_imputation, datafile="stroke_test.csv")

                cross_validation_args = {"verbose": 1,  # print to console
                                         "n_jobs": -1,  # use parallelization
                                         "cv": 3,
                                         # number CV folds (needs to be small to compute exhaustive hyperparameter search)
                                         "scoring": make_scorer(roc_auc_score, needs_proba=True),
                                         # choose best hyperparams using AUC
                                         }

                #### Logistic Regression
                lr = GridSearchCV(LogisticRegression(),
                                  param_grid={"max_iter": [500],
                                              "penalty": ['elasticnet'],
                                              "solver": ["saga"],
                                              "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
                                              "class_weight": [{1: 5}, {1: 10}, {1: 20}, {1: 50}, {1: 100}]},
                                  **cross_validation_args)
                lr.fit(X_train, y_train)
                y_pred = lr.predict(X_test)
                results[(normalization , numerical_imputation,categorical_imputation)] = evaluate(y_test, y_pred, pos_label=1)
    return results

In [None]:
results = test_preprocessing()
display(pd.DataFrame(results))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   32.3s finished
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   31.6s finished
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/ind

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   42.4s finished
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/ind

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   41.0s finished
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   43.7s finished
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   40.2s finished
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[Parallel(n_jo

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.3s
