# Stacking

Training a classifier on the TADPOLE D1_D2 dataset using the `Stacking` ensemble method.

### Author
Ajinkya Indulkar

In [1]:
# import libraries
import warnings

warnings.filterwarnings(action='ignore')

# data loading
import pandas as pd
import numpy as np

# data pre-processing
from sklearn.preprocessing import LabelEncoder
from pyts.preprocessing import InterpolationImputer

# model training
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# set seed (for reproducibility)
np.random.seed(43)

### Helper Functions

In [2]:
def _parse_biomarker(value):
    """
    @description: map blank or censored ("<x" / ">x") text readings to NaN
    and pass every other value through unchanged
    @arguments:
        value (object) - raw cell value (string, number or NaN)
    @returns:
        value (object) - NaN for unusable readings, otherwise the input
    """
    # only strings can be blank or carry a censoring marker; numbers and NaN
    # must be passed through untouched ('<' in <float> would raise TypeError)
    if isinstance(value, str) and (value.strip() == "" or "<" in value or ">" in value):
        return float("nan")
    return value


def clean_dataset(df):
    """
    @description: clean data (imputation by interpolation
    used to handle missing data). Keeps the selected columns, restricts the
    rows to baseline visits with a DX value, converts the biomarker columns
    to float, label-encodes "DX_bl"/"DX" and imputes the remaining gaps.
    The input dataframe is not modified.
    @arguments:
        df (pd.DataFrame) - raw dataset
    @returns:
        df (pd.DataFrame) - cleaned dataset
    """
    # list of columns for training (starting point)
    getting_started = ["RID", "VISCODE", "DX_bl", "DX", "ADAS13", "Ventricles", "CDRSB", "ADAS11", "MMSE",
                       "RAVLT_immediate", "Hippocampus", "WholeBrain", "Entorhinal", "MidTemp", "FDG", "AV45",
                       "ABETA_UPENNBIOMK9_04_19_17", "TAU_UPENNBIOMK9_04_19_17", "PTAU_UPENNBIOMK9_04_19_17", "APOE4",
                       "AGE"]
    # columns stored as text that may hold blanks or "<x" / ">x" readings
    err_cols = ["ABETA_UPENNBIOMK9_04_19_17", "TAU_UPENNBIOMK9_04_19_17", "PTAU_UPENNBIOMK9_04_19_17"]
    # identifier / target columns that are excluded from imputation
    skip_cols = ["RID", "DX_bl"]

    # filter dataframe based on list of columns selected
    df = df[getting_started]

    # filter out only baseline visits
    df = df[df["VISCODE"] == "bl"].drop(columns="VISCODE")

    # remove rows with missing DX values
    df = df.dropna(subset=["DX"]).reset_index(drop=True)

    # turn the biomarker columns into floats; unusable readings become NaN
    for c in err_cols:
        df[c] = df[c].map(_parse_biomarker).astype(float)

    # convert "DX_bl" and "DX" to categorical values (integer codes)
    df[["DX_bl", "DX"]] = df[["DX_bl", "DX"]].apply(LabelEncoder().fit_transform)

    # handle missing data (imputation by interpolation)
    # NOTE(review): pyts' InterpolationImputer targets time series
    # (rows = samples, columns = timestamps), so gaps are presumably filled
    # from neighbouring columns of the same row, and the encoded "DX" column
    # is part of that matrix - confirm this is the intended behaviour.
    impcols = [c for c in df.columns if c not in skip_cols]
    features = InterpolationImputer().fit_transform(df[impcols].to_numpy())
    # write back with explicit labels instead of relying on positional pairing
    df[impcols] = pd.DataFrame(features, columns=impcols, index=df.index)

    return df

### Data Loading + Pre-processing

In [3]:
# folder holding the TADPOLE challenge CSV files; the trailing slash matters
# because the file name is appended to it by plain string concatenation
ROOT = "./tadpole_challenge/"

In [4]:
# load csv
# (the raw D1_D2 table is read as-is; column selection and row filtering
# happen later in clean_dataset)
d1_d2_df = pd.read_csv(ROOT+"TADPOLE_D1_D2.csv")

In [5]:
# clean dataset
# (baseline visits only, diagnosis columns label-encoded, gaps imputed)
train_df = clean_dataset(d1_d2_df)

### Split data into Train and Test datasets

In [6]:
# features: every cleaned column except the subject id and the target
# NOTE(review): X still contains the label-encoded "DX" column while the
# target is "DX_bl" on baseline visits - if "DX" is the diagnosis recorded at
# that same visit this leaks the label into the features; confirm.
X = train_df.drop(columns=['RID', 'DX_bl']).to_numpy()
# target: label-encoded baseline diagnosis
y = train_df['DX_bl'].to_numpy()

In [7]:
# hold out 20% of the rows for evaluation; random_state is fixed so the
# split is the same on every run (the split is not stratified by class)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

### Model Training

#### Train a Decision Tree model (base learner)

In [8]:
# reference model: a single decision tree with default hyper-parameters.
# No random_state is passed, so repeatability relies on the np.random.seed
# call in the import cell (scikit-learn falls back to NumPy's global RNG).
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [9]:
# accuracy of the lone decision tree on the held-out test split
print('DT Classifier accuracy:', dtc.score(X_test, y_test))

DT Classifier accuracy: 0.7890173410404624


#### Train a Stacking model with DT and a linear SVM as base learners (logistic regression as meta-learner)

In [10]:
# base learners of the stack as (label, estimator) pairs: a decision tree on
# the raw features and a linear SVM fed standardized features.
# 'svr' is only the label of that pipeline; the model itself is a classifier.
estimators = [
    ('dt', DecisionTreeClassifier()),
    ('svr', make_pipeline(StandardScaler(),
                              LinearSVC(random_state=42)))]

In [11]:
# stack the base learners under a logistic-regression final estimator;
# cv is left at its default and verbose=2 makes fit() print its progress
stk_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), verbose=2)

In [12]:
# fit the base learners and the final estimator on the training split
stk_clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.7s finished


StackingClassifier(cv=None,
                   estimators=[('dt',
                                DecisionTreeClassifier(ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       presort='deprecated',
                     

In [13]:
# accuracy of the stacked model on the same held-out test split
print('Stacking Classifier accuracy:', stk_clf.score(X_test, y_test))

Stacking Classifier accuracy: 0.8208092485549133
