In [None]:
"""
CS109a, Fall 2017
Utility functions for the final project.
"""
import pandas as pd
import numpy as np


def standardize(x_1, x_2, cont_cols):
    """
    Use this function to standardize our dataset(s).

    Each continuous column of x_2 is converted to a z-score using the mean
    and (sample) standard deviation of the same column in x_1. Neither input
    DataFrame is modified; a scaled copy of x_2 is returned.

    @param x_1: The DataFrame to be used for standardizing, i.e. the one the
                mean/std are calculated from. Typically this will be the
                training dataset.
    @param x_2: The DataFrame to standardize using the metrics calculated
                off of x_1. This will be either the test dataset or the
                training dataset.
    @param cont_cols: A list of the continuous columns (i.e. the columns
                      that we want to standardize).
    @return: A copy of x_2 whose cont_cols have been standardized; all other
             columns are left as they were.
    """

    # Continuous data that the scaling statistics are derived from
    reference = x_1[cont_cols]

    # Save the mean/std so the same values can be applied to x_2 below.
    col_mean = reference.mean()
    col_std = reference.std()

    # A column that is constant in x_1 has a std of 0, which would turn the
    # whole column into NaN/inf. Dividing by 1 instead simply centres it.
    col_std = col_std.where(col_std != 0, 1.0)

    # Work on a copy so the caller's DataFrame is never altered in place
    # (this also matters when x_1 and x_2 are the same object).
    scaled = x_2.copy()
    scaled[cont_cols] = (x_2[cont_cols] - col_mean) / col_std

    return scaled


def get_data():
    """
    Use this function to get the data needed for the project.

    Steps performed:
      1. Read the ADNIMERGE file and keep only the columns listed below.
      2. Merge in the response column "y" produced by get_response() (on RID).
      3. Drop the DX and Month columns, then drop every record with a NaN.
      4. Log transform Ventricles_bl.
      5. One-hot encode the categorical columns (first level dropped).

    RID and VISCODE are still present in the result; callers drop them once
    they are no longer needed.

    :return: Project data set
    """
    # The columns we want to consider
    cols = [
        # Roster ID
        'RID',

        # Visit Code
        'VISCODE',

        # Gender
        'PTGENDER',

        # APOE 4 Gene
        'APOE4',

        # Race
        'PTRACCAT',

        # Age at Baseline
        'AGE',

        # Brain Scan Stuff
        'Hippocampus_bl',
        'Ventricles_bl',
        'WholeBrain_bl',
        'Entorhinal_bl',
        'Fusiform_bl',
        'MidTemp_bl',
        'ICV_bl',

        # Diagnosis
        'DX',

        # Month since baseline
        'Month',
    ]

    categorical_cols = ['PTGENDER', 'APOE4', 'PTRACCAT']

    # Read the ADNIMERGE file. The file name is spelled exactly as in
    # get_response() so that both reads resolve to the same file on
    # case-sensitive file systems.
    df_am = pd.read_csv('data/ADNIMERGE.csv')
    df_am = df_am[cols]

    # Merge in the response variable
    # NOTE(review): the file appears to hold one row per visit (VISCODE), so
    # this merge keeps every visit of a patient, each carrying the same "y".
    # Confirm that several rows per RID are intended.
    df_y = get_response()
    df_am = df_am.merge(df_y, on="RID")

    # Drop the DX column ("y" is the response col now) and also the Month.
    # This happens before dropna(), so a missing DX/Month alone does not
    # remove a record.
    df_am = df_am.drop(["DX", "Month"], axis=1)

    # Drop records with NaN
    df_am = df_am.dropna()

    # From our EDA we know that we need to log transform the
    # Ventricles measurement.
    df_am['Ventricles_bl'] = np.log(df_am['Ventricles_bl'])

    # One-Hot Encode data
    df_am = pd.get_dummies(df_am,
                           prefix=categorical_cols,
                           columns=categorical_cols,
                           drop_first=True)

    return df_am


def get_response():
    """
    return dataframe with cols: [RID, y]
    y is the diagnosis (DX) mapped to:
    # 0 = CN
    # 1 = MCI
    # 2 = Dementia
    At the time of their last visit determined by max(Month)

    Rows without a DX, with a DX other than the three labels above, or
    without a Month are ignored. The result has one row per remaining RID,
    ordered by RID.
    """
    df_am = pd.read_csv("./data/ADNIMERGE.csv")
    # there are some rows with no "DX" let's drop those
    df_am = df_am[df_am.DX.notnull()]
    # let's assign them each a number (the inner merge also discards any
    # DX value that is not listed here)
    df_response = pd.DataFrame([[0, "CN"], [1, "MCI"], [2, "Dementia"]], columns=["y", "DX"])
    df_am = df_am.merge(df_response, on="DX")
    # only need a few columns for now
    df_am = df_am[["RID", "Month", "y"]]
    # a row without a Month can never be identified as the last visit;
    # removing it up front keeps patients that only have such rows from
    # breaking the lookup below
    df_am = df_am.dropna(subset=["Month"])

    # for every patient, locate the row holding the largest Month
    # (idxmax returns the first such row if the maximum occurs twice)
    last_visit_idx = df_am.groupby("RID")["Month"].idxmax()

    df_mm = df_am.loc[last_visit_idx, ["RID", "y"]].reset_index(drop=True)

    return df_mm

def test_train_split():
    """
    Load the project data and split it into standardized train/test sets.

    The data from get_data() is stripped of RID and VISCODE, shuffled with a
    fixed seed, and split 80/20. The continuous columns of both parts are
    then standardized with the mean/std of the training part.

    :return: tuple (X_train, y_train, X_test, y_test); the X values are
             DataFrames, the y values are 1-D arrays of the 'y' column
    """
    data = get_data()

    label = 'y'

    continuous_cols = ['AGE',
                       'Hippocampus_bl',
                       'Ventricles_bl',
                       'WholeBrain_bl',
                       'Entorhinal_bl',
                       'Fusiform_bl',
                       'MidTemp_bl',
                       'ICV_bl']

    # We don't need RID or VISCODE anymore
    data_clean = data.drop(['RID', 'VISCODE'], axis=1)

    # Prior to splitting, do a quick randomizing shuffle
    # (fixed random_state so the split is reproducible)
    data_clean = data_clean.sample(frac=1, random_state=17).reset_index(drop=True)

    # Index for test/train split: first 80% of rows train, the rest test.
    idx = int(data_clean.shape[0]*.8)

    # Split the data into training and test data sets.
    data_train = data_clean.iloc[:idx]
    data_test = data_clean.iloc[idx:]

    # Create our test/training DataFrames
    X_train = data_train.drop(label, axis=1)
    y_train = data_train[label].values

    X_test = data_test.drop(label, axis=1)
    y_test = data_test[label].values

    # Standardize the data. The test set goes first: its scaling must be
    # based on the raw training values, so X_train may only be rescaled
    # after it has served as the reference.
    X_test = standardize(X_train, X_test, continuous_cols)
    X_train = standardize(X_train, X_train, continuous_cols)

    return X_train, y_train, X_test, y_test