In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Read in the data

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Silence FutureWarning and ConvergenceWarning messages so the cell outputs stay readable
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)

# Load the dataset from the Kaggle input directory and preview the first rows
diabetes_df = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')
diabetes_df.head()

## 2. Explore and visualize the data

In [None]:
# Column dtypes and non-null counts
diabetes_df.info()

In [None]:
# Number of explicit NaN values per column
diabetes_df.isna().sum()

In [None]:
# Summary statistics per column
diabetes_df.describe()

In [None]:
# Number of fully duplicated rows
diabetes_df.duplicated().sum()

- Many columns contain 0 values; these are most likely missing (NaN) values and should be replaced
- Pregnancies and Outcome with values of zero should be kept

In [None]:
# Zeros are kept only for Pregnancies and Outcome; in every other column they mark a missing value
replace_cols = [name for name in diabetes_df.columns if name not in ('Outcome', 'Pregnancies')]

diabetes_df[replace_cols] = diabetes_df[replace_cols].replace(0, np.nan)

# missing values per column after the replacement
diabetes_df.isna().sum()

In [None]:
# create a version of the df with no NaN for testing use (**not to be used for real training**)
# just used to test pipeline functions on some non NaN data
testing_df = diabetes_df.dropna()
# pop removes 'Outcome' from testing_df and returns it as the matching target series
testing_y = testing_df.pop('Outcome')
# NOTE: only the last expression of a cell is rendered, so this NaN check produces no visible output
testing_df.isna().sum()
testing_y

- Split up the data in order to avoid leakage
- Only use df_train and y_train for training the model
- The testing data should not be looked at so it will be left aside until the final model is created (to test the final model)

In [None]:
from sklearn.model_selection import train_test_split
# Separate the target from the features. NOTE: pop() removes 'Outcome' from diabetes_df in place,
# so re-running this cell without reloading the data fails because the column is already gone.
y_outcome = diabetes_df.pop('Outcome')
# hold out 20% of the rows, stratified on the outcome, with a fixed seed
df_train, df_test, y_train, y_test = train_test_split(diabetes_df, y_outcome, test_size=0.2, stratify=y_outcome, random_state=8)

In [None]:
# Histogram of the training outcome values
y_train.hist()

- Evident from the graph, the training data contains more non diabetics than diabetics

In [None]:
# Histogram of every training feature
df_train.hist(figsize=(10, 10))

- Factors that are commonly known to be associated with diabetes occur in adults who have type 2 diabetes
- Type 1 diabetes may occur in kids and the causes are still unknown
- Adults with a certain combinations of factors (like obesity or family history) are more likely to develop T2D
- Family history also affects T1D
- The diabetes pedigree function takes family history into account

In [None]:
import seaborn as sns
from typing import Tuple
def kde_with_log(x: str, df: pd.DataFrame = df_train) -> Tuple["plt.Axes", "plt.Axes"]:
    """Plot the KDE of a feature next to the KDE of its log-transformed values.

    ``matplotlib.pyplot`` must already be imported as ``plt`` when this function is called.

    Args:
        x: Name of the column to plot.
        df: Dataframe that holds the column (defaults to the training data).

    Returns:
        A tuple with the axes of the raw plot and of the logged plot.
    """
    _, axs = plt.subplots(1, 2, figsize=(8, 4))
    raw_ax = sns.kdeplot(data=df, x=x, ax=axs[0])
    raw_ax.set_title(f'{x} Frequency')

    values = df[x]
    # the plain log is only defined for strictly positive values;
    # fall back to the shifted log (log1p) as soon as the column contains a zero
    log_x = np.log(values) if values.min() > 0 else np.log1p(values)
    log_ax = sns.kdeplot(x=log_x, ax=axs[1])
    log_ax.set_title(f'Logged {x} Frequency')

    return (raw_ax, log_ax)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# check all logged relationships to see which right skewed distributions may be more informative with a log transformation
# (one two-panel figure is drawn per training column)
for col in df_train.columns:
    ax1, ax2 = kde_with_log(col)

In [None]:
def numerical_v_outcome(x, y, ax, graph_type='violin', title=None, df=df_train):
    """Draw one feature against the outcome as a violin plot or a box plot.

    Args:
        x: Values for the x axis (the outcome). Pass None to plot the feature on its own.
        y: Either the name of a column in ``df`` or the values to plot directly.
        ax: Matplotlib axes to draw on.
        graph_type: 'violin' draws a violin plot; any other value draws a box plot.
        title: Title for the axes. Defaults to '<feature name> vs Outcome'.
        df: Dataframe used to look up ``y`` when it is a column name.

    Returns:
        The axes the plot was drawn on.
    """
    if isinstance(y, str):
        feature_name = y
        feature_values = df[y]
    else:
        # use the series name for the default title instead of the repr of the whole series
        feature_name = getattr(y, 'name', y)
        feature_values = y

    if title is None:
        title = f'{feature_name} vs Outcome'

    plot = sns.violinplot if graph_type == "violin" else sns.boxplot
    ax = plot(x=x, y=feature_values, ax=ax)
    ax.set_title(title)
    return ax

In [None]:
# plot various factors vs the outcome with a violin plot to see the frequency of distribution
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Violin plots: Outcome vs Features')

# the grid is filled column by column: walk down a column before moving to the next one
grid_axes = axes.T.ravel()
for feature, ax in zip(df_train.columns, grid_axes):
    numerical_v_outcome(y_train, feature, ax)
# remove the panel that follows the last plotted feature
fig.delaxes(grid_axes[len(df_train.columns)])

In [None]:
# plot various factors vs the outcome showing the points to spot outliers
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Boxplots: Outcome vs Features')

# the grid is filled column by column: walk down a column before moving to the next one
grid_axes = axes.T.ravel()
for feature, ax in zip(df_train.columns, grid_axes):
    numerical_v_outcome(y_train, feature, ax, graph_type='boxplot')
# remove the panel that follows the last plotted feature
fig.delaxes(grid_axes[len(df_train.columns)])

In [None]:
# plot various logged factors vs the outcome to show how normalizing the data works
# note, some of these graphs are repeated from the large graphs before
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Boxplots: Outcome vs Logged Features')

# the grid is filled column by column: walk down a column before moving to the next one
grid_axes = axes.T.ravel()
for feature, ax in zip(df_train.columns, grid_axes):
    values = df_train[feature]
    # the plain log needs strictly positive values; use the shifted log (log1p) otherwise
    logged = np.log(values) if values.min() > 0 else np.log1p(values)
    numerical_v_outcome(y_train, logged, ax, graph_type='boxplot', title=f'Log {feature} vs Outcome Boxplot')
# remove the panel that follows the last plotted feature
fig.delaxes(grid_axes[len(df_train.columns)])

- By comparing the graphs, it is evident that blood glucose, age, and BMI mark visible differences
- The diabetes pedigree function is surprisingly not as useful as I would have expected it to be
- 3 features seemed to be normalized better using a log transformation: consider replacing the features by the logged versions. The 3 features were: DiabetesPedigreeFunction, Insulin, and Age. The Pregnancies feature is also well normalized with a log transformation. 
- Many of the graphs also have outliers in the upper ranges so it would be good to scale those values into more normal ranges to not overfit the model

In [None]:
# explore insulin vs bmi: scatter plot of the two training features against each other
ax = sns.scatterplot(x=df_train['Insulin'], y=df_train['BMI'])

In [None]:
# plot each feature on its own (no outcome split) to spot outliers in the overall distribution
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Boxplots: Feature Distributions')

# the grid is filled column by column: walk down a column before moving to the next one
grid_axes = axes.T.ravel()
for feature, ax in zip(df_train.columns, grid_axes):
    # x=None draws a single box for the whole feature
    numerical_v_outcome(None, feature, ax, graph_type='boxplot', title=f'{feature}')
# remove the panel that follows the last plotted feature
fig.delaxes(grid_axes[len(df_train.columns)])

## 3. Basic model
- The data has NaN values and has not been split up
- Create a basic pipeline which can deal with imputing and applies a DecisionTreeClassifier as the baseline model
- Use cross validation on the classifier to ensure accurate scores; call a scoring function with the initial train data and basic model to get a baseline score

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that keeps a reference to the data it receives."""

    def fit(self, X, y=None, **fit_params):
        # nothing is learned; the step only exists to expose intermediate data
        return self

    def transform(self, X):
        # store the incoming data as ``X_`` so it can be inspected once the pipeline has run
        self.X_ = X
        return self.X_

In [None]:
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier

# Baseline pipeline: median imputation, a pass-through Debug step that exposes the
# imputed data, and a decision tree classifier with a fixed seed
basic_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('clean_data', Debug()),
    ('model', DecisionTreeClassifier(random_state=0))
])
basic_pipeline

In [None]:
def get_median_imputed_data(df):
    """Return a copy of ``df`` in which missing values are replaced by the column median.

    Args:
        df: Dataframe with numeric columns that may contain NaN.

    Returns:
        A new dataframe with the same columns and the same index as ``df``.
    """
    median_imputer = SimpleImputer(strategy='median')
    imputed_values = median_imputer.fit_transform(df)
    # keep the caller's row labels so the result still lines up with the matching target series
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

## 4. Baseline Scoring
- With data now being clean, make a function which can score a model based on particular data
- Call this function with the initial train data and basic model to get a baseline score

In [None]:
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, cross_validate
from typing import Dict


def score_pipeline_model(X: pd.DataFrame, y: pd.Series, pipeline: Pipeline, tuning: bool = False) -> Dict[str, float]:
    """Score a pipeline with 5-fold cross validation.

    Args:
        X: Feature dataframe.
        y: Target series.
        pipeline: The pipeline to evaluate.
        tuning: When True (hyperparameter tuning) only the F1 score is computed.

    Returns:
        When ``tuning`` is False, a dict that maps 'test_accuracy', 'test_precision',
        'test_recall', 'test_f1' and 'test_roc_auc' to the mean score across the folds.
        When ``tuning`` is True, the mean F1 score as a single float.
    """
    if tuning:
        return cross_val_score(pipeline, X, y, scoring='f1', cv=5).mean()

    # request every metric in one cross_validate call to avoid refitting once per metric
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    cv_results = cross_validate(pipeline, X, y, scoring=metrics, cv=5)

    # cross_validate also reports timings; keep only the per-metric test scores
    return {key: values.mean() for key, values in cv_results.items() if key.startswith('test_')}

In [None]:
from sklearn.metrics import confusion_matrix

def get_preds(X: pd.DataFrame, y: pd.Series, pipeline: "Pipeline",
              X_test: pd.DataFrame = None, y_test: pd.Series = None) -> Tuple[pd.Series, np.ndarray]:
    """Fit a pipeline and create predictions

    Args:
        X: Either the entire dataframe or the training dataframe (if X_test is also passed)
        y: Either the entire outcomes or the training outcomes (if y_test is also passed)
        pipeline: The pipeline to fit the data on
        X_test: Df containing test data
        y_test: Series containing test outcomes

    Returns:
        A tuple containing the true outcomes and the predictions

    Raises:
        ValueError: If only one of X_test and y_test is provided
    """
    # compare with `is None`: `== None` on a DataFrame/Series is element-wise and has no single truth value
    if X_test is None and y_test is None:
        # no test data given: carve a stratified 20% hold-out set out of X and y
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=88)
    elif X_test is None or y_test is None:
        raise ValueError("X_test and y_test must be provided together")
    else:
        X_train, y_train = X, y

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)

    return (y_test, preds)

In [None]:
# Cross-validated baseline metrics for the basic pipeline on the training data
basic_pipeline_scoring = score_pipeline_model(df_train, y_train, basic_pipeline)
print(basic_pipeline_scoring)

In [None]:
# No test data is passed, so get_preds refits the pipeline on an internal split of the
# training data and predicts the held-out part
outcome, basic_predictions = get_preds(df_train, y_train, basic_pipeline)
print(classification_report(outcome, basic_predictions))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the baseline predictions, drawn as an annotated heatmap
cm = confusion_matrix(outcome, basic_predictions)
ax = sns.heatmap(cm, annot=True)

- Initial model has accuracy of about 71%
- Use feature engineering to improve the performance

## 5. Feature Engineering
- Apply feature engineering (to just training data for the initial testing but whole data set on full run)
- Apply Kmeans for grouping in clusters as well as distance to a certain cluster
- Apply PCA to explore loadings of Principal Components (PC)
- Given loadings of PC, apply mathematical transformations and interactions between data (ratios, sums, diff, etc)
- Consider applying log transformations to certain skewed features
- Make counts (XGB is a tree based model which cannot aggregate well across multiple columns). Do counts for above and below the respective medians
- Flag or move outliers into the interquartile range (IQR). Some data outside of this range may not be outliers - do not just remove extreme data.
- Create bins for numerical columns to avoid over fitting
- Make sure all transformations are in functions (so that function transformers can be applied to the pipeline)
- Make a pipeline containing the transformations and try out different combinations to see what gives the best results

In [None]:
from typing import List
def _set_up_kmeans(df: pd.DataFrame, features: List[int]) -> pd.DataFrame:
    """Private function to normalize data for kmeans
    
    Args:
        df: Current dataframe being used
        features: List of features for kmeans to be applied on
    
    Returns:
        Dataframe for specific features with scaled data
    """
    df_copy = df.copy()
    df_selected = df_copy.loc[:, features]
    df_scaled = (df_selected - df_selected.mean(axis=0)) / df_selected.std(axis=0)
    return df_scaled
    

In [None]:
from sklearn.cluster import KMeans

def kmeans_cluster(df: pd.DataFrame, n_clusters: int) -> pd.DataFrame:
    """Creates cluster features using Kmeans on certain features

    The clustering is fitted on whatever dataframe is passed in, every time the
    function is called.

    Args:
        df: Current dataframe being used (left unmodified)
        n_clusters: The number of clusters to create using Kmeans

    Returns:
        A copy of the dataframe with an additional 'Cluster' feature
    """
    features = ['BMI', 'Insulin', 'Glucose', 'Age', 'Pregnancies']
    X_scaled = _set_up_kmeans(df, features)

    k_means = KMeans(n_clusters, n_init=50, max_iter=1000, random_state=42)

    # work on a copy so the caller's dataframe does not silently gain a column
    df = df.copy()
    df['Cluster'] = k_means.fit_predict(X_scaled)
    return df

In [None]:
def kmeans_cluster_dist(df: pd.DataFrame, n_clusters: int) -> pd.DataFrame:
    """Add one feature per Kmeans cluster holding each row's value in cluster-distance space

    Args:
        df: Current dataframe being used
        n_clusters: The number of clusters to create using Kmeans

    Returns:
        A new dataframe made of ``df`` plus one 'centeroid_dist_<i>' column per cluster
    """
    scaled = _set_up_kmeans(df, ['BMI', 'Insulin', 'Glucose', 'Age', 'Pregnancies'])

    model = KMeans(n_clusters, n_init=50, max_iter=1000, random_state=41)
    distances = model.fit_transform(scaled)

    # wrap the distances in a dataframe that shares df's index, then attach it to df
    dist_cols = [f'centeroid_dist_{i}' for i in range(distances.shape[1])]
    return df.join(pd.DataFrame(distances, columns=dist_cols, index=df.index))

- Make a function to calculate MI scores
- Test on some data and use it on PCA to check usefulness of principal components (PCs)

In [None]:
from sklearn.feature_selection import mutual_info_regression
def get_mi_scores(df, y):
    """Compute mutual information scores between every feature and the target.

    Args:
        df: Feature dataframe (left unmodified).
        y: Target values.

    Returns:
        A series of scores indexed by feature name, sorted from highest to lowest.
    """
    # NOTE(review): the target used in this notebook is a class label; mutual_info_classif
    # may be the more appropriate estimator - confirm before switching.
    df = df.copy()
    # label encode categorical data
    for col in df.select_dtypes(['object', 'category']):
        df[col], _ = df[col].factorize()
    # integer-typed columns are treated as discrete features
    discrete = [pd.api.types.is_integer_dtype(col_type) for col_type in df.dtypes]
    mi_scores = pd.Series(mutual_info_regression(df, y, discrete_features=discrete, random_state=42), index=df.columns)
    return mi_scores.sort_values(ascending=False)

In [None]:
# test the mutual information scores on the rows that have no missing values
print(get_mi_scores(testing_df, testing_y))

- The features between glucose and BMI seem to have the most effect. This is seen in the EDA and in the mi scores for the data
- Use these 5 features for PCA and Kmeans

In [None]:
from sklearn.decomposition import PCA
from typing import Tuple
def _apply_pca(df: pd.DataFrame, features: List[str]) -> Tuple[np.ndarray, PCA, List[str]]:
    """Private function to standardize certain features of a df and run PCA on them

    Args:
        df: Current dataframe being used
        features: List of features for pca to be applied on

    Returns:
        A tuple of the principal components (numpy array), the fitted PCA model,
        and the column names 'pc_1', 'pc_2', ... (one per feature)
    """
    selected = df[features].copy()
    # standardize each column before the decomposition
    standardized = (selected - selected.mean(axis=0)) / selected.std(axis=0)

    pca = PCA(random_state=42)
    components = pca.fit_transform(standardized)

    column_names = [f'pc_{idx}' for idx in range(1, len(features) + 1)]
    return (components, pca, column_names)

In [None]:
def create_pc_pca(df: pd.DataFrame, feature_eng: bool = True) -> pd.DataFrame:
    """Creates principal components after PCA is applied

    Every principal component is kept; no filtering is applied.

    Args:
        df: The current dataframe being used (left unmodified)
        feature_eng: Currently unused; kept so existing callers that pass it keep working

    Returns:
        A new dataframe that contains the principal components 'pc_1' ... 'pc_5' as extra features
    """
    df = df.copy()
    features = ['BMI', 'Insulin', 'Glucose', 'Age', 'Pregnancies']
    pc, _, new_cols = _apply_pca(df, features)

    # share df's index so the join lines the components up with their source rows
    pc_df = pd.DataFrame(pc, columns=new_cols, index=df.index)
    return df.join(pc_df)
     

In [None]:
# get an imputed version of the training data to check findings of PCA loadings
imputed_train_df = get_median_imputed_data(df_train)
imputed_train_df.isna().sum()

In [None]:
def get_pca_loadings(df: pd.DataFrame, features: List[str]) -> pd.DataFrame:
    """Return the PCA loadings of the given features.

    Args:
        df: The current dataframe being used
        features: List of features for pca to be applied on

    Returns:
        A dataframe with one row per feature and one column per principal component
    """
    _, fitted_pca, pc_names = _apply_pca(df.copy(), features)
    # components_ holds one row per component; transpose so that the features become the rows
    return pd.DataFrame(fitted_pca.components_.T, index=features, columns=pc_names)

In [None]:
# the five features used for PCA
features = ['BMI', 'Insulin', 'Glucose', 'Age', 'Pregnancies']
# get the loadings for the principal components (computed on the median-imputed training data)
loadings = get_pca_loadings(imputed_train_df, features)
loadings

- PC 1 shows that there are people who have high insulin, glucose, age, and pregnancies. These can be added to make a new 'vulnerable' category
- PC 3 shows a potential grouping of low bmi and high insulin (ratio can be made)

In [None]:
# repeat the same process using only 4 features to see if there are any different results
features_2 = ['BMI', 'Insulin', 'Glucose', 'Age']
loadings_2 = get_pca_loadings(imputed_train_df, features_2)
loadings_2

- PC 2 in the above loadings show a contrast between age and bmi: low bmi and high age (may be expressed as a ratio)
- PC 4 in the above loadings and PC 3 in the below loadings show a contrast between glucose and insulin: low glucose and high insulin

In [None]:
# repeat one last time, this time without BMI
features_3 = ['Pregnancies', 'Insulin', 'Glucose', 'Age']
loadings_3 = get_pca_loadings(imputed_train_df, features_3)
loadings_3

In [None]:
def pca_loading_features(df: pd.DataFrame, ignore_features: List[str] = None) -> pd.DataFrame:
    """Create new features from the above analysis of PCA loadings

    Args:
        df: The current DataFrame being used (left unmodified)
        ignore_features: Names of the new features that should NOT be created. Any of
            'glucose_and_age', 'insulin_to_bmi', 'age_to_bmi', 'insulin_glucose_effect'.
            None or an empty list creates all of them.

    Returns:
        A DataFrame with the new pca features
    """
    # None instead of a shared mutable [] default
    skipped = set(ignore_features or ())
    df = df.copy()

    if 'glucose_and_age' not in skipped:
        df['glucose_and_age'] = df['Glucose'] * df['Age']

    if 'insulin_to_bmi' not in skipped:
        df['insulin_to_bmi'] = df['Insulin'] / df['BMI']

    if 'age_to_bmi' not in skipped:
        df['age_to_bmi'] = df['Age'] / df['BMI']

    if 'insulin_glucose_effect' not in skipped:
        df['insulin_glucose_effect'] = df['Insulin'] / df['Glucose']

    return df

In [None]:
def unskew_with_log(df: pd.DataFrame) -> pd.DataFrame:
    """Add logged versions of the skewed features discovered in EDA

    For each of DiabetesPedigreeFunction, Insulin, Age and Pregnancies a new
    'logged_<feature>' column is added; the original columns are kept.

    Args:
        df: The current dataframe

    Returns:
        A new df containing logged versions of certain features
    """
    result = df.copy()

    for name in ('DiabetesPedigreeFunction', 'Insulin', 'Age', 'Pregnancies'):
        column = result[name]
        # plain log when every value is strictly positive, shifted log (log1p) otherwise
        transform = np.log if column.min() > 0 else np.log1p
        result[f'logged_{name}'] = column.apply(transform)

    return result

In [None]:
def compare_median(row: pd.Series, medians: pd.Series) -> pd.Series:
    """Add the number of values in a row that lie above the matching median

    Args:
        row: A row in a dataframe (one person)
        medians: The medians for the current dataframe

    Returns:
        The same row with an extra 'greater_than_median' entry
    """
    # entries of the row that have no matching median compare as False and are not counted
    above_median = row.gt(medians)
    row['greater_than_median'] = above_median.sum()
    return row

In [None]:
def row_median_counts(df: pd.DataFrame) -> pd.DataFrame:
    """Count, for every row, how many of the initial features lie above their median

    Each of the initial features are higher in diabetics (seen in eda), so doing a median count per feature will reveal more at risk people

    Args:
        df: The current dataframe (left unmodified)

    Returns:
        A new df containing a 'greater_than_median' count feature
    """
    initial_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                    'BMI', 'DiabetesPedigreeFunction', 'Age']

    df = df.copy()
    medians = df[initial_cols].median()
    # compare all rows at once instead of calling a Python function per row;
    # missing values compare as False and are therefore not counted
    df['greater_than_median'] = df[initial_cols].gt(medians, axis=1).sum(axis=1)
    return df

In [None]:
def remove_outliers(df: pd.DataFrame, features: List[str] = ('SkinThickness', 'Glucose', 'Pregnancies')) -> pd.DataFrame:
    """Pull outliers of certain features back to the box plot whiskers to avoid over fitting

    No rows are dropped: values above Q3 + 1.5*IQR are set to that upper limit and values
    below Q1 - 1.5*IQR are set to that lower limit.

    Upon examination of the data, 3 features in particular appear to have outliers. SkinThickness and Pregnancies have
    clear outliers in the box and whisker plots. Glucose has outliers in the non diabetic grouping which should be
    moved back down. This comes from my knowledge that glucose can be affected by what the person has eaten
    recently - eating an ice cream recently will cause the person to have a glucose spike and cause outliers in the data.

    Args:
        df: The current dataframe (left unmodified)
        features: The features whose outliers are moved to the whisker limits

    Returns:
        A dataframe with the outliers clipped
    """
    df = df.copy()
    for feature in features:
        # calculate the interquartile range (IQR) for a certain feature; where the bulk of the data is
        quart_1 = df[feature].quantile(0.25)
        quart_3 = df[feature].quantile(0.75)
        iqr = quart_3 - quart_1

        # limits of the 'whiskers' in the box and whisker plots
        lower_lim = quart_1 - 1.5 * iqr
        upper_lim = quart_3 + 1.5 * iqr

        # clip returns a new column, so fractional limits also work for integer columns
        df[feature] = df[feature].clip(lower=lower_lim, upper=upper_lim)

    return df

In [None]:
def bin_data(df: pd.DataFrame) -> pd.DataFrame:
    """Put the data into bins to generalize observations

    Label encode certain features based on research:

    SkinThickness data from: http://apjcn.nhri.org.tw/server/courses/obesity/anthro.doc#:~:text=For%20adults%2C%20the%20standard%20normal,either%20borderline%2C%20or%20fat%20depleted
    Glucose Data: https://www.ncbi.nlm.nih.gov/books/NBK541081/
    Blood Pressure: https://www.webmd.com/hypertension-high-blood-pressure/guide/diastolic-and-systolic-blood-pressure-know-your-numbers#1
    BMI: https://www.nhlbi.nih.gov/health/educational/lose_wt/BMI/bmi_tbl.pdf

    Args:
        df: The current dataframe (left unmodified)

    Returns:
        A copy of the dataframe with the extra columns 'No Preg v Preg', 'Skin Fat',
        'Glucose Group', 'Age Group', 'BMI Group' and 'BloodPressure Group'. A row whose
        source value matches none of the bins (e.g. a missing value) gets NaN.
    """
    df = df.copy()

    def _label(conditions, labels):
        # the conditions of one feature never overlap; rows that match none stay NaN
        return np.select([cond.to_numpy() for cond in conditions], labels, default=np.nan)

    pregnancies = df['Pregnancies']
    df['No Preg v Preg'] = _label([pregnancies == 0, pregnancies > 0], [0.0, 1.0])

    skin = df['SkinThickness']
    df['Skin Fat'] = _label([skin <= 9, (skin > 9) & (skin <= 30), skin > 30], [0.0, 1.0, 2.0])

    glucose = df['Glucose']
    df['Glucose Group'] = _label([glucose < 140, glucose >= 140], [0.0, 1.0])

    # over 45 significantly higher risk of T2D, less than 30 is common for T1D diagnosis
    age = df['Age']
    df['Age Group'] = _label([age < 30, (age >= 30) & (age < 45), age >= 45], [0.0, 1.0, 2.0])

    bmi = df['BMI']
    df['BMI Group'] = _label(
        [bmi < 18.5, (bmi >= 18.5) & (bmi < 25), (bmi >= 25) & (bmi < 30), bmi >= 30],
        [-1.0, 0.0, 1.0, 2.0],
    )

    pressure = df['BloodPressure']
    df['BloodPressure Group'] = _label(
        [pressure < 80, (pressure >= 80) & (pressure < 90), (pressure >= 90) & (pressure <= 120), pressure > 120],
        [0.0, 1.0, 2.0, 3.0],
    )

    return df

In [None]:
# Check the type of the columns object; it is used as the type hint and default value of ndarray_to_df's `cols`
type(df_train.columns)

In [None]:
def ndarray_to_df(ndarray: np.ndarray, cols: pd.Index = df_train.columns) -> pd.DataFrame:
    """Wrap a numpy array in a dataframe so that later steps can address columns by name

    Args:
        ndarray: The array to wrap
        cols: Column names for the new dataframe. The default is the training columns
            as they are at the moment this function is defined.

    Returns:
        A new dataframe holding the array's values under the given column names
    """
    return pd.DataFrame(data=ndarray, columns=cols)

## 6. Machine learning
- Using a pipeline, find the combination of 'feature engineering functions' which will provide the best results for a basic model
- Use both an XGBClassifier and a RandomForestClassifier
- Create an ANN and evaluate its performance (scale data for ANN)

In [None]:
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler


# Candidate feature-engineering pipeline evaluated with an XGBoost classifier.
# The commented-out steps are alternatives that were tried and are currently disabled.
test_feature_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    # the imputer returns a numpy array; turn it back into a dataframe for the column-name based steps
    ('pandarizer', FunctionTransformer(ndarray_to_df)),
#     ('log_transform', FunctionTransformer(unskew_with_log)),
#     ('kmeans_cluster', FunctionTransformer(kmeans_cluster, kw_args={'n_clusters': 2})),
#     ('kmeans_dist', FunctionTransformer(kmeans_cluster_dist, kw_args={'n_clusters': 2})),
#     ('pc_pca', FunctionTransformer(create_pc_pca, kw_args={'feature_eng': False})),
    # from testing, this is a very good transformation function
    ('pc_loading_features', FunctionTransformer(pca_loading_features, kw_args={'ignore_features': []})),
#     ('log_transform', FunctionTransformer(unskew_with_log)),
#     ('above_median_counts', FunctionTransformer(row_median_counts)),
    ('remove_outliers', FunctionTransformer(remove_outliers, kw_args={'features':  ['SkinThickness', 'Insulin',
       'DiabetesPedigreeFunction', 'Age']})),
#     , kw_args={'features': ['Age', 'SkinThickness']}
    ('bin_data', FunctionTransformer(bin_data)),
#     ('log_transform', FunctionTransformer(unskew_with_log)),
#     ('view_data', Debug()),
#     ('scaler', StandardScaler()),
    ('model', xgb.XGBClassifier(verbosity=0, random_state=0))
])

test_feature_pipeline

**About Scoring Method of Choice**
- calculate the cross validation score using F1 as the main guide for performance (to take into account precision and recall/sensitivity)
- remember: precision is the ratio of correctly predicted diabetics to all people predicted to be diabetic: TP / (TP + FP)
- recall/sensitivity is the ratio of correctly predicted diabetics to all TRUE diabetics: TP / (TP + FN)
- we care less about specificity since we would rather flag someone as diabetic and have them checked by a doctor to determine that they are not diabetic. Therefore, the F1 score is a good measure to see how the model performs in the cases that we most care about; the cases that are diabetic
- change some of the functions in the feature pipeline to get a pipeline that creates the best features on the basic XGB Classifier. This will provide us with the best features to train our final model 


In [None]:
# Cross-validated metrics of the XGBoost feature pipeline on the training data
feature_scoring = score_pipeline_model(df_train, y_train, test_feature_pipeline)
print(feature_scoring)

In [None]:
# No test data is passed, so get_preds refits the pipeline on an internal split of the
# training data and predicts the held-out part
outcome, feature_preds = get_preds(df_train, y_train, test_feature_pipeline)
print(classification_report(outcome, feature_preds))

In [None]:
# Confusion matrix of the XGBoost pipeline predictions, drawn as an annotated heatmap
cm = confusion_matrix(outcome, feature_preds)
ax = sns.heatmap(cm, annot=True)

- Feature engineering pipeline has been decided - test its performance against a random forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Feature-engineering pipeline evaluated with a random forest classifier.
# The commented-out steps are alternatives that were tried and are currently disabled.
rf_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    # the imputer returns a numpy array; turn it back into a dataframe for the column-name based steps
    ('pandarizer', FunctionTransformer(ndarray_to_df)),
#     ('log_transform', FunctionTransformer(unskew_with_log)),
#     ('kmeans_cluster', FunctionTransformer(kmeans_cluster, kw_args={'n_clusters': 2})),
    ('kmeans_dist', FunctionTransformer(kmeans_cluster_dist, kw_args={'n_clusters': 2})),
#     ('pc_pca', FunctionTransformer(create_pc_pca, kw_args={'feature_eng': False})),
    # from testing, this is a very good transformation function
    ('pc_loading_features', FunctionTransformer(pca_loading_features, kw_args={'ignore_features': []})),
#     ('above_median_counts', FunctionTransformer(row_median_counts)),
     ('remove_outliers', FunctionTransformer(remove_outliers, kw_args={'features':  [ 'SkinThickness', 'Insulin',
       'DiabetesPedigreeFunction', 'Age']})),
    ('bin_data', FunctionTransformer(bin_data)),
    # pass-through step that exposes the engineered features for inspection
    ('view_data', Debug()),
    ('model', RandomForestClassifier(random_state=0))
])

rf_pipeline

In [None]:
# Cross-validated metrics of the random forest pipeline on the training data
rf_scoring = score_pipeline_model(df_train, y_train, rf_pipeline)
print(rf_scoring)

In [None]:
# NOTE: `outcome` and `feature_preds` are reused here and now hold the random forest results
outcome, feature_preds = get_preds(df_train, y_train, rf_pipeline)
print(classification_report(outcome, feature_preds))

In [None]:
# Confusion matrix of the random forest pipeline predictions, drawn as an annotated heatmap
cm = confusion_matrix(outcome, feature_preds)
ax = sns.heatmap(cm, annot=True)

In [None]:
from sklearn.svm import SVC

# Support-vector-machine version of the feature pipeline.
# NOTE: this rebinds `test_feature_pipeline`, a name the cells above already
# used before this definition; re-running those cells after this one will score
# this SVC pipeline instead.
test_feature_pipeline = Pipeline(steps=[
    # fill the NaNs (zeros were replaced with NaN during exploration) with column means
    ('imputer', SimpleImputer(strategy='mean')),
    ('pandarizer', FunctionTransformer(ndarray_to_df)),
#     ('log_transform', FunctionTransformer(unskew_with_log)),
    ('kmeans_cluster', FunctionTransformer(kmeans_cluster, kw_args={'n_clusters': 2})),
#     ('kmeans_dist', FunctionTransformer(kmeans_cluster_dist, kw_args={'n_clusters': 2})),
#     ('pc_pca', FunctionTransformer(create_pc_pca, kw_args={'feature_eng': False})),
    # from testing, this is a very good transformation function
    ('pc_loading_features', FunctionTransformer(pca_loading_features, kw_args={'ignore_features': []})),
#     ('log_transform', FunctionTransformer(unskew_with_log)),
#     ('above_median_counts', FunctionTransformer(row_median_counts)),
#     ('remove_outliers', FunctionTransformer(remove_outliers, kw_args={'features':  ['SkinThickness', 'Insulin','Age']})),
#     , kw_args={'features': ['Age', 'SkinThickness']}
    ('bin_data', FunctionTransformer(bin_data)),
    # the log transform is applied after binning here, unlike the disabled positions above
    ('log_transform', FunctionTransformer(unskew_with_log)),
#     ('view_data', Debug()),
    # standardise the features right before the SVC
    ('scaler', StandardScaler()),
    ('model', SVC(random_state=0, kernel='linear'))
])

# display the pipeline diagram as the cell output
test_feature_pipeline

In [None]:
# Score the SVC pipeline with the same helper used for the other models.
feature_scoring = score_pipeline_model(df_train, y_train, test_feature_pipeline)
print(feature_scoring)

In [None]:
# Reference labels and SVC predictions for the training data; overwrites the
# `outcome` / `feature_preds` produced for the previous model.
outcome, feature_preds = get_preds(df_train, y_train, test_feature_pipeline)
print(classification_report(outcome, feature_preds))

In [None]:
# Confusion matrix of reference labels (rows) against SVC predictions (columns).
cm = confusion_matrix(outcome, feature_preds)
# fmt='d' prints the raw integer counts; seaborn's default '.2g' annotation
# format would render counts of 100 or more in scientific notation (1.2e+02).
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

### Feature Engineering pipeline results
- Each model has similar results on the dataset
- Take each of these models and apply hyperparameter tuning to make the final predictions
- **Note:** The data was also trained on a **decision tree classifier** and a **logistic regression** model, but the training results were not as good, so these models have been omitted from the notebook

## 7. Create a model using deep learning
- See if a neural network can have better results on the data

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

# Hold out 20% of the training data as the network's test split, then split the
# remaining 80% again 70/30 into train / validation. Both splits are stratified
# on the label and use a fixed random_state, so they are repeatable.
X_train_neural, X_test_neural, y_train_neural, y_test_neural = train_test_split(df_train, y_train, test_size=0.2, stratify=y_train, random_state=42)
X_train_nt, X_valid_nt, y_train_nt, y_valid_nt = train_test_split(X_train_neural, y_train_neural, test_size=0.3, stratify=y_train_neural, random_state=42)

# Preprocessing for the neural network: every feature-engineering step is
# enabled and the result is standardised. There is no model step, so the
# pipeline only transforms features.
data_cleaning_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('pandarizer', FunctionTransformer(ndarray_to_df)),
    ('log_transform', FunctionTransformer(unskew_with_log)),
    ('kmeans_cluster', FunctionTransformer(kmeans_cluster, kw_args={'n_clusters': 2})),
    ('kmeans_dist', FunctionTransformer(kmeans_cluster_dist, kw_args={'n_clusters': 2})),
    ('pc_pca', FunctionTransformer(create_pc_pca, kw_args={'feature_eng': False})),
    # from testing, this is a very good transformation function
    ('pc_loading_features', FunctionTransformer(pca_loading_features, kw_args={'ignore_features': []})),
    ('above_median_counts', FunctionTransformer(row_median_counts)),
    # NOTE(review): if remove_outliers drops rows rather than clipping values, the
    # transformed features will no longer line up with their label vectors — confirm
     ('remove_outliers', FunctionTransformer(remove_outliers, kw_args={'features':  [ 'SkinThickness', 'Insulin',
       'DiabetesPedigreeFunction', 'Age']})),
    ('bin_data', FunctionTransformer(bin_data)),
    ('scaler', StandardScaler())
])

# show the raw (not yet transformed) training features
X_train_nt

In [None]:
# Fit the cleaning pipeline on the training split only, then apply the fitted
# pipeline to the validation split.
# NOTE: both names are rebound to their transformed versions, so this cell is
# not idempotent — re-running it without re-running the split cell above would
# push already-transformed data through the pipeline again.
X_train_nt = data_cleaning_pipeline.fit_transform(X_train_nt)
X_valid_nt = data_cleaning_pipeline.transform(X_valid_nt)
X_valid_nt

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Seed TensorFlow's global RNG so weight initialisation, dropout masks and batch
# shuffling are repeatable across "Restart & Run All" (hardware-level
# non-determinism may still cause small differences).
tf.random.set_seed(42)

# Small fully connected binary classifier: two hidden layers of 8 units with
# ReLU and dropout, and a single sigmoid output unit.
model = keras.Sequential([
    # normalize input data again in the input layer
    layers.BatchNormalization(input_shape=[X_train_nt.shape[1]]),
    
    # hidden dense layer 1
    layers.Dense(8),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.25),
    
    # hidden dense layer 2 (batch normalization is disabled for this layer)
    layers.Dense(8),
#     layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.25),
    
    # output layer: sigmoid yields one probability per sample
    layers.Dense(1, activation='sigmoid')
])

# compile the model using adam and binary_crossentropy; accuracy is measured
# with a 0.4 decision threshold, the same cutoff used later to binarise predictions
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.BinaryAccuracy(name="binary_accuracy", dtype=None, threshold=0.4)])

# set early stopping to prevent overfitting: stop once the monitored quantity
# (Keras default: val_loss) has not improved by at least 0.001 for 10 epochs,
# and roll back to the best weights seen
early_stopping = EarlyStopping(patience=10, min_delta=0.001, restore_best_weights=True)

# fit the model and check its performance; epochs=1000 is only an upper bound,
# early stopping decides when training actually ends
history = model.fit(
    X_train_nt, y_train_nt,
    validation_data=(X_valid_nt, y_valid_nt),
    batch_size=140,
    epochs=1000,
    callbacks=[early_stopping],
    verbose=0, 
)

In [None]:
# Training curves: one row per epoch, one column per tracked metric.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot()
history_df[['binary_accuracy', 'val_binary_accuracy']].plot()

# Lowest validation loss and highest validation accuracy seen during training
# (not necessarily from the same epoch), rounded to 4 decimal places.
best_val_loss = round(history_df['val_loss'].min(), 4)
best_val_accuracy = round(history_df['val_binary_accuracy'].max(), 4)
print(f"Best Validation Loss: {best_val_loss}\nBest Validation Accuracy: {best_val_accuracy}")

In [None]:
# Transform the held-out split with the already-fitted cleaning pipeline. The
# result is bound to a new name so X_test_neural keeps the raw features and this
# cell can be re-run without feeding transformed data back through the pipeline.
X_test_neural_clean = data_cleaning_pipeline.transform(X_test_neural)
# one sigmoid probability per row
nn_preds = model.predict(X_test_neural_clean)
nn_preds

In [None]:
# Turn the network's sigmoid probabilities into class labels. 0.4 is the same
# decision threshold given to the BinaryAccuracy metric when the model was
# compiled: probabilities >= 0.4 are labelled diabetic (1), everything else 0.
binary_nn_preds = (np.asarray(nn_preds).ravel() >= 0.4).astype(int).tolist()

print(classification_report(y_test_neural, binary_nn_preds))

In [None]:
# Confusion matrix of true labels (rows) against thresholded network predictions (columns).
cm = confusion_matrix(y_test_neural, binary_nn_preds)
# fmt='d' prints the raw integer counts; seaborn's default '.2g' annotation
# format would render counts of 100 or more in scientific notation (1.2e+02).
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

## 8. Hyperparameter tuning and Final models
- Use the best features to create a final pipeline
- Split the data into train/test data 
- Tune the hyperparameters on the train data
- Create a final model using the training data and best hyperparameters
- Use the validation data to test the final model; use classification report to see how the model did

In [None]:
def clean_data_for_nn(df, pipeline):
    """Run ``df`` through an already-fitted preprocessing pipeline.

    Parameters
    ----------
    df : data to transform; passed to ``pipeline.transform`` unchanged.
    pipeline : object exposing ``transform``. It is not (re)fitted here, so it
        must have been fitted beforehand.

    Returns
    -------
    Whatever ``pipeline.transform(df)`` returns.
    """
    return pipeline.transform(df)
    

In [None]:
# Evaluate the network on df_test / y_test. The cleaning pipeline was fitted on
# the network's own training split only, so df_test is just transformed here.
df_test_nn = clean_data_for_nn(df_test, data_cleaning_pipeline)
final_nn_preds = model.predict(df_test_nn)

# Turn the sigmoid probabilities into class labels with the same 0.4 threshold
# used by the BinaryAccuracy metric at compile time: >= 0.4 means diabetic (1).
final_binary_nn_preds = (np.asarray(final_nn_preds).ravel() >= 0.4).astype(int).tolist()

print(classification_report(y_test, final_binary_nn_preds))
cm = confusion_matrix(y_test, final_binary_nn_preds)
# fmt='d' prints the raw integer counts; seaborn's default '.2g' annotation
# format would render counts of 100 or more in scientific notation (1.2e+02).
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# use optuna for hyperparameter tuning
import optuna

def objective(trial):
    """Optuna objective for the XGBoost pipeline.

    Draws one set of XGBClassifier hyperparameters from ``trial``, places the
    classifier at the end of the chosen feature-engineering pipeline and returns
    the value of ``score_pipeline_model(..., tuning=True)`` on the training data.
    """
    search_space = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 3000),
        'min_child_weight': trial.suggest_int("min_child_weight", 1, 10),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),
        'subsample': trial.suggest_float("subsample", 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.5, 2.5),
    }

    outlier_features = ['SkinThickness', 'Insulin', 'DiabetesPedigreeFunction', 'Age']
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('pandarizer', FunctionTransformer(ndarray_to_df)),
        # from testing, this is a very good transformation function
        ('pc_loading_features', FunctionTransformer(pca_loading_features, kw_args={'ignore_features': []})),
        ('remove_outliers', FunctionTransformer(remove_outliers, kw_args={'features': outlier_features})),
        ('bin_data', FunctionTransformer(bin_data)),
        ('model', xgb.XGBClassifier(**search_space)),
    ]
    candidate_pipeline = Pipeline(steps=steps)

    return score_pipeline_model(df_train, y_train, candidate_pipeline, tuning=True)

# Maximise the objective's score over 45 trials. TPE is Optuna's default sampler;
# it is created explicitly only to seed it, so the search proposes the same
# trials on every "Restart & Run All".
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=45)
# best hyperparameters found, reused for the final XGBoost model
xgb_best_params = study.best_params

In [None]:
# show the hyperparameters selected by the XGBoost study
print(xgb_best_params)

In [None]:
# use optuna for rf hyperparameter tuning
def rf_objective(trial):
    """Optuna objective for the random-forest pipeline.

    Draws one set of RandomForestClassifier hyperparameters from ``trial``,
    places the classifier at the end of the random-forest feature pipeline and
    returns the value of ``score_pipeline_model(..., tuning=True)`` on the
    training data.
    """
    params = {
        # `step` is passed by keyword: it is keyword-only in current Optuna
        # releases and passing it positionally is deprecated
        'max_depth': trial.suggest_int('max_depth', 10, 100, step=10),
        'n_estimators': trial.suggest_int('n_estimators', 200, 3000, step=100),
        # after feature engineering, there are around 30 features
        'max_features': trial.suggest_int("max_features", 3, 10),
        'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 15),
        'min_samples_split': trial.suggest_int("min_samples_split", 2, 20),
    }

    candidate_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('pandarizer', FunctionTransformer(ndarray_to_df)),
        ('kmeans_dist', FunctionTransformer(kmeans_cluster_dist, kw_args={'n_clusters': 2})),
        ('pc_loading_features', FunctionTransformer(pca_loading_features, kw_args={'ignore_features': []})),
        ('remove_outliers', FunctionTransformer(remove_outliers, kw_args={'features':  [ 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction', 'Age']})),
        ('bin_data', FunctionTransformer(bin_data)),
        ('view_data', Debug()),
        # fixed seed (same value as the untuned rf_pipeline) so that differences
        # between trials come from the hyperparameters, not from forest randomness
        ('model', RandomForestClassifier(random_state=0, **params))
    ])

    return score_pipeline_model(df_train, y_train, candidate_pipeline, tuning=True)

# Maximise the random-forest objective over 30 trials. TPE is Optuna's default
# sampler; it is created explicitly only to seed it, so the search proposes the
# same trials on every "Restart & Run All".
study_2 = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study_2.optimize(rf_objective, n_trials=30)
# best hyperparameters found, reused for the final random-forest model
rf_best_params = study_2.best_params

**Final Model**
- Create a final model using the best hyperparameters and evaluate it on the test set

In [None]:
# Final XGBoost model: the same preprocessing steps that were tuned in the
# XGBoost Optuna objective (plus the Debug inspection step), with the
# classifier configured from the best hyperparameters of that study.
final_xgb_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('pandarizer', FunctionTransformer(ndarray_to_df)),
    ('pc_loading_features', FunctionTransformer(pca_loading_features, kw_args={'ignore_features': []})),
    ('remove_outliers', FunctionTransformer(remove_outliers, kw_args={'features':  ['SkinThickness', 'Insulin',
       'DiabetesPedigreeFunction', 'Age']})),
    ('bin_data', FunctionTransformer(bin_data)),
    # NOTE(review): Debug looks like an inspection-only step — confirm it passes data through unchanged
    ('view_data', Debug()),
    ('model', xgb.XGBClassifier(**xgb_best_params))
])

# fit on the full training data, then predict the held-back df_test
final_xgb_pipeline.fit(df_train, y_train)
# uncomment to inspect the data captured by the Debug step:
# print(final_xgb_pipeline['view_data'].X_)
final_predictions = final_xgb_pipeline.predict(df_test)

print(classification_report(y_test, final_predictions))

In [None]:
# Confusion matrix of true test labels (rows) against final XGBoost predictions (columns).
cm = confusion_matrix(y_test, final_predictions)
# fmt='d' prints the raw integer counts; seaborn's default '.2g' annotation
# format would render counts of 100 or more in scientific notation (1.2e+02).
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# Final random-forest model, configured from the best hyperparameters found by
# the random-forest study.
# NOTE(review): rf_best_params were tuned in rf_objective on a pipeline that used
# a median imputer and also ran the kmeans_dist and remove_outliers steps. This
# final pipeline uses a mean imputer and leaves those two steps disabled, so the
# hyperparameters were selected for a different feature set — confirm that this
# mismatch is intentional.
final_rf_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('pandarizer', FunctionTransformer(ndarray_to_df)),
#     ('kmeans_dist', FunctionTransformer(kmeans_cluster_dist, kw_args={'n_clusters': 2})),
    ('pc_loading_features', FunctionTransformer(pca_loading_features, kw_args={'ignore_features': []})),
#     ('remove_outliers', FunctionTransformer(remove_outliers, kw_args={'features':  [ 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction', 'Age']})),
    ('bin_data', FunctionTransformer(bin_data)),
    ('view_data', Debug()),
    # fixed seed so the reported test metrics are repeatable between runs
    ('model', RandomForestClassifier(random_state=0, **rf_best_params))
])

# fit on the full training data, then predict the held-back df_test
final_rf_pipeline.fit(df_train, y_train)
final_rf_preds = final_rf_pipeline.predict(df_test)

print(classification_report(y_test, final_rf_preds))

In [None]:
# Confusion matrix of true test labels (rows) against final random-forest predictions (columns).
cm = confusion_matrix(y_test, final_rf_preds)
# fmt='d' prints the raw integer counts; seaborn's default '.2g' annotation
# format would render counts of 100 or more in scientific notation (1.2e+02).
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# use optuna for hyperparameter tuning
import optuna

def objective(trial):
    """Optuna objective for the SVM pipeline.

    Draws ``C`` and ``gamma`` for an SVC from ``trial``, places the classifier at
    the end of the SVM feature pipeline and returns the value of
    ``score_pipeline_model(..., tuning=True)`` on the training data.

    Note: this definition replaces the XGBoost ``objective`` defined earlier in
    the notebook.
    """
    svc_params = {
        'C': trial.suggest_int('C', 1, 3),
        'gamma': trial.suggest_float('gamma', 0.01, 0.1),
    }

    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('pandarizer', FunctionTransformer(ndarray_to_df)),
        ('kmeans_cluster', FunctionTransformer(kmeans_cluster, kw_args={'n_clusters': 2})),
        ('pc_loading_features', FunctionTransformer(pca_loading_features, kw_args={'ignore_features': []})),
        ('bin_data', FunctionTransformer(bin_data)),
        ('log_transform', FunctionTransformer(unskew_with_log)),
        ('scaler', StandardScaler()),
        ('model', SVC(**svc_params)),
    ]
    candidate_pipeline = Pipeline(steps=steps)

    return score_pipeline_model(df_train, y_train, candidate_pipeline, tuning=True)

# Maximise the SVM objective over 35 trials. TPE is Optuna's default sampler; it
# is created explicitly only to seed it, so the search proposes the same trials
# on every "Restart & Run All".
# NOTE: this rebinds `study`, which previously held the XGBoost study.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=35)
# best hyperparameters found, reused for the final SVM model
svm_best_params = study.best_params

In [None]:
# Final SVM model: the same preprocessing steps that were tuned in the SVM
# Optuna objective, with the SVC configured from the best hyperparameters of
# that study.
final_svm_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('pandarizer', FunctionTransformer(ndarray_to_df)),
        ('kmeans_cluster', FunctionTransformer(kmeans_cluster, kw_args={'n_clusters': 2})),
        ('pc_loading_features', FunctionTransformer(pca_loading_features, kw_args={'ignore_features': []})),
        ('bin_data', FunctionTransformer(bin_data)),
        ('log_transform', FunctionTransformer(unskew_with_log)),
        # standardise the features right before the SVC
        ('scaler', StandardScaler()),
        ('model', SVC(**svm_best_params))
])

# fit on the full training data, then predict the held-back df_test
final_svm_pipeline.fit(df_train, y_train)
final_svm_predictions = final_svm_pipeline.predict(df_test)

print(classification_report(y_test, final_svm_predictions))

In [None]:
# Confusion matrix of true test labels (rows) against final SVM predictions (columns).
cm = confusion_matrix(y_test, final_svm_predictions)
# fmt='d' prints the raw integer counts; seaborn's default '.2g' annotation
# format would render counts of 100 or more in scientific notation (1.2e+02).
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

## 9. Final Thoughts
- Other than the neural network, each of the tuned final models performed relatively similarly on the final test set
- Hope you learned from the feature engineering techniques!

## 