# Capstone - Diabetes Prediction - Preprocessing and Training

## Setup

In [1]:
# import packages
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

## Import Data

In [2]:
# Path to the dataset produced by the earlier data-wrangling step
dw_file = '../data/df_dw.csv'

# Load the wrangled data (not the raw source file) into a DataFrame
df_dw = pd.read_csv(dw_file)

In [3]:
# Review the column names together with their dtypes; any non-numeric
# (object) column will need encoding before modelling
df_dw.dtypes

Pregnancies                   int64
Glucose                     float64
BloodPressure               float64
SkinThickness               float64
Insulin                     float64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
BMI Category                 object
dtype: object

## One-Hot-Encode Categorical Variables

In [4]:
# One-hot-encode the categorical features. No `columns` list is passed, so
# get_dummies converts every object/category column it finds and leaves the
# numeric columns untouched. drop_first=True omits the indicator for the
# first level of each encoded column, since it is implied by the others.
df_dw_ohe = pd.get_dummies(data=df_dw, drop_first=True)

In [5]:
# Summarise the encoded frame: column list, non-null counts and dtypes.
# A non-null count below the number of entries means that column still
# contains missing values at this stage.
df_dw_ohe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 733 entries, 0 to 732
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Pregnancies                   733 non-null    int64  
 1   Glucose                       728 non-null    float64
 2   BloodPressure                 731 non-null    float64
 3   SkinThickness                 541 non-null    float64
 4   Insulin                       394 non-null    float64
 5   BMI                           731 non-null    float64
 6   DiabetesPedigreeFunction      733 non-null    float64
 7   Age                           733 non-null    int64  
 8   Outcome                       733 non-null    int64  
 9   BMI Category_Class II Obese   733 non-null    uint8  
 10  BMI Category_Class III Obese  733 non-null    uint8  
 11  BMI Category_Normal Weight    733 non-null    uint8  
 12  BMI Category_Overweight       733 non-null    uint8  
 13  BMI Category_Underweight      733 non-null    uint8

## Define X and y

In [6]:
# Separate the target from the predictors: y is the Outcome column and X is
# every remaining column. Models for different feature subsets can then be
# built by selecting columns from X.
y = df_dw_ohe['Outcome']
X = df_dw_ohe.drop(columns='Outcome')

# Display the predictor names as a check that Outcome is no longer among them
X.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'BMI Category_Class II Obese',
       'BMI Category_Class III Obese', 'BMI Category_Normal Weight',
       'BMI Category_Overweight', 'BMI Category_Underweight'],
      dtype='object')

## Split into Train and Test Set

In [7]:
# Hold out 20% of the rows as a test set. random_state fixes the shuffle so
# the split is reproducible. stratify=y keeps the proportion of each Outcome
# class the same in the train and test sets, so evaluation on the hold-out
# is not skewed by an unlucky class mix in a random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=12, stratify=y
)

## Save Data

In [8]:
# Persist the four pieces of the train/test split as CSV files in the data
# directory.

datapath = '../data'
datapath_X_train = os.path.join(datapath, 'X_train.csv')
datapath_X_test = os.path.join(datapath, 'X_test.csv')
datapath_y_train = os.path.join(datapath, 'y_train.csv')
datapath_y_test = os.path.join(datapath, 'y_test.csv')

# Write each piece to its destination; index=False keeps the row index out
# of the files.
for split_data, destination in (
    (X_train, datapath_X_train),
    (X_test, datapath_X_test),
    (y_train, datapath_y_train),
    (y_test, datapath_y_test),
):
    split_data.to_csv(destination, index=False)