In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

In [2]:
# Load the cleaned dataset produced by the earlier data-wrangling step.
df = pd.read_csv(filepath_or_buffer='df_cleaned.csv')

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Create Dummy Variables

We can see from the .head method that the data has 7 categorical variables: gender, hypertension, heart_disease, ever_married, work_type, Residence_type and smoking_status. hypertension and heart_disease are already in the form of dummy variables, but the other 5 categorical variables are not. In this step, I will convert these 5 variables into 16 dummy variables, concatenate the dummy variables into the dataframe, and then drop the original 5 variables. 

In [4]:
# One-hot encode each remaining categorical column into its own frame of
# indicator columns (one column per category level).
# dtype=int is pinned explicitly: since pandas 2.0 get_dummies returns
# booleans by default, which would turn the 0/1 indicators shown in this
# notebook into True/False depending on the installed pandas version.
gender = pd.get_dummies(df['gender'], dtype=int)
ever_married = pd.get_dummies(df['ever_married'], dtype=int)
work_type = pd.get_dummies(df['work_type'], dtype=int)
Residence_type = pd.get_dummies(df['Residence_type'], dtype=int)
smoking_status = pd.get_dummies(df['smoking_status'], dtype=int)

In [5]:
# Append every block of indicator columns to the right of the existing frame.
dummy_frames = [gender, ever_married, work_type, Residence_type, smoking_status]
df = pd.concat([df, *dummy_frames], axis=1)

In [6]:
# The original string-valued columns are now redundant with their indicator
# columns, so remove them.
df = df.drop(
    columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
)

In [7]:
# Preview the frame after one-hot encoding.
df.head(n=5)

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,Female,Male,Other,...,Never_worked,Private,Self-employed,children,Rural,Urban,Unknown,formerly smoked,never smoked,smokes
0,9046,67.0,0,1,228.69,36.6,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
1,51676,61.0,0,0,202.21,,1,1,0,0,...,0,0,1,0,1,0,0,0,1,0
2,31112,80.0,0,1,105.92,32.5,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
3,60182,49.0,0,0,171.23,34.4,1,1,0,0,...,0,1,0,0,0,1,0,0,0,1
4,1665,79.0,1,0,174.12,24.0,1,1,0,0,...,0,0,1,0,1,0,0,0,1,0


The output of `.head()` shows an 'Unknown' column (a level of smoking_status). It carries no real information, so we drop it along with the similarly uninformative 'Other' column (a level of gender).

In [8]:
# List every column label so the indicator columns to drop can be identified.
df.columns

Index(['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level',
       'bmi', 'stroke', 'Female', 'Male', 'Other', 'No', 'Yes', 'Govt_job',
       'Never_worked', 'Private', 'Self-employed', 'children', 'Rural',
       'Urban', 'Unknown', 'formerly smoked', 'never smoked', 'smokes'],
      dtype='object')

In [9]:
# Remove the 'Other' and 'Unknown' indicator columns.
df = df.drop(columns=['Other', 'Unknown'])

In [10]:
# Preview the frame after dropping the 'Other' and 'Unknown' columns.
df.head(n=5)

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,Female,Male,No,...,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban,formerly smoked,never smoked,smokes
0,9046,67.0,0,1,228.69,36.6,1,0,1,0,...,0,0,1,0,0,0,1,1,0,0
1,51676,61.0,0,0,202.21,,1,1,0,0,...,0,0,0,1,0,1,0,0,1,0
2,31112,80.0,0,1,105.92,32.5,1,0,1,0,...,0,0,1,0,0,1,0,0,1,0
3,60182,49.0,0,0,171.23,34.4,1,1,0,0,...,0,0,1,0,0,0,1,0,0,1
4,1665,79.0,1,0,174.12,24.0,1,1,0,0,...,0,0,0,1,0,1,0,0,1,0


# Split Into Train/Test

In [11]:
# Hold out 30% of the rows as a test set; 'stroke' is the target and every
# other column is used as a feature.
# random_state is fixed so that Restart & Run All reproduces the same split
# (and therefore the same downstream results) on every run.
# NOTE(review): 'id' is still among the feature columns; it looks like a row
# identifier rather than a predictor -- confirm whether it should be dropped.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='stroke'),
    df['stroke'],
    test_size=0.3,
    random_state=42,
)

# Scale Features to the [0, 1] Range With MinMaxScaler

In [12]:
# Learn each feature's min/max from the training rows only, then apply that
# same rescaling to both splits so no test-set information leaks into the fit.
scaler = MinMaxScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)