# script scratch

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from scipy import stats

In [2]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv') # grab dataset

In [None]:
def prep_data(df): # clean dataframe, impute nulls in BMI (exploration-ready)
    """ 
        Takes the original Kaggle dataset,
        Drops a row for an outlier in gender and reset index, 
        Drop the id column because the index serves the same,
        Converts ordinal columns to objects for one-hot encoding,
        Creates age_range feature for 5-year increments,
        Imputes BMI nulls using average BMI for observation's age range and gender, and
        Returns the prepared dataframe. This does not do model prep work.
    """
    
    # drop the outlier in gender ("other"), reset index
    df = df.drop(3116).reset_index().drop(columns='index')
    
    # drop id column (index is just as valuable)
    df = df.drop(columns='id')
    
    # fix the annoying capitalization
    df = df.rename(columns={'Residence_type':'residence_type'})
    
    # convert ordinal columns to objects
    df['hypertension'] = df.hypertension.astype('object')
    df['heart_disease'] = df.heart_disease.astype('object')

    # make age groups list
    five_year_cutpoints = [0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]
    
    # create new column for age range
    df['age_range'] = pd.cut(x=df.age, bins=five_year_cutpoints).astype('string')
    
    # fix values in age range for something more readable (replaces ", ", "-" e.g. '0-5')
    df['age_range'] = df['age_range'].str[1:-1].str.replace(', ', '-').astype('object')

    # calculate mean BMI for each age range and gender combination
    grouped = df.groupby(['age_range', 'gender']).bmi.mean().round(1)
    
    # assign BMI to nulls using the average BMI for the observation's age range and gender
    # based on index for rows with null values for bmi
    df.loc[df.bmi.isna(), 'bmi'] = df[df.bmi.isna()].apply(lambda x: grouped[x.age_range][x.gender], axis=1)

    return df # return exploration-ready dataframe

In [3]:
df = prep_data(df) # prep

In [None]:
def model_prep(df): # encode, split, isolate target, scale, SMOTE cleaned kaggle dataset (model-ready)
    """ 
        Takes the dataframe already put through prep_data as input,
        One-hot encodes categorical and ordinal columns,
        Splits data into 60-20-20 train-validate-test splits,
        Isolates the target column from each split,
        Scales each split's features,
        Uses SMOTE to address class imbalance for train, and
        Return all prepared dataframes. Requires prior prep function.
    """
    # set list of columns to one-hot encode
    col_list = ['gender','ever_married','work_type','residence_type','smoking_status', 'age_range']
    
    # apply one-hot encoding using above list
    df = pd.get_dummies(df, columns=col_list, drop_first=True)
    
    # split
    trainvalidate, test = train_test_split(df, test_size=.2, random_state=777)
    train, validate = train_test_split(trainvalidate, test_size=.25, random_state=777)
    
    # isolate target for each split
    X_train, y_train = train.drop(columns='stroke'), train.stroke
    X_validate, y_validate = validate.drop(columns='stroke'), validate.stroke
    X_test, y_test = test.drop(columns='stroke'), test.stroke
    
    # apply scaling using the Min_Max_Scaler function from model.py
    scaler,\
    X_train_scaled,\
    X_validate_scaled,\
    X_test_scaled = model.Min_Max_Scaler(X_train, X_validate, X_test)
    
    # use SMOTE+Tomek to address class imbalances between stroke and not-stroke (function in model.py)
    X_train_smtom, y_train_smtom = model.smoter(X_train_scaled, y_train)

    # return dataframes required for modeling
    return X_train_smtom, y_train_smtom, X_validate_scaled, y_validate, X_test_scaled, y_test

In [4]:
X_train_smtom, y_train_smtom, X_validate_scaled, y_validate, X_test_scaled, y_test = model_prep(df)

Before SMOTE applied: (3064, 31) (3064,)
After SMOTE applied: (5828, 31) (5828,)


In [5]:
X_train_smtom.shape, y_train_smtom.shape, X_validate_scaled.shape, y_validate.shape, X_test_scaled.shape, y_test.shape

((5828, 31), (5828,), (1022, 31), (1022,), (1022, 31), (1022,))

In [6]:
# Fit the model
nb = GaussianNB(var_smoothing=0.00001).fit(X_train_smtom, y_train_smtom)

# Use the model
# We'll evaluate the model's performance on test
y_predictions = nb.predict(X_test_scaled)
y_pred_proba = nb.predict_proba(X_test_scaled)

# Produce the classification report on the actual y values and this model's predicted y values
report = classification_report(y_test, y_predictions, output_dict=True)
print("NaiveBayes var_smoothing = 0.00001")
pd.DataFrame(report)

NaiveBayes var_smoothing = 0.00001


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.994071,0.098837,0.542074,0.546454,0.946769
recall,0.519628,0.944444,0.542074,0.732036,0.542074
f1-score,0.682497,0.178947,0.542074,0.430722,0.65589
support,968.0,54.0,0.542074,1022.0,1022.0


In [58]:
y_test.head(10)

4650    0
5020    0
101     1
414     0
3875    0
36      1
838     0
1492    0
935     0
954     0
Name: stroke, dtype: int64

In [61]:
y_actuals = y_test.reset_index().head(10)

In [62]:
# df = acquire.get_telco_data()
# merged = X_test.merge((df.customer_id), left_index=True, right_index=True, how='left')
# customer_id = merged.customer_id
probability = pd.DataFrame(y_pred_proba, columns= ['no_stroke_%', 'stroke_%'])
prediction = pd.DataFrame(y_predictions, columns= ['stroke_pred'])
final_pred = pd.concat([probability, prediction, y_actuals], axis=1)

# customer = pd.DataFrame(customer_id)
# customer.reset_index(drop=True, inplace=True)
# final = pd.concat([final_pred, customer], axis=1)
# final["churn_pred"] = final.churn_pred.replace(to_replace = [1, 0],value = ["Yes","No"])

In [63]:
final_pred.head()

Unnamed: 0,no_stroke_%,stroke_%,stroke_pred,index,stroke
0,1.17227e-15,1.0,1,4650.0,0.0
1,1.0,7.821194e-56,0,5020.0,0.0
2,7.025892e-18,1.0,1,101.0,1.0
3,1.0,1.7665899999999999e-134,0,414.0,0.0
4,1.0,0.0,0,3875.0,0.0


In [64]:
final_pred["Accurate"] = (final_pred.stroke == final_pred.stroke_pred)

In [72]:
final_pred

Unnamed: 0,no_stroke_%,stroke_%,stroke_pred,index,stroke,Accurate
0,1.172270e-15,1.000000e+00,1,4650.0,0.0,False
1,1.000000e+00,7.821194e-56,0,5020.0,0.0,True
2,7.025892e-18,1.000000e+00,1,101.0,1.0,True
3,1.000000e+00,1.766590e-134,0,414.0,0.0,True
4,1.000000e+00,0.000000e+00,0,3875.0,0.0,True
...,...,...,...,...,...,...
1017,2.338322e-03,9.976617e-01,1,,,False
1018,1.000000e+00,0.000000e+00,0,,,False
1019,2.906160e-17,1.000000e+00,1,,,False
1020,1.000000e+00,0.000000e+00,0,,,False


In [11]:
X_train_smtom.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Male,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,...,age_range_40-45,age_range_45-50,age_range_5-10,age_range_50-55,age_range_55-60,age_range_60-65,age_range_65-70,age_range_70-75,age_range_75-80,age_range_80-85
0,0.621582,0.0,0.0,0.415936,0.247205,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.438477,0.0,0.0,0.20589,0.134161,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.645996,0.0,0.0,0.137245,0.163975,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.707031,0.0,0.0,0.173114,0.254658,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.072266,0.0,0.0,0.241344,0.077019,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_range
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,65-70
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,30.6,never smoked,1,60-65
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,75-80
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,45-50
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,75-80


In [34]:
user_input = {"age": 25, 
              "hypertension": 1, 
              "heart_disease": 1, 
              "avg_glucose_level": 100, 
              "bmi": 26, 
              "gender": "Male",
              "work_type": "Private",
              "residence_type": "Urban",
              "ever_married" : "No",
              "smoking_status" : "never smoked"
             }

In [35]:
user_input
# user_pred_proba = nb.predict_proba(user_input)

{'age': 25,
 'hypertension': 1,
 'heart_disease': 1,
 'avg_glucose_level': 100,
 'bmi': 26,
 'gender': 'Male',
 'work_type': 'Private',
 'residence_type': 'Urban',
 'ever_married': 'No',
 'smoking_status': 'never smoked'}

In [36]:
user_input = pd.DataFrame(user_input, index=[0])

In [50]:
user_input

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender,work_type,residence_type,ever_married,smoking_status,age_range
0,25,1,1,100,26,Male,Private,Urban,No,never smoked,20-25


In [38]:
 
def prep_data1(df):
    # convert ordinal columns to objects
    df['hypertension'] = df.hypertension.astype('object')
    df['heart_disease'] = df.heart_disease.astype('object')

    # make age groups list
    five_year_cutpoints = [0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]
    
    # create new column for age range
    df['age_range'] = pd.cut(x=df.age, bins=five_year_cutpoints).astype('string')
    
    # fix values in age range for something more readable (replaces ", ", "-" e.g. '0-5')
    df['age_range'] = df['age_range'].str[1:-1].str.replace(', ', '-').astype('object')

    # calculate mean BMI for each age range and gender combination
    grouped = df.groupby(['age_range', 'gender']).bmi.mean().round(1)
    
    # assign BMI to nulls using the average BMI for the observation's age range and gender
    # based on index for rows with null values for bmi
    df.loc[df.bmi.isna(), 'bmi'] = df[df.bmi.isna()].apply(lambda x: grouped[x.age_range][x.gender], axis=1)
    
    return df

In [44]:
df = prep_data1(user_input)

In [51]:
df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
0,25,1,1,100,26


In [46]:
from sklearn.preprocessing import MinMaxScaler
# set list of columns to one-hot encode
col_list = ['gender','ever_married','work_type','residence_type','smoking_status', 'age_range']
    
# apply one-hot encoding using above list
df = pd.get_dummies(df, columns=col_list, drop_first=True)
scaler = MinMaxScaler().fit(df)
ui_scaled = scaler.transform(df)

In [47]:
ui_scaled

array([[0., 0., 0., 0., 0.]])

In [48]:
# new best model tree_depth_3