In [1]:
# Script_v2

#Make imports
# call post_analysis_model_prep()
# make model
# take user input and put inside dictionary
# run user input against model and make a prediction
# output prediction

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [3]:
def post_analysis_model_prep():
    """ 
        Acquires the required data for modeling,
        Selects columns chosen from exploration,
        Encodes ever_married, hypertension, and heart_disease columns,
        Splits data into train (60%), validate (20%), and test (20%),
        Isolates target from each split into X and y,
        Applies SMOTE+Tomek oversampling to address the class imbalance, and
        Returns the dataframes.
    """
    # acquire data
    df = pd.read_csv('healthcare-dataset-stroke-data.csv')
    # select columns
    df = df[['stroke','hypertension','heart_disease','ever_married','avg_glucose_level', 'age']]
    # encode columns
    df = pd.get_dummies(df, columns=['hypertension','heart_disease','ever_married'])
    # split data
    trainvalidate, test = train_test_split(df, test_size=.2, random_state=777)
    train, validate = train_test_split(trainvalidate, test_size=.25, random_state=777)
    # isolate target
    X_train, y_train = train.drop(columns='stroke'), train.stroke
    X_validate, y_validate = validate.drop(columns='stroke'), validate.stroke
    X_test, y_test = test.drop(columns='stroke'), test.stroke
    # apply SMOTE+Tomek oversampling
    X_train_smtom, y_train_smtom = model.smoter(X_train, y_train)

    # return dataframes
    return X_train_smtom, y_train_smtom, X_validate, y_validate, X_test, y_test

In [4]:
X_train, y_train, X_validate, y_validate, X_test, y_test = post_analysis_model_prep()

Before SMOTE applied: (3066, 8) (3066,)
After SMOTE applied: (5614, 8) (5614,)


In [5]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape

((5614, 8), (5614,), (1022, 8), (1022,), (1022, 8), (1022,))

In [7]:
tree = DecisionTreeClassifier(max_depth=1, random_state=123).fit(X_train, y_train)

In [8]:
# def decisiontree(X_insample, y_insample, X_outsample, y_outsample):
#     """ Creates decision trees with max_depth 1,2,3,5,10 and random_state=123 """
#     # set loop list
#     max_depths = [1]
#     # loop through max depths
#     for depth in max_depths:
#         # create decision trees
#         tree = DecisionTreeClassifier(max_depth=depth, random_state=123)\
#             .fit(X_insample, y_insample.in_actuals)
#         # make predictions in new columns
#         y_insample['tree_maxdepth' + str(depth)] = tree.predict(X_insample)
#         y_outsample['tree_maxdepth' + str(depth)] = tree.predict(X_outsample)

#     return y_insample, y_outsample # return dataframe with predictions appended

In [16]:
X_train.head(1)

Unnamed: 0,avg_glucose_level,age,hypertension_0,hypertension_1,heart_disease_0,heart_disease_1,ever_married_No,ever_married_Yes
0,94.77,75.0,1,0,1,0,0,1


In [15]:
X_train.columns

Index(['avg_glucose_level', 'age', 'hypertension_0', 'hypertension_1',
       'heart_disease_0', 'heart_disease_1', 'ever_married_No',
       'ever_married_Yes'],
      dtype='object')

In [12]:
user_input = X_train.head(0)

In [31]:
age = input('What is your age?') 
glucose = input('What is your glucose level in mg/dL? (0-300)')
ht = input('Do you have hypertension? yes/no')
hd = input('Do you have heart disease? yes/no')
married = input('Have you ever been married? yes/no')

What is your age? 25
What is your glucose level in mg/dL? (0-300) 100
Do you have hypertension? yes/no yes
Do you have heart disease? yes/no yes
Have you ever been married? yes/no no


In [32]:
user_inputdict = {'avg_glucose_level': glucose, 
                  'age': age, 
                  'hypertension_0': ht=='no', 
                  'hypertension_1': ht=='yes',
                  'heart_disease_0': hd=='no', 
                  'heart_disease_1': hd=='yes', 
                  'ever_married_No': married=='no',
                  'ever_married_Yes': married=='yes'
                 }

In [33]:
user_inputdict

{'avg_glucose_level': '100',
 'age': '25',
 'hypertension_0': False,
 'hypertension_1': True,
 'heart_disease_0': False,
 'heart_disease_1': True,
 'ever_married_No': True,
 'ever_married_Yes': False}

In [34]:
user_input1 = user_input.append(user_inputdict, ignore_index=True)

In [35]:
pred = tree.predict_proba(user_input1)

In [36]:
pred[0][1]

0.10396039603960396