In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Data 

This dataset is used to predict stroke from attributes covering a person's medical history, demographic information, and type of work. 
First I read in the raw data, then create a new dataframe in which the categorical attributes are turned into binary (dummy) columns.

In [None]:
# Read the raw stroke dataset from the Kaggle input directory.
STROKE_CSV_PATH = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
raw_df = pd.read_csv(STROKE_CSV_PATH)

# Encoded copy of the raw data: categorical columns become indicator (dummy) columns.
df1 = pd.get_dummies(data=raw_df)

## Inspect Data

In [None]:
# Summary statistics (pandas' default describe output) for the raw, un-encoded dataframe.
raw_df.describe()

In [None]:
# Column names, non-null counts and dtypes of the dummy-encoded dataframe.
df1.info()

From the output above, we can see that we're missing BMI for some records. Let's impute the missing data with the mean.

In [None]:
# Replace missing BMI values (NaN) with the column mean.
# NOTE(review): the mean is computed over every row of df1, i.e. before any
# train/test split is made later in the notebook.
bmi_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df1['bmi'] = bmi_imputer.fit_transform(df1[['bmi']])

Now we can see that we have no null values in our dataset. 

In [None]:
# Re-inspect non-null counts after the BMI imputation.
df1.info()

## EDA

Let's do some preliminary analysis to understand the significance of the attributes and state any assumptions we're making.

This seems to be an imbalanced dataset. This may skew our accuracy metrics later down the line. 

In [None]:
# Number of records in each stroke class.
sns.countplot(data=df1, x='stroke')

In [None]:
# Bar plot of age against stroke outcome, one panel per smoking status.
g = sns.FacetGrid(
    data=raw_df,
    col="smoking_status",
    height=10,
    aspect=.5,
)
g.map(sns.barplot, "stroke", "age", order=[1, 0])

In [None]:
# Bar plot of age against stroke outcome, one panel (and one colour) per gender.
g = sns.FacetGrid(
    data=raw_df,
    col="gender",
    hue="gender",
    height=10,
    aspect=.5,
)
g.map(sns.barplot, "stroke", "age", order=[1, 0])

In [None]:
# Bar plot of age against work type, one panel (and one colour) per stroke outcome.
g = sns.FacetGrid(
    data=raw_df,
    col="stroke",
    hue="stroke",
    height=8,
    aspect=.5,
)
g.map(sns.barplot, "work_type", "age")

In [None]:
# Joint distribution of age and BMI, coloured by stroke outcome.
sns.jointplot(
    data=df1,
    x='age',
    y='bmi',
    hue='stroke',
    alpha=.2,
    height=15,
)

Age is consistently showing some correlation to the target variable. Higher age seems to be correlated with more strokes.

In [None]:
# Age vs. BMI scatter, one panel per stroke outcome.
sns.relplot(
    data=df1,
    x="age",
    y="bmi",
    col="stroke",
    alpha=.3,
)

## Check for Multicollinearity

After creating dummy variables, we should drop one redundant dummy column for each of our binary categorical features, because it is fully determined by the column we keep. This helps the model because it removes collinearity. The correlation matrix below shows that age has the strongest correlation coefficient with our target variable.

In [None]:
# Drop one dummy column from each of these encoded features so the remaining
# columns are not collinear with it.
redundant_dummies = ['gender_Female', 'ever_married_Yes', 'Residence_type_Rural']

# errors='ignore' keeps this cell re-runnable: df1 is reassigned in place, so a
# second execution would otherwise raise KeyError for the already-removed columns.
df1 = df1.drop(columns=redundant_dummies, errors='ignore')

In [None]:
# Pairwise correlation matrix of the encoded dataframe, rounded so the
# annotations in the heatmap stay readable.
corr = df1.corr().round(decimals=3)

plt.figure(figsize=(20, 20))
sns.heatmap(data=corr, annot=True)

## Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Features: every column except the target.
# NOTE(review): X appears to still contain the `id` identifier column (it is
# read back as X_test.id when the submission is written) - confirm it should
# really be used as a model feature.
X = df1.drop(columns='stroke')

# Target, kept as a single-column DataFrame.
y = df1[['stroke']]

# 70/30 split, stratified on the target so both parts keep the same stroke rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Train the regressor on the training split (fit returns the estimator itself).
linreg = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test split.
y_pred = linreg.predict(X_test)

# Report R^2 on the test split, then the root mean squared error.
r_squared = linreg.score(X_test, y_test)
print("R^2: {}".format(r_squared))

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {}".format(rmse))

Printing the coefficients of our linear model and the intercept.

In [None]:
# Show the fitted linear model's coefficient array followed by its intercept.
print(linreg.coef_, linreg.intercept_)

Linear Regression Conclusion:
Using a linear regression to predict a binary outcome doesn't make sense, and it shows in the model's performance. It would make more sense to use logistic regression, a linear model that also accepts multiple predictors but is designed for classification: it models the probability of the outcome rather than a continuous value. 

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

# Fit model to training data. y_train is a single-column frame; flatten it to
# the 1-D label vector that scikit-learn classifiers expect.
logreg.fit(X_train, np.ravel(y_train))

# Predict class labels for the test data
y_pred = logreg.predict(X_test)

# For a classifier, `score` returns mean accuracy (not R^2), so label it as such.
print("Accuracy: {}".format(logreg.score(X_test, y_test)))

# RMSE is kept for comparison with the linear regression above; assuming the
# labels are 0/1, it is just the square root of the misclassification rate.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

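We can see here that the score has dramatically improved from the linear regression. Note, however, that for a classifier `score` reports mean accuracy rather than r^2: the model classifies about 94% of the test records correctly; it does not explain 94% of the variance. 
There are other metrics that help us assess whether the model is performing well. 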

### Evaluate the Model

From the confusion matrix below we can see that, although the accuracy is about 95%, we have in fact not captured any true positives: none of the actual strokes were predicted. 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix for the test split: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Per-class precision, recall and F1 for the test-split predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Import necessary modules
from sklearn.metrics import roc_curve

# Compute predicted probabilities: y_pred_prob
y_pred_prob = logreg.predict_proba(X_test)[: ,1]

# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: inverse regularisation strength C and penalty type.
c_space = np.logspace(-5, 10, 10)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

# The grid searches both 'l1' and 'l2'. The default solver (lbfgs) does not
# support 'l1', so every l1 candidate would fail to fit; liblinear supports both.
logreg = LogisticRegression(solver='liblinear')

# Create train and test sets. Stratify on the target, as the earlier split
# does, so the rare stroke class is equally represented in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

# 5-fold cross-validated grid search over param_grid.
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit it to the training data (labels flattened to the 1-D vector classifiers expect).
logreg_cv.fit(X_train, np.ravel(y_train))

# Print the optimal parameters and the best mean cross-validated score.
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))

In [None]:
# Use the model found by the grid search. GridSearchCV (refit=True by default)
# has already refit the best hyperparameter combination on the full training
# split, so a fresh default LogisticRegression() would discard the tuning.
logreg = logreg_cv.best_estimator_

# Predict class labels for the test data
y_pred = logreg.predict(X_test)

# For a classifier, `score` returns mean accuracy (not R^2), so label it as such.
print("Accuracy: {}".format(logreg.score(X_test, y_test)))

# Assuming 0/1 labels, this RMSE is the square root of the misclassification rate.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))
 

In [None]:
# Pair each test record's id with its predicted stroke label, renumber the
# rows from zero, and write the result to submission.csv.
submission_columns = {'Id': X_test['id'], 'Stroke': y_pred}
my_submission = pd.DataFrame(submission_columns).reset_index(drop=True)
my_submission.to_csv('submission.csv', index=False)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix for the most recent predictions: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Summary

After conducting some EDA and cleaning the dataset, I was able to split the dataset into test and train sets and then run a linear regression. However, our objective is to predict a binary outcome, in which case a logistic regression is a more appropriate model. After fitting the first logistic regression, I was able to achieve 95% accuracy. However, the underlying dataset is extremely imbalanced, with far fewer samples where the target is 1 vs 0. As a result, the accuracy metric does not tell the whole story about our model's performance. Of course, the appropriate assessment of performance ultimately depends on the objective of the model. For example, in a case like this where we're predicting something serious like the likelihood of a stroke, we may be willing to trade some accuracy, accepting more false positives in exchange for fewer false negatives. 
After the first logistic regression was run, I also conducted a grid search to fine-tune my hyperparameters using GridSearchCV. After I retrieved my optimal hyperparameters, I ran the logistic regression again and obtained a slightly lower accuracy score (the value returned by `score`, which for a classifier is accuracy rather than r^2) but captured more true positives. 