In [57]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [58]:
# Location of the dataset CSV, relative to the notebook's working directory
file_path = Path("../Stroke-Prediction-Analysis/stroke_encoded_no_NaN.csv")

# Parse the CSV into a DataFrame and preview its first five rows
stroke_df = pd.read_csv(filepath_or_buffer=file_path)
stroke_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Gender,Age,Hypertension,Heart_Disease,Ever_Married,Work_Type,Residence_Type,Average_Glucose,BMI,Smoking_Status,Stroke
0,0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
2,3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
3,4,0,79.0,1,0,1,3,0,174.12,24.0,2,1
4,5,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [77]:
# Class balance of the target: number of rows for each "Stroke" value
stroke_df.groupby(by="Stroke").size()

Stroke
0    4699
1     209
dtype: int64

In [59]:
# Remove the "Unnamed: 0" column (the ID/index column that came in with the CSV);
# it must not be used as a model feature.
# errors="ignore" keeps this cell idempotent: re-running it after the column has
# already been dropped is a no-op instead of raising KeyError.
stroke_df = stroke_df.drop(columns="Unnamed: 0", errors="ignore")
stroke_df.head()

Unnamed: 0,Gender,Age,Hypertension,Heart_Disease,Ever_Married,Work_Type,Residence_Type,Average_Glucose,BMI,Smoking_Status,Stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,1,80.0,0,1,1,2,0,105.92,32.5,2,1
2,0,49.0,0,0,1,2,1,171.23,34.4,3,1
3,0,79.0,1,0,1,3,0,174.12,24.0,2,1
4,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [60]:
# Standardize every feature column (everything except the "Stroke" target).
# NOTE(review): the scaler is fit on all rows, before the train/test split that
# happens later in this notebook, so test rows influence the scaling statistics.
# Consider fitting the scaler on the training rows only.
stroke_scaled = (
    StandardScaler()
    .fit_transform(stroke_df.drop(columns=["Stroke"]))
)
print(stroke_scaled[:5])

[[ 1.20024032  1.06993757 -0.31810241  4.3814987   0.72927032 -0.15571291
   0.98543612  2.77779723  0.98114481 -0.35182832]
 [ 1.20024032  1.64633634 -0.31810241  4.3814987   0.72927032 -0.15571291
  -1.01477913  0.01401584  0.45908589  0.58510786]
 [-0.83316648  0.27184695 -0.31810241 -0.22823241  0.72927032 -0.15571291
   0.98543612  1.48426559  0.70101563  1.52204404]
 [-0.83316648  1.60199798  3.14364174 -0.22823241  0.72927032  0.75954335
  -1.01477913  1.54932488 -0.62323138  0.58510786]
 [ 1.20024032  1.69067471 -0.31810241 -0.22823241  0.72927032 -0.15571291
   0.98543612  1.8214933   0.01342584 -0.35182832]]


In [61]:
# PCA model that keeps only the first two principal components
pca = PCA(n_components=2)

# Fit on the standardized features and project them onto those two components
stroke_pca = pca.fit_transform(X=stroke_scaled)

In [62]:
# Wrap the projected values in a DataFrame, one labelled column per component
pca_columns = ["principal component 1", "principal component 2"]
stroke_df_pca = pd.DataFrame(stroke_pca, columns=pca_columns)
stroke_df_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,2.488985,3.767351
1,2.25363,2.46436
2,1.532946,-0.559824
3,1.924982,1.590286
4,1.422948,1.184692


In [63]:
# Fetch the explained variance
# Fraction of the total variance captured by each retained component; the two
# values in the recorded output sum to roughly 0.39.
pca.explained_variance_ratio_

array([0.263662  , 0.12536727])

In [64]:
# Re-create the PCA model, this time keeping eight principal components
# (this replaces the two-component `pca` defined earlier).
pca = PCA(n_components=8)

# Fit on the standardized stroke features and project them onto the eight components.
# NOTE(review): like the scaler, this PCA is fit on all rows before the train/test
# split made later in the notebook — consider fitting on the training rows only.
stroke_pca = pca.fit_transform(X=stroke_scaled)

In [65]:
# Transform PCA data to a DataFrame.
# Column labels ("principal component 1", "principal component 2", ...) are
# generated from the number of components actually present in stroke_pca, so the
# label list cannot fall out of sync with PCA(n_components=...) and trigger a
# column-count mismatch when that setting is changed.
stroke_df_pca = pd.DataFrame(
    data=stroke_pca,
    columns=[
        f"principal component {component}"
        for component in range(1, stroke_pca.shape[1] + 1)
    ],
)
stroke_df_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8
0,2.488985,3.767351,1.199682,-0.936427,-1.723709,-1.854032,1.269086,1.158969
1,2.25363,2.46436,-0.665549,-2.012249,-2.791431,0.255296,0.601441,1.189048
2,1.532946,-0.559824,0.885919,0.744877,0.301305,-0.831053,1.347727,-0.577539
3,1.924982,1.590286,-1.033246,2.533615,0.360102,1.362742,-0.348467,-1.587259
4,1.422948,1.184692,0.994409,-0.52848,0.692127,-1.459186,-0.381744,-1.180354


In [66]:
# Fetch the explained variance
# Fraction of the total variance captured by each of the eight components; the
# values in the recorded output sum to roughly 0.91.
pca.explained_variance_ratio_

array([0.263662  , 0.12536727, 0.10002026, 0.0950183 , 0.09064806,
       0.08276817, 0.08059674, 0.07020833])

# Fit a Logistic Regression model


In [67]:
# Features: the principal-component DataFrame; target: the "Stroke" column
X = stroke_df_pca
y = stroke_df["Stroke"]

In [68]:
from sklearn.model_selection import train_test_split

# Hold out a test set. Stratifying on y keeps the stroke / no-stroke ratio the
# same in both parts; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

# Rows and columns of the training features
X_train.shape

(3681, 8)

In [69]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with the lbfgs solver, an iteration cap of
# 200, and a fixed seed for repeatability
classifier = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs')

In [70]:
# Train the baseline classifier on the (un-resampled) training split
classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=200, random_state=1)

In [71]:
# Predict the held-out rows with the baseline classifier
y_pred = classifier.predict(X_test)

# Put predictions next to the true labels, then replace the original row labels
# carried by y_test with a fresh 0..n-1 index
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [72]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Plain accuracy. On its own this is misleading here: the test split is about
# 96% "no stroke" (1175 of 1227 rows, per the class counts shown further down),
# so a model that almost always predicts 0 still scores close to 0.96.
print(accuracy_score(y_test, y_pred))

# Balanced accuracy (mean of the per-class recalls) is not inflated by the
# majority class, and it is the metric used for the SMOTE-trained model later in
# this notebook, so the two models can be compared on the same scale.
print(balanced_accuracy_score(y_test, y_pred))

0.9584352078239609


In [73]:
# Number of distinct labels present in the predictions
# (a value of 1 would mean the model predicted the same class for every row)
len(set(y_pred))

2

In [74]:
# Last five prediction / actual pairs
results.tail(n=5)

Unnamed: 0,Prediction,Actual
1222,0,0
1223,0,0
1224,0,0
1225,0,0
1226,0,0


In [76]:
# Number of test rows per true class
results.groupby(by="Actual").size()

Actual
0    1175
1      52
dtype: int64

In [80]:
from collections import Counter

In [81]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only (the test split is left untouched) so that
# both classes end up with the same number of rows
smote = SMOTE(sampling_strategy='auto', random_state=1)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Class counts after resampling
Counter(y_resampled)

Counter({0: 3524, 1: 3524})

In [82]:
# Train a second LogisticRegression model, this time on the SMOTE-resampled
# training data. The hyperparameters match the baseline classifier defined
# earlier (solver='lbfgs', max_iter=200, random_state=1) so that resampling is
# the only difference between the two models being compared.
model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [84]:
from sklearn.metrics import balanced_accuracy_score

In [85]:
# Predict the untouched test split with the SMOTE-trained model
# (this overwrites the baseline model's y_pred)
y_pred = model.predict(X_test)

# Balanced accuracy of those predictions
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.7365302782324059

In [88]:
# Distinct labels predicted by the SMOTE-trained model, in first-seen order
list(Counter(y_pred).keys())

[0, 1]

In [89]:
# How many test rows were predicted as each class
dict(Counter(y_pred)).items()

dict_items([(0, 839), (1, 388)])

In [90]:
# How many test rows actually belong to each class, for comparison with the
# predicted counts
dict(Counter(y_test)).items()

dict_items([(0, 1175), (1, 52)])