In [20]:
import pandas as pd

In [21]:
df = pd.read_csv("stroke.csv")

In [22]:
df = df.dropna()

In [23]:
print(df.dtypes)

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


In [24]:
df_y = df['stroke']
df_x = df.drop('stroke', axis=1)

In [26]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(df_x, df_y, test_size=0.25, random_state=42)

In [28]:
symmetric_cols = ['hypertension', 'heart_disease']
skewed_cols = ['id', 'age', 'avg_glucose_level', 'bmi']
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

symmetric_pipeline = Pipeline([
  ('imputer', SimpleImputer(strategy='mean')),
  ('scaler', StandardScaler())
])

skewed_pipeline = Pipeline([
  ('imputer', SimpleImputer(strategy='mean')),
  ('log_transform', FunctionTransformer(np.log1p)),
  ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
  ('encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer([
    ('numeric_symmetric', symmetric_pipeline, symmetric_cols),
    ('numeric_skewed', skewed_pipeline, skewed_cols),
    ('categorical', categorical_pipeline, categorical_cols)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

In [30]:
pipeline.fit(df)
train_transformed = pipeline.transform(train_x)
test_transformed = pipeline.transform(test_x)

In [33]:
print(train_transformed.shape)
print(test_transformed.shape)

(3681, 22)
(1228, 22)


In [34]:
from sklearn.linear_model import LinearRegression

regr = LinearRegression()
regr.fit(train_transformed, train_y)

In [35]:
y_pred = regr.predict(test_transformed)
print(y_pred)

[0.14227295 0.0291748  0.09458923 ... 0.04992676 0.01409912 0.00593567]


In [36]:
y_ans = []
for temp in y_pred:
  if temp > 0.5:
    y_ans.append(1)
  else:
    y_ans.append(0)

In [38]:
from sklearn.metrics import accuracy_score

print("Accuracy: ", accuracy_score(y_ans, test_y) * 100, "%")

Accuracy:  94.86970684039088 %


In [39]:
from sklearn import svm

model = svm.SVC(kernel='linear')
model.fit(train_transformed, train_y)

predictions = model.predict(test_transformed)

print("Accuracy: ", accuracy_score(predictions, test_y) * 100, "%")

Accuracy:  94.86970684039088 %
