In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Memuat dataset dari file CSV
df = pd.read_csv("brain_stroke.csv")

# Preprocessing

In [3]:
# Mengecek missing values
print("Missing values in each column:")
print(df.isnull().sum())

Missing values in each column:
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


In [4]:
# Mengecek imbalance data
print("\nImbalance data before resampling:")
print(df['stroke'].value_counts())


Imbalance data before resampling:
stroke
0    4733
1     248
Name: count, dtype: int64


In [5]:
# Menghapus data di mana avg_glucose_level di bawah 100 pada stroke bernilai 1
df = df[~((df['stroke'] == 1) & (df['avg_glucose_level'] < 100))]

In [None]:
# One-Hot Encoding untuk kolom 'gender'
ohc = OneHotEncoder(drop=None, sparse_output=False)
df_gender = ohc.fit_transform(df[['gender']])
df_gender2 = pd.DataFrame(df_gender, columns=ohc.get_feature_names_out(['gender']))
df = pd.concat([df.drop(columns=['gender']), df_gender2], axis=1)



In [None]:
# Encoding untuk 'smoking_status' dan 'ever_married'
label_encoder = LabelEncoder()
smoking_mapping = {"never smoked": 1, "Unknown": 0, "formerly smoked": 2, "smokes": 3}
df['smoking_status'] = df['smoking_status'].map(smoking_mapping)
ever_married_mapping = {"Yes": 1, "No": 0}
df['ever_married'] = df['ever_married'].map(ever_married_mapping)

In [8]:
# One-Hot Encoding untuk 'work_type'
df_work_type = ohc.fit_transform(df[['work_type']])
df_work_type2 = pd.DataFrame(df_work_type, columns=ohc.get_feature_names_out(['work_type']))
df = pd.concat([df.drop(columns=['work_type']), df_work_type2], axis=1)
df.rename(columns={'work_type_Self-employed': 'work_type_Self_employed'}, inplace=True)



In [None]:
# Encoding untuk 'Residence_type'
residence_mapping = {"Urban": 1, "Rural": 0}
df['Residence_type'] = df['Residence_type'].map(residence_mapping)

In [10]:
# Menghapus baris dengan nilai NaN di kolom stroke
df = df.dropna(subset=['stroke'])

In [11]:
# Memastikan tidak ada nilai NaN di dataframe setelah preprocessing
df = df.dropna()

In [None]:
# Hapus fitur 'work_type_nan'
# df = df.drop(columns=['work_type_nan'])

# Processing

In [14]:
# Scaling menggunakan StandardScaler
scaler = StandardScaler()
X = df.drop(['stroke'], axis='columns')
X_scaled = scaler.fit_transform(X)
y = df['stroke']

In [15]:
#Resampling menggunakan RandomOverSampler
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_scaled, y)

In [16]:
# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [17]:
# Membuat model Logistic Regression
lg = LogisticRegression(solver='liblinear', max_iter=200)

In [18]:
# Melatih model
lg.fit(X_train, y_train)

In [19]:
# Hyperparameter tuning menggunakan GridSearchCV
param_grid = {'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(estimator=lg, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Evaluasi

In [20]:
# Evaluasi model
y_pred = grid_search.predict(X_test)
print("Parameter terbaik:", grid_search.best_params_)
print("Akurasi terbaik pada data latih (resampled):", grid_search.best_score_)
print("Akurasi Model:", accuracy_score(y_test, y_pred))
print("\nLaporan Klasifikasi:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Parameter terbaik: {'C': 0.7, 'penalty': 'l2'}
Akurasi terbaik pada data latih (resampled): 0.8656832094976948
Akurasi Model: 0.8534342888047594

Laporan Klasifikasi:
              precision    recall  f1-score   support

         0.0       0.87      0.83      0.85       908
         1.0       0.84      0.88      0.86       941

    accuracy                           0.85      1849
   macro avg       0.85      0.85      0.85      1849
weighted avg       0.85      0.85      0.85      1849


Confusion Matrix:
[[752 156]
 [115 826]]


In [21]:
df.head(10)

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,gender_Female,gender_Male,work_type_Govt_job,work_type_Private,work_type_Self_employed,work_type_children
0,67.0,0.0,1.0,1,1,228.69,36.6,2,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,80.0,0.0,1.0,1,0,105.92,32.5,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,49.0,0.0,0.0,1,1,171.23,34.4,3,1.0,1.0,0.0,0.0,1.0,0.0,0.0
3,79.0,1.0,0.0,1,0,174.12,24.0,1,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,81.0,0.0,0.0,1,1,186.21,29.0,2,1.0,0.0,1.0,0.0,1.0,0.0,0.0
9,61.0,0.0,1.0,1,0,120.46,36.8,3,1.0,0.0,1.0,0.0,1.0,0.0,0.0
10,54.0,0.0,0.0,1,1,104.51,27.3,3,1.0,0.0,1.0,0.0,1.0,0.0,0.0
11,79.0,0.0,1.0,1,1,214.09,28.2,1,1.0,1.0,0.0,1.0,0.0,0.0,0.0
12,50.0,1.0,0.0,1,0,167.41,30.9,1,1.0,1.0,0.0,0.0,0.0,1.0,0.0
13,64.0,0.0,1.0,1,1,191.61,37.5,3,1.0,1.0,0.0,0.0,0.0,1.0,0.0


# Simpan model

In [22]:
import pickle

In [23]:
filename = 'brain_stroke_model.sav'
pickle.dump(grid_search, open(filename,'wb'))

In [24]:
with open('scaler.sav', 'wb') as file:
    pickle.dump(scaler, file)