1. Import Library

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

2. Load Dataset

In [4]:
stroke_dataset = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [5]:
stroke_dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
stroke_dataset.shape

(5110, 12)

In [8]:
from sklearn.preprocessing import LabelEncoder

# Membuat instance dari LabelEncoder
label_encoder = LabelEncoder()

# Menghapus baris dengan nilai NaN di kolom 'bmi'
stroke_dataset.dropna(subset=['bmi'], inplace=True)

# Menghapus kolom 'id'
stroke_dataset.drop(columns=['id'], inplace=True)

# Meng-encode kolom-kolom kategorikal
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'stroke']
for col in categorical_cols:
    stroke_dataset[col] = label_encoder.fit_transform(stroke_dataset[col])

# Menampilkan hasil
print(stroke_dataset)

      gender   age  hypertension  heart_disease  ever_married  work_type  \
0          1  67.0             0              1             1          2   
2          1  80.0             0              1             1          2   
3          0  49.0             0              0             1          2   
4          0  79.0             1              0             1          3   
5          1  81.0             0              0             1          2   
...      ...   ...           ...            ...           ...        ...   
5104       0  13.0             0              0             0          4   
5106       0  81.0             0              0             1          3   
5107       0  35.0             0              0             1          3   
5108       1  51.0             0              0             1          2   
5109       0  44.0             0              0             1          0   

      Residence_type  avg_glucose_level   bmi  smoking_status  stroke  
0              

In [9]:
stroke_dataset['stroke'].value_counts()

stroke
0    4700
1     209
Name: count, dtype: int64

In [10]:
# memisahkan data dan label
X = stroke_dataset.drop (columns='stroke', axis=1)
Y = stroke_dataset['stroke']

In [11]:
print(X)

      gender   age  hypertension  heart_disease  ever_married  work_type  \
0          1  67.0             0              1             1          2   
2          1  80.0             0              1             1          2   
3          0  49.0             0              0             1          2   
4          0  79.0             1              0             1          3   
5          1  81.0             0              0             1          2   
...      ...   ...           ...            ...           ...        ...   
5104       0  13.0             0              0             0          4   
5106       0  81.0             0              0             1          3   
5107       0  35.0             0              0             1          3   
5108       1  51.0             0              0             1          2   
5109       0  44.0             0              0             1          0   

      Residence_type  avg_glucose_level   bmi  smoking_status  
0                  1   

In [12]:
print(Y)

0       1
2       1
3       1
4       1
5       1
       ..
5104    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 4909, dtype: int64


3. Standarisasi Data

In [13]:
scaler = StandardScaler()

In [14]:
scaler.fit(X)

In [15]:
standardized_data = scaler.transform(X)

In [16]:
print(standardized_data)

[[ 1.19842812  1.07013796 -0.31806673 ...  2.77769839  0.98134488
  -0.35178804]
 [ 1.19842812  1.64656262 -0.31806673 ...  0.0138418   0.45926914
   0.58523176]
 [-0.83302341  0.27201152 -0.31806673 ...  1.48413156  0.70120668
   1.52225157]
 ...
 [-0.83302341 -0.34875349 -0.31806673 ... -0.50236926  0.21733161
   0.58523176]
 [ 1.19842812  0.36069224 -0.31806673 ...  1.37291993 -0.41934612
  -0.35178804]
 [-0.83302341  0.05030973 -0.31806673 ... -0.45081569 -0.34294479
  -1.28880785]]


In [17]:
X = standardized_data
Y = stroke_dataset['stroke']

In [18]:
print(X)
print(Y)

[[ 1.19842812  1.07013796 -0.31806673 ...  2.77769839  0.98134488
  -0.35178804]
 [ 1.19842812  1.64656262 -0.31806673 ...  0.0138418   0.45926914
   0.58523176]
 [-0.83302341  0.27201152 -0.31806673 ...  1.48413156  0.70120668
   1.52225157]
 ...
 [-0.83302341 -0.34875349 -0.31806673 ... -0.50236926  0.21733161
   0.58523176]
 [ 1.19842812  0.36069224 -0.31806673 ...  1.37291993 -0.41934612
  -0.35178804]
 [-0.83302341  0.05030973 -0.31806673 ... -0.45081569 -0.34294479
  -1.28880785]]
0       1
2       1
3       1
4       1
5       1
       ..
5104    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 4909, dtype: int64


In [30]:
from imblearn.over_sampling import SMOTE

# Menyeimbangkan data dengan SMOTE
smote = SMOTE(random_state=42)
X_resampled, Y_resampled = smote.fit_resample(X, Y)

4. Memisahkan Data training dan Data testing

In [41]:
X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=42)

In [42]:
print(X.shape, X_train.shape, X_test.shape)

(4909, 10) (7520, 10) (1880, 10)


5. Membuat data latih dengan algoritma SVM

In [43]:
classifier = svm.SVC(kernel='linear')

In [44]:
classifier.fit(X_train, Y_train)

6. Membuat model evaluasi untuk mengukur tingkat akurasi

In [45]:
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [46]:
print('Akurasi data Training adalah =', training_data_accuracy)

Akurasi data Training adalah = 0.7800531914893617


In [47]:
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [48]:
print('Akurasi data testing adalah =', test_data_accuracy)

Akurasi data testing adalah = 0.7803191489361702


7. Membuat Model Prediksi

In [52]:
# Input data baru
input_data = (1, 67, 0, 1, 1, 2, 1, 228.69, 36.6, 1)

# Mengubah input data menjadi numpy array
input_data_as_numpy_array = np.array(input_data)

# Mereshape data untuk prediksi
input_data_reshape = input_data_as_numpy_array.reshape(1, -1)

# Menskalakan input data
std_data = scaler.transform(input_data_reshape)
print(std_data)

# Membuat prediksi
prediction = classifier.predict(std_data)
print(prediction)

# Menampilkan hasil prediksi
if prediction[0] == 0:
    print("pasien tidak stroke")
else:
    print("pasien terkena stroke")

[[ 1.19842812  1.07013796 -0.31806673  4.38196829  0.72948428 -0.15569667
   0.98563987  2.77769839  0.98134488 -0.35178804]]
[1]
pasien terkena stroke




8. Simpan Model

In [51]:
import pickle

In [53]:
filename = 'stroke_model.sav'
pickle.dump(classifier, open(filename,'wb'))