In [None]:
# Import the core data libraries
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the stroke dataset from the Kaggle input directory and preview the first rows.
DATA_PATH = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
raw_data = pd.read_csv(DATA_PATH)
raw_data.head()

In [None]:
# Inspect column dtypes and non-null counts to spot data that still needs cleaning
raw_data.info()

In [None]:
# Features to include in the analysis, split by how they are treated below.
# Categorical features (cast to pandas 'category' dtype and label-encoded later on).
cat_features = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
# Continuous numeric features.
cont_features = [
    'age',
    'avg_glucose_level',
    'bmi',
]

In [None]:
# Print the frequency table of every categorical feature and of the target.
for column in [*cat_features, 'stroke']:
    counts = raw_data[column].value_counts()
    print(column + ':')
    print(counts, '\n')

In [None]:
# Class balance of the target, shown as proportions instead of raw counts
raw_data['stroke'].value_counts(normalize = True)

In [None]:
# Working copy of the raw frame without the 'id' column.
# drop() returns a new DataFrame, so raw_data itself is left untouched.
data = raw_data.drop(columns='id')

In [None]:
# Treat the 'Unknown' smoking status as a missing value.
# Assign the result back to the column: calling replace(..., inplace=True) on
# data['smoking_status'] is chained assignment through an inplace method, which
# warns on pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['smoking_status'] = data['smoking_status'].replace('Unknown', np.nan)


In [None]:

# Cast every categorical feature to the pandas 'category' dtype in one call.
data = data.astype({column: 'category' for column in cat_features})

In [None]:

# Make smoking_status an ordered categorical:
# 'never smoked' < 'formerly smoked' < 'smokes'.
smoking_order = ['never smoked', 'formerly smoked', 'smokes']
data['smoking_status'] = data['smoking_status'].cat.reorder_categories(smoking_order, ordered=True)

In [None]:
# Count missing values per column
data.isna().sum()

In [None]:
# Summarise missing data: the share of rows with a missing value in 'bmi' and
# 'smoking_status', followed by the stroke counts among those rows.
na_summary = data.isna().sum()
total_count = len(data)

ratio_templates = {
    'bmi': 'bmi:            {:.2f}',
    'smoking_status': 'smoking_status: {:.2f}',
}
section_titles = {
    'bmi': 'missing bmi vs. stroke',
    'smoking_status': 'missing smoking_status vs. stroke',
}

print('missing data ratio')
for column, template in ratio_templates.items():
    print(template.format(na_summary[column] / total_count))

for column, title in section_titles.items():
    print()
    print(title)
    rows_missing = data[data[column].isna()]
    print(rows_missing['stroke'].value_counts())

In [None]:
# Visualise each categorical feature as a bar chart of category counts
for column in cat_features:
    fig, ax = plt.subplots()
    sns.countplot(x=data[column], ax=ax)
    plt.show()

In [None]:
# Look at how the continuous features are distributed, coloured by stroke outcome
pair_columns = ['stroke'] + cont_features
pair_frame = data[pair_columns].sort_values('stroke')
sns.pairplot(pair_frame, hue='stroke', height=4)

In [None]:

# Label-encoded copy of the data: every categorical column is replaced by its
# integer category codes (pandas uses -1 for missing values).
data_label_enc = data.assign(**{column: data[column].cat.codes for column in cat_features})

In [None]:
# Annotated correlation matrix of the label-encoded data on a 10x10 inch canvas.
fig, ax = plt.subplots(figsize=(10, 10))
corr_matrix = data_label_enc.corr()
sns.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Univariate chi-squared ranking of all features against the 'stroke' target.

# Only rows without any missing value are used for the ranking.
data_fs = data.dropna()

# Replace categorical columns by their integer category codes.
for f in cat_features:
    data_fs[f] = data_fs[f].cat.codes

# Select the target by name and place it last explicitly, instead of relying on
# 'stroke' happening to be the final column of the CSV. The feature names are
# remembered here so the scores below are labelled with exactly the columns
# that were scored.
fs_feature_names = [c for c in data_fs.columns if c != 'stroke']
data_fs = data_fs[fs_feature_names + ['stroke']]

# Min-max scaling maps every column into [0, 1]; chi2 requires non-negative inputs.
data_fs = MinMaxScaler().fit_transform(data_fs)

X_fs = data_fs[:,:-1]
Y_fs = data_fs[:,-1:]

# The target is passed as a 1-D vector (Y_fs itself stays a column for compatibility).
fit = SelectKBest(score_func = chi2, k = 4).fit(X_fs, Y_fs.ravel())

df_scores = pd.DataFrame(fit.scores_)
# Label the scores with the feature names only. Using data.columns here would
# also include the target and add a spurious 'stroke' row with a NaN score.
df_columns = pd.DataFrame(fs_feature_names)
feature_scores = pd.concat([df_columns, df_scores],axis = 1)
feature_scores.columns = ['feature','score']
fs_results = feature_scores.nlargest(20,'score')
fs_results

In [None]:
# Keep the four best-scoring features from the ranking and take them from the
# cleaned data (not the row-reduced, scaled copy used for scoring).
top_features = fs_results['feature'].iloc[:4]
final_features = top_features.values
data_final_features = data[final_features].copy()
data_final_features.head()

In [None]:
# Check whether the selected features still contain missing values
data_final_features.isna().sum()

In [None]:
# Encoded copy of the selected features: categorical columns become their
# integer category codes (missing values are encoded as -1 by pandas).
categorical_columns = data_final_features.select_dtypes('category').columns
data_enc = data_final_features.assign(
    **{column: data_final_features[column].cat.codes for column in categorical_columns}
)

In [None]:
from sklearn.model_selection import train_test_split

# Features are the encoded selection; the target comes from the cleaned data.
X = data_enc.copy()
y = data['stroke']

# Hold out 33% for testing; stratify so both parts keep the same stroke ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [None]:
# Size and positive-class share of the training target before resampling.
train_size = y_train.size
stroke_share = y_train.sum() / train_size
print("dataset size: " + str(train_size))
print("stroke ratio: " + str(stroke_share))

In [None]:
from imblearn.combine import SMOTEENN

# Rebalance the training set only; the test set is left untouched.
# NOTE(review): this cell overwrites X_train / y_train, so running it twice
# resamples already-resampled data. Re-run the split cell first.
smt = SMOTEENN(sampling_strategy=0.7, random_state=42)
X_train, y_train = smt.fit_resample(X_train, y_train)

resampled_size = y_train.size
resampled_share = y_train.sum() / resampled_size
print("dataset size: " + str(resampled_size))
print("stroke ratio: " + str(resampled_share))

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training data only, then apply the
# same transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with default settings.
# fit() returns the estimator itself, so it can be chained onto the constructor.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# sklearn.metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# (estimator, X, y) arguments. The old name is kept bound to the replacement so
# any cell that still calls plot_confusion_matrix(...) keeps working.
plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

# Confusion matrix of the fitted model on the held-out test set.
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)

# Per-class precision / recall / F1 for the test predictions.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: with/without bootstrap sampling and three forest sizes.
param_grid = dict(
    bootstrap=[True, False],
    n_estimators=[100, 200, 400],
)

# Cross-validated grid search over a seeded random forest, optimising F1.
# NOTE(review): the search runs on the already resampled training data, so the
# validation folds contain resampled points - treat the CV scores as optimistic.
forest = RandomForestClassifier(random_state=42)
model = GridSearchCV(forest, param_grid, scoring='f1').fit(X_train, y_train)

y_pred = model.predict(X_test)

# Show the best configuration found by the search.
model.best_estimator_

In [None]:
# Imported in this cell so it does not depend on names bound by earlier cells.
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# sklearn.metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# (estimator, X, y) arguments.
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)

# Per-class precision / recall / F1 for the test predictions.
print(classification_report(y_test, y_pred))