In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, auc,roc_auc_score, precision_score, recall_score,f1_score, roc_curve
from matplotlib import pyplot as plt
from collections import Counter



# **Import data**

In [2]:
data_path = 'https://raw.githubusercontent.com/sahdan96/randomforest/main/train_2v.csv'
data = pd.read_csv(data_path)

# **pre-processing data**

In [3]:
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
data.dropna(axis=0, inplace=True)
data.drop(columns ='id', inplace =True)

In [4]:
encode_gender = LabelEncoder()
encode_marry = LabelEncoder()
encode_work = LabelEncoder()
encode_residence = LabelEncoder()
encode_smoking = LabelEncoder()
data['gender'] = encode_gender.fit_transform(data['gender'])
data['ever_married'] = encode_marry.fit_transform(data['ever_married'])
data['work_type'] = encode_work.fit_transform(data['work_type'])
data['Residence_type'] = encode_residence.fit_transform(data['Residence_type'])
data['smoking_status'] = encode_smoking.fit_transform(data['smoking_status'])

# **split data**

In [5]:
x = data.drop('stroke', axis=1)
y = data.stroke

In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state =42)

In [7]:
print("train class count: \n" +str(y_train.value_counts())+"\n\ntest class count: \n" + str(y_test.value_counts()))

train class count: 
0    23580
1      506
Name: stroke, dtype: int64

test class count: 
0    5890
1     132
Name: stroke, dtype: int64


# **training + testing/prediction**

In [8]:
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier
rf_model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=100, learning_rate=1,random_state=0)
# rf_model = EasyEnsembleClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)



AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                       

In [9]:
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# **train & test benchmark**

In [10]:
acc = accuracy_score(y_train, y_train_pred)
pre = precision_score(y_train, y_train_pred)
re = recall_score(y_train, y_train_pred)
f1 = f1_score(y_train, y_train_pred)

acc2 = accuracy_score(y_test, y_test_pred)
pre2 = precision_score(y_test, y_test_pred)
re2 = recall_score(y_test, y_test_pred)
f12 = f1_score(y_test, y_test_pred)
print("training benchmark:")
print("accuracy:\t"+str(acc)+"\nprecision:\t"+str(pre)+"\nrecall:\t\t"+str(re)+"\nf1:\t\t"+str(f1))
print("\ntesting benchmark:")
print("accuracy:\t"+str(acc2)+"\nprecision:\t"+str(pre2)+"\nrecall:\t\t"+str(re2)+"\nf1:\t\t"+str(f12))

training benchmark:
accuracy:	1.0
precision:	1.0
recall:		1.0
f1:		1.0

testing benchmark:
accuracy:	0.9551643972102292
precision:	0.09411764705882353
recall:		0.12121212121212122
f1:		0.10596026490066225


In [11]:
cm= confusion_matrix(y_train,y_train_pred)
cm2= confusion_matrix(y_test,y_test_pred)
print("confusion matrix for training:\n"+str(cm)+"\n\nconfusion matrix for testing:\n"+str(cm2))

confusion matrix for training:
[[23580     0]
 [    0   506]]

confusion matrix for testing:
[[5736  154]
 [ 116   16]]


In [12]:

# from numpy import mean
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedStratifiedKFold

# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# # evaluate model
# scores = cross_val_score(rf_model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
# # summarize performance
# print('Mean ROC AUC: %.3f' % mean(scores))

In [13]:
roc_auc_score(y_test, y_test_pred)

0.5475330555126819