In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, auc,roc_auc_score, precision_score, recall_score,f1_score, roc_curve
from matplotlib import pyplot as plt
from collections import Counter



# **Import data**

In [2]:
data_path = 'https://raw.githubusercontent.com/sahdan96/randomforest/main/train_2v.csv'
data = pd.read_csv(data_path)

# **pre-processing data**

In [3]:
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
data.dropna(axis=0, inplace=True)
data.drop(columns ='id', inplace =True)

In [4]:
encode_gender = LabelEncoder()
encode_marry = LabelEncoder()
encode_work = LabelEncoder()
encode_residence = LabelEncoder()
encode_smoking = LabelEncoder()
data['gender'] = encode_gender.fit_transform(data['gender'])
data['ever_married'] = encode_marry.fit_transform(data['ever_married'])
data['work_type'] = encode_work.fit_transform(data['work_type'])
data['Residence_type'] = encode_residence.fit_transform(data['Residence_type'])
data['smoking_status'] = encode_smoking.fit_transform(data['smoking_status'])

# **undersampling**

In [5]:
x = data.drop('stroke', axis=1)
y = data.stroke

In [6]:
rus = RandomUnderSampler(random_state=0)
X_train_resampled, Y_train_resampled = rus.fit_resample(x,y)



In [7]:
## train class count
print("before sampling:\n"+ str(Counter(y))+"\n\nafter sampling:\n"+ str(Counter(Y_train_resampled)))

before sampling:
Counter({0: 29470, 1: 638})

after sampling:
Counter({0: 638, 1: 638})


# **split data**

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_train_resampled, Y_train_resampled, test_size=0.2)

In [9]:
print("train class count: \n" +str(Counter(y_train))+"\n\ntest class count: \n" + str(Counter(y_test)))

train class count: 
Counter({0: 511, 1: 509})

test class count: 
Counter({1: 129, 0: 127})


# **training + testing/prediction**

In [10]:
rf_model = RandomForestClassifier(bootstrap=True, max_depth=None, 
                                  max_features='auto', max_leaf_nodes= None,
                                  min_impurity_decrease= 0.0, min_impurity_split=None,
                                  min_samples_leaf= 1, min_samples_split= 2,
                                  min_weight_fraction_leaf=0.0, 
                                  n_estimators=10, random_state=42,
                                n_jobs=None, oob_score=False,
                       verbose=0, warm_start=False)

rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [11]:
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# **train & test benchmark**

In [12]:
acc = accuracy_score(y_train, y_train_pred)
pre = precision_score(y_train, y_train_pred)
re = recall_score(y_train, y_train_pred)
f1 = f1_score(y_train, y_train_pred)

acc2 = accuracy_score(y_test, y_test_pred)
pre2 = precision_score(y_test, y_test_pred)
re2 = recall_score(y_test, y_test_pred)
f12 = f1_score(y_test, y_test_pred)
print("training benchmark:")
print("accuracy:\t"+str(acc)+"\nprecision:\t"+str(pre)+"\nrecall:\t\t"+str(re)+"\nf1:\t\t"+str(f1))
print("\ntesting benchmark:")
print("accuracy:\t"+str(acc2)+"\nprecision:\t"+str(pre2)+"\nrecall:\t\t"+str(re2)+"\nf1:\t\t"+str(f12))

training benchmark:
accuracy:	0.9901960784313726
precision:	0.994059405940594
recall:		0.9862475442043221
f1:		0.9901380670611439

testing benchmark:
accuracy:	0.6875
precision:	0.6737588652482269
recall:		0.7364341085271318
f1:		0.7037037037037037


In [13]:
cm= confusion_matrix(y_train,y_train_pred)
cm2= confusion_matrix(y_test,y_test_pred)
print("confusion matrix for training:\n"+str(cm)+"\n\nconfusion matrix for testing:\n"+str(cm2))

confusion matrix for training:
[[508   3]
 [  7 502]]

confusion matrix for testing:
[[81 46]
 [34 95]]


In [14]:
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.70      0.64      0.67       127
           1       0.67      0.74      0.70       129

    accuracy                           0.69       256
   macro avg       0.69      0.69      0.69       256
weighted avg       0.69      0.69      0.69       256



In [19]:
rr = rf_model.predict_proba(X_test)[:,1]
float(rr[1])

0.3