In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, auc,roc_auc_score, precision_score, recall_score,f1_score, roc_curve
from matplotlib import pyplot as plt
from collections import Counter

# **Import data**

In [2]:
data_path = 'https://raw.githubusercontent.com/sahdan96/randomforest/main/train_2v.csv'
data = pd.read_csv(data_path)

# **pre-processing data**

In [3]:
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
data.dropna(axis=0, inplace=True)
data.drop(columns ='id', inplace =True)

In [4]:
encode_gender = LabelEncoder()
encode_marry = LabelEncoder()
encode_work = LabelEncoder()
encode_residence = LabelEncoder()
encode_smoking = LabelEncoder()
data['gender'] = encode_gender.fit_transform(data['gender'])
data['ever_married'] = encode_marry.fit_transform(data['ever_married'])
data['work_type'] = encode_work.fit_transform(data['work_type'])
data['Residence_type'] = encode_residence.fit_transform(data['Residence_type'])
data['smoking_status'] = encode_smoking.fit_transform(data['smoking_status'])

# **split data**

In [5]:
x = data.drop('stroke', axis=1)
y = data.stroke

In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [7]:
print("train class count: \n" +str(y_train.value_counts())+"\n\ntest class count: \n" + str(y_test.value_counts()))

train class count: 
0    23582
1      504
Name: stroke, dtype: int64

test class count: 
0    5888
1     134
Name: stroke, dtype: int64


# **BRF**

In [8]:
brf = RandomForestClassifier(bootstrap=True, max_depth=None, 
                                  max_features='auto', max_leaf_nodes= None,
                                  min_impurity_decrease= 0.0, min_impurity_split=None,
                                  min_samples_leaf= 1, min_samples_split= 2,
                                  min_weight_fraction_leaf=0.0, 
                                  n_estimators=100, random_state=42,
                                n_jobs=None, oob_score=False,
                       verbose=0, warm_start=False, class_weight="balanced")

In [9]:
brf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

# **training + testing/prediction**

In [10]:
y_train_pred = brf.predict(X_train)
y_test_pred = brf.predict(X_test)

# **train & test benchmark**

In [11]:
def metric(ytrue, ypred):
    acc = accuracy_score(ytrue, ypred)
    pre = precision_score(ytrue, ypred)
    re = recall_score(ytrue, ypred)
    f1 = f1_score(ytrue, ypred)
    print("accuracy:\t"+str(acc)+"\nprecision:\t"+str(pre)+"\nrecall:\t\t"+str(re)+"\nf1:\t\t"+str(f1))

In [12]:
print("training benchmark:")
metric(y_train, y_train_pred)
print("\ntesting benchmark:")
metric(y_test, y_test_pred)

training benchmark:
accuracy:	1.0
precision:	1.0
recall:		1.0
f1:		1.0

testing benchmark:
accuracy:	0.9777482563932248
precision:	0.0
recall:		0.0
f1:		0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
cm= confusion_matrix(y_train,y_train_pred)
cm2= confusion_matrix(y_test,y_test_pred)
print("confusion matrix for training:\n"+str(cm)+"\n\nconfusion matrix for testing:\n"+str(cm2))

confusion matrix for training:
[[23582     0]
 [    0   504]]

confusion matrix for testing:
[[5888    0]
 [ 134    0]]
