In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, auc,roc_auc_score, precision_score, recall_score,f1_score, roc_curve
from matplotlib import pyplot as plt
from collections import Counter

data_path = 'https://raw.githubusercontent.com/sahdan96/randomforest/main/train_2v.csv'
data = pd.read_csv(data_path)

data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
data.dropna(axis=0, inplace=True)
data.drop(columns ='id', inplace =True)

encode_gender = LabelEncoder()
encode_marry = LabelEncoder()
encode_work = LabelEncoder()
encode_residence = LabelEncoder()
encode_smoking = LabelEncoder()
data['gender'] = encode_gender.fit_transform(data['gender'])
data['ever_married'] = encode_marry.fit_transform(data['ever_married'])
data['work_type'] = encode_work.fit_transform(data['work_type'])
data['Residence_type'] = encode_residence.fit_transform(data['Residence_type'])
data['smoking_status'] = encode_smoking.fit_transform(data['smoking_status'])

x = data.drop('stroke', axis=1)
y = data.stroke

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)



In [2]:
rus = RandomUnderSampler(random_state=0)
X_train_resampled1, Y_train_resampled1 = rus.fit_resample(X_train,y_train)

nm = NearMiss(version=3)
X_train_resampled2, Y_train_resampled2 = nm.fit_resample(X_train,y_train)



In [3]:
def train(xtrain, ytrain, xtest,ytest):
  rf_model = RandomForestClassifier(bootstrap=True, max_depth=None, 
                                  max_features='auto', max_leaf_nodes= None,
                                  min_impurity_decrease= 0.0, min_impurity_split=None,
                                  min_samples_leaf= 1, min_samples_split= 2,
                                  min_weight_fraction_leaf=0.0, 
                                  n_estimators=10, random_state=0,
                                n_jobs=None, oob_score=False,
                       verbose=0, warm_start=False)

  rf_model.fit(xtrain, ytrain)
  y_train_pred = rf_model.predict(xtrain)
  y_test_pred = rf_model.predict(xtest)
  acc = accuracy_score(ytrain, y_train_pred)
  pre = precision_score(ytrain, y_train_pred)
  re = recall_score(ytrain, y_train_pred)
  f1 = f1_score(ytrain, y_train_pred)

  acc2 = accuracy_score(ytest, y_test_pred)
  pre2 = precision_score(ytest, y_test_pred)
  re2 = recall_score(ytest, y_test_pred)
  f12 = f1_score(ytest, y_test_pred)
  print("training benchmark:")
  print("accuracy:\t"+str(acc)+"\nprecision:\t"+str(pre)+"\nrecall:\t\t"+str(re)+"\nf1:\t\t"+str(f1))
  print("\ntesting benchmark:")
  print("accuracy:\t"+str(acc2)+"\nprecision:\t"+str(pre2)+"\nrecall:\t\t"+str(re2)+"\nf1:\t\t"+str(f12))
  cm= confusion_matrix(ytrain,y_train_pred)
  cm2= confusion_matrix(ytest,y_test_pred)
  print("confusion matrix for training:\n"+str(cm)+"\n\nconfusion matrix for testing:\n"+str(cm2))

In [4]:
train(X_train_resampled1, Y_train_resampled1, X_test, y_test)

training benchmark:
accuracy:	0.9860557768924303
precision:	0.988
recall:		0.9840637450199203
f1:		0.9860279441117764

testing benchmark:
accuracy:	0.746263699767519
precision:	0.05555555555555555
recall:		0.6397058823529411
f1:		0.10223266745005874
confusion matrix for training:
[[496   6]
 [  8 494]]

confusion matrix for testing:
[[4407 1479]
 [  49   87]]


In [5]:
train(X_train_resampled2, Y_train_resampled2, X_test, y_test)

training benchmark:
accuracy:	0.9850597609561753
precision:	0.9919191919191919
recall:		0.9780876494023905
f1:		0.9849548645937813

testing benchmark:
accuracy:	0.7045831949518433
precision:	0.04386451971127152
recall:		0.5808823529411765
f1:		0.08156943727413526
confusion matrix for training:
[[498   4]
 [ 11 491]]

confusion matrix for testing:
[[4164 1722]
 [  57   79]]
