In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, auc,roc_auc_score, precision_score, recall_score,f1_score, roc_curve
from matplotlib import pyplot as plt
from collections import Counter

data_path = 'https://raw.githubusercontent.com/sahdan96/randomforest/main/train_2v.csv'
data = pd.read_csv(data_path)

data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
data.dropna(axis=0, inplace=True)
data.drop(columns ='id', inplace =True)

encode_gender = LabelEncoder()
encode_marry = LabelEncoder()
encode_work = LabelEncoder()
encode_residence = LabelEncoder()
encode_smoking = LabelEncoder()
data['gender'] = encode_gender.fit_transform(data['gender'])
data['ever_married'] = encode_marry.fit_transform(data['ever_married'])
data['work_type'] = encode_work.fit_transform(data['work_type'])
data['Residence_type'] = encode_residence.fit_transform(data['Residence_type'])
data['smoking_status'] = encode_smoking.fit_transform(data['smoking_status'])

x = data.drop('stroke', axis=1)
y = data.stroke

ros = RandomOverSampler(random_state=0)
smote = SMOTE(random_state=0)
adasyn = ADASYN(random_state=0)

X_resampled_ros, Y_resampled_ros = ros.fit_resample(x,y)
X_resampled_smote, Y_resampled_smote = smote.fit_resample(x,y)
X_resampled_adasyn, Y_resampled_adasyn = adasyn.fit_resample(x,y)

X_train_ros, X_test_ros, y_train_ros, y_test_ros = train_test_split(X_resampled_ros, Y_resampled_ros, test_size=0.2)
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_resampled_smote, Y_resampled_smote, test_size=0.2)
X_train_adasyn, X_test_adasyn, y_train_adasyn, y_test_adasyn = train_test_split(X_resampled_adasyn, Y_resampled_adasyn, test_size=0.2)

def model(xtrain, ytrain):
    rf_model = RandomForestClassifier(bootstrap=True, max_depth=None, 
                                  max_features='auto', max_leaf_nodes= None,
                                  min_impurity_decrease= 0.0, min_impurity_split=None,
                                  min_samples_leaf= 1, min_samples_split= 2,
                                  min_weight_fraction_leaf=0.0, 
                                  n_estimators=5, random_state=0,
                                n_jobs=None, oob_score=False,
                       verbose=0, warm_start=False)
    rf_model.fit(xtrain, ytrain)
    return rf_model

ros = model(X_train_ros,y_train_ros)
smote =model(X_train_smote,y_train_smote)
adasyn = model(X_train_adasyn,y_train_adasyn)

y_train_predicted_ros = ros.predict(X_train_ros)
y_train_predicted_smote = smote.predict(X_train_smote)
y_train_predicted_adasyn = adasyn.predict(X_train_adasyn)

y_predicted_ros = ros.predict(X_test_ros)
y_predicted_smote = smote.predict(X_test_smote)
y_predicted_adasyn = adasyn.predict(X_test_adasyn)

def metrics(y_test, y_predicted):
    cm = confusion_matrix(y_test, y_predicted)
    TN = cm[0][0]
    TP = cm[1][1]
    FN = cm[1][0]
    FP = cm[0][1]
    print('confusion matrix: TN = '+ str(TN)+',TP = '+str(TP)+',FN = '+str(FN)+',FP = '+str(FP))
    accuracy = (TP+TN)/(TP+TN+FN+FP)
    recall = TP/(FN+TP)
    precision =TP/(FP+TP)
    f1 = 2*((recall*precision)/(precision+recall))
    print("Accuracy\t=\t" , accuracy)
    print("Precision\t=\t" ,precision)
    print("Recall\t\t=\t" ,recall)
    print("F1 Score\t=\t" ,f1)
    pass



In [None]:
print("ROS:")
metrics(y_train_ros,y_train_predicted_ros)
print("\nSMOTE:")
metrics(y_train_smote,y_train_predicted_smote)
print("\nADASYN:")
metrics(y_train_adasyn,y_train_predicted_adasyn)

ROS:
confusion matrix: TN = 23504,TP = 23627,FN = 0,FP = 21
Accuracy	=	 0.9995546318289786
Precision	=	 0.9991119756427604
Recall		=	 1.0
F1 Score	=	 0.999555790586991

SMOTE:
confusion matrix: TN = 23528,TP = 23522,FN = 90,FP = 12
Accuracy	=	 0.9978367831693248
Precision	=	 0.9994900994306111
Recall		=	 0.9961883787904455
F1 Score	=	 0.9978365078691723

ADASYN:
confusion matrix: TN = 23581,TP = 23505,FN = 89,FP = 10
Accuracy	=	 0.997901875596058
Precision	=	 0.9995747395279608
Recall		=	 0.9962278545392896
F1 Score	=	 0.9978984907342546


In [None]:
print("ROS:")
metrics(y_test_ros,y_predicted_ros)
print("\nSMOTE:")
metrics(y_test_smote,y_predicted_smote)
print("\nADASYN:")
metrics(y_test_adasyn,y_predicted_adasyn)

ROS:
confusion matrix: TN = 5830,TP = 5940,FN = 0,FP = 18
Accuracy	=	 0.998473023413641
Precision	=	 0.9969788519637462
Recall		=	 1.0
F1 Score	=	 0.9984871406959154

SMOTE:
confusion matrix: TN = 5918,TP = 5741,FN = 120,FP = 9
Accuracy	=	 0.9890566677977605
Precision	=	 0.9984347826086957
Recall		=	 0.9795256782119093
F1 Score	=	 0.9888898458358453

ADASYN:
confusion matrix: TN = 5874,TP = 5777,FN = 139,FP = 7
Accuracy	=	 0.9876239721963211
Precision	=	 0.998789764868603
Recall		=	 0.9765043948613928
F1 Score	=	 0.9875213675213677
