In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, auc,roc_auc_score, precision_score, recall_score,f1_score, roc_curve
from matplotlib import pyplot as plt
from collections import Counter

# **Import data**

In [2]:
data_path = 'https://raw.githubusercontent.com/sahdan96/randomforest/main/train_2v.csv'
data = pd.read_csv(data_path)

# **pre-processing data**

In [3]:
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
data.dropna(axis=0, inplace=True)
data.drop(columns ='id', inplace =True)

In [4]:
encode_gender = LabelEncoder()
encode_marry = LabelEncoder()
encode_work = LabelEncoder()
encode_residence = LabelEncoder()
encode_smoking = LabelEncoder()
data['gender'] = encode_gender.fit_transform(data['gender'])
data['ever_married'] = encode_marry.fit_transform(data['ever_married'])
data['work_type'] = encode_work.fit_transform(data['work_type'])
data['Residence_type'] = encode_residence.fit_transform(data['Residence_type'])
data['smoking_status'] = encode_smoking.fit_transform(data['smoking_status'])

# **split data**

In [5]:
x = data.drop('stroke', axis=1)
y = data.stroke

In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [7]:
print("train class count: \n" +str(y_train.value_counts())+"\n\ntest class count: \n" + str(y_test.value_counts()))

train class count: 
0    23556
1      530
Name: stroke, dtype: int64

test class count: 
0    5914
1     108
Name: stroke, dtype: int64


In [8]:
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [9]:
for feature in zip(data, clf.feature_importances_):
    print(feature)

('gender', 0.030876746593532983)
('age', 0.17966999028652708)
('hypertension', 0.018237476196986863)
('heart_disease', 0.016431106120261562)
('ever_married', 0.014789072071129268)
('work_type', 0.041886187715357195)
('Residence_type', 0.03192947055931422)
('avg_glucose_level', 0.344191871117918)
('bmi', 0.27649237523194903)
('smoking_status', 0.045495704107023804)


In [10]:
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(clf, threshold=0.15)

# Train the selector
sfm.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                                 class_weight=None,
                                                 criterion='gini',
                                                 max_depth=None,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 max_samples=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100, n_jobs=-1,
                                                 oob_score=False,
 

In [11]:
for feature_list_index in sfm.get_support(indices=True):
    print(data.columns[feature_list_index])

age
avg_glucose_level
bmi


In [12]:
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

In [13]:
# from imblearn.ensemble import BalancedRandomForestClassifier

clf_important = RandomForestClassifier(n_estimators=100, random_state=0, class_weight={0:1, 1: 47})

# Train the new classifier on the new dataset containing the most important features
clf_important.fit(X_important_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 1, 1: 47}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=0, verbose=0, warm_start=False)

In [14]:
y_important_pred = clf_important.predict(X_important_test)

# View The Accuracy Of Our Limited Feature (2 Features) Model
acc2 = accuracy_score(y_test, y_important_pred)
pre2 = precision_score(y_test, y_important_pred)
re2 = recall_score(y_test, y_important_pred)
f12 = f1_score(y_test, y_important_pred)
acc2, pre2,re2,f12

(0.9818997010959813, 0.4, 0.018518518518518517, 0.035398230088495575)

In [15]:
y_important_train_pred = clf_important.predict(X_important_train)

# View The Accuracy Of Our Limited Feature (2 Features) Model
acc3 = accuracy_score(y_train, y_important_train_pred)
pre3 = precision_score(y_train, y_important_train_pred)
re3 = recall_score(y_train, y_important_train_pred)
f13 = f1_score(y_train, y_important_train_pred)
acc3, pre3,re3,f13

(0.9999169642115752, 1.0, 0.9962264150943396, 0.998109640831758)

In [16]:
X_important_train.shape

(24086, 3)

In [20]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
x_new3 , y_new3 =ros.fit_resample(X_important_train, y_train)
rf3 = RandomForestClassifier()
rf3.fit(x_new3, y_new3)
y_tpred2=rf3.predict(x_new3)
y_pred2=rf3.predict(X_important_test)
print(confusion_matrix(y_new3, y_tpred2))
print(confusion_matrix(y_test, y_pred2))



[[23556     0]
 [    0 23556]]
[[5871   43]
 [ 102    6]]
