##### Load Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

##### Load dataframe after EDA done

In [2]:
cvd = pd.read_csv(r"C:\Users\HP\Documents\ML\CVD pred\new_df1.csv")

In [3]:
cvd

Unnamed: 0,Height_(cm),Weight_(kg),General_Health,Checkup,Exercise,Heart_Disease,Cancer,Depression,Diabetes,Sex,Age_Category,Smoking_History,Alcohol_Consumption
0,-0.485838,-0.171950,4,4,0,1,0,0,1,0,10,0,0
1,-0.692963,0.500429,4,4,1,0,0,0,1,0,8,0,1
2,1.067599,0.796300,3,4,1,1,0,0,1,1,11,0,0
3,0.549787,-0.602415,1,4,1,1,0,0,0,1,8,1,0
4,-0.485838,1.710593,2,4,1,0,0,0,0,0,9,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
282468,-0.175151,0.097239,4,4,1,0,0,0,0,1,1,0,1
282469,1.067599,-0.602415,1,3,1,0,0,0,1,1,9,0,1
282470,-1.314338,-1.113519,4,0,1,0,0,1,1,0,2,1,1
282471,1.378287,-0.037356,4,4,1,0,0,0,0,1,9,0,1


In [4]:
cvd.shape

(282473, 13)

In [5]:
cvd['Heart_Disease'].value_counts()

Heart_Disease
0    259799
1     22674
Name: count, dtype: int64

##### Separated the data into X and Y

In [6]:
x = cvd.drop('Heart_Disease', axis=1)
y = cvd['Heart_Disease']

##### Since Target Column Imbalance, we do oversampling

In [7]:
import imblearn
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(x, y)

In [8]:
x=x_ros
y=y_ros

In [9]:
y.value_counts()

Heart_Disease
1    259799
0    259799
Name: count, dtype: int64

In [10]:
print(x)

        Height_(cm)  Weight_(kg)  General_Health  Checkup  Exercise  Cancer  \
0         -0.485838    -0.171950               4        4         0       0   
1         -0.692963     0.500429               4        4         1       0   
2          1.067599     0.796300               3        4         1       0   
3          0.549787    -0.602415               1        4         1       0   
4         -0.485838     1.710593               2        4         1       0   
...             ...          ...             ...      ...       ...     ...   
519593    -1.521463     0.500429               2        4         1       0   
519594    -1.832150     0.365835               4        4         0       0   
519595     1.585411     0.581067               4        4         1       0   
519596     0.031974    -2.404321               3        4         0       0   
519597     0.860474     1.441997               1        4         1       1   

        Depression  Diabetes  Sex  Age_Category  Sm

In [11]:
print(y)

0         1
1         0
2         1
3         1
4         0
         ..
519593    1
519594    1
519595    1
519596    1
519597    1
Name: Heart_Disease, Length: 519598, dtype: int64


##### Split of the Data into Train and Test

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, stratify=y, random_state=42)

##### Training the Model on random forest ML - Algorithm¶

In [13]:
rf = RandomForestClassifier()

In [14]:
rf.fit(x_train, y_train)

#### Random Forest: Model Evaluation

##### accuracy score on the train data


In [15]:
y_pred = rf.predict(x_train)
Train_Accuracy = accuracy_score(y_pred, y_train)

In [16]:
print('Accuracy score of the training data : ', round(Train_Accuracy*100,2),"%")

Accuracy score of the training data :  98.43 %


##### accuracy score on the test data

In [17]:
y_pred_on_test_data = rf.predict(x_test)
Test_Accuracy = accuracy_score(y_pred_on_test_data, y_test)

In [18]:
print('Accuracy score of the test data : ', round(Test_Accuracy*100,2),"%")

Accuracy score of the test data :  95.15 %


In [19]:
print(x_test)

        Height_(cm)  Weight_(kg)  General_Health  Checkup  Exercise  Cancer  \
235357     1.378287     1.710593               1        4         1       1   
496520    -0.175151    -0.306545               4        4         1       1   
317831    -1.314338    -0.844330               2        4         1       0   
37801      0.549787     0.069964               4        4         1       1   
146608    -0.175151    -1.677985               0        3         1       0   
...             ...          ...             ...      ...       ...     ...   
282337    -0.175151    -1.059562               4        4         1       0   
272322    -0.485838    -0.441139               4        4         1       1   
187840     0.342662    -0.548459               4        4         1       0   
14248     -0.485838     0.635023               1        4         0       1   
45090     -2.039275    -0.171950               2        4         0       0   

        Depression  Diabetes  Sex  Age_Category  Sm

In [20]:
print('Actual Values of Y Test are: \n',np.array(y_test))

Actual Values of Y Test are: 
 [1 1 1 ... 0 0 0]


In [21]:
print('Predicted Values of Y Test are: \n', np.array(y_pred_on_test_data))

Predicted Values of Y Test are: 
 [1 1 1 ... 0 0 0]


##### Making a Predictive System

In [22]:
input_data = [1.378,1.710,1,4,1,1,1,1,1,11,1,0]

# changing the input_data list into numpy array
input_data_as_nparray = np.array(input_data)

# reshape the array into 1 row and all columns-type,  as we are predicting for one instance
input_data_reshaped = input_data_as_nparray.reshape(1,-1)

print(input_data_reshaped)

[[ 1.378  1.71   1.     4.     1.     1.     1.     1.     1.    11.
   1.     0.   ]]


In [23]:
prediction = rf.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('Risk of cardiovascular disease')
else:
  print('No risk of cardiovascular disease')

[1]
No risk of cardiovascular disease


##### Saving the trained model

In [24]:
import pickle

In [25]:
filename = 'cvd_model.sav'
pickle.dump(rf, open(filename, 'wb'))

##### Loading the trained model

In [26]:
loaded_model = pickle.load(open('cvd_model.sav', 'rb'))

In [27]:


input_data = (-2.039, -0.171, 2,4,0,0,1,0,0,12,1,1)

# changing the input_data to numpy array
input_data_as_nparray = np.array(input_data)

# reshape the array into 1 row and all columns-type,  as we are predicting for one instance
input_data_reshaped = input_data_as_nparray.reshape(1,-1)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('Risk of cardiovascular disease')
else:
  print('No risk of cardiovascular disease')

[0]
Risk of cardiovascular disease
