## Task 1: Import Modules

In [None]:
# Install the dependencies listed in requirements.txt (the %pip magic targets
# the environment of the running kernel).
%pip install -r requirements.txt

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
## Task 2: Load the Dataset
# Read the CSV file into a DataFrame, decoding it as ISO-8859-1.
df = pd.read_csv(filepath_or_buffer='CVD.csv', encoding='ISO-8859-1')
# Show the first five rows as the cell output.
df.head(n=5)

In [None]:
## Task 3: Create the Pairplot
# Build the grid of pairwise axes for the DataFrame, then fill it:
# histograms on the diagonal and scatter plots on every off-diagonal axis.
g = sns.PairGrid(data=df)
g.map_diag(func=sns.histplot)
g.map_offdiag(func=sns.scatterplot)

In [None]:
## Task 4: Plot the Distribution of Categorical Features
# Columns stored with the 'object' dtype are treated as the categorical features.
categorical_features = df.columns[df.dtypes == 'object']

# Size the subplot grid from the number of features instead of hard-coding 3x4,
# so having more than 12 categorical columns no longer makes plt.subplot() raise.
# At least three rows are kept, so the figure is unchanged for up to 12 columns.
n_cols = 4
n_rows = max(3, -(-len(categorical_features) // n_cols))  # ceiling division
plt.figure(figsize=(20, 10 * n_rows / 3))

# One histogram per feature; enumerate supplies the 1-based subplot position.
for plot_num, col in enumerate(categorical_features, start=1):
    plt.subplot(n_rows, n_cols, plot_num)
    sns.histplot(df[col], kde=True)

plt.tight_layout()

In [None]:
## Task 5: Plot the Distribution of Numerical Features
# Every column whose dtype is not 'object' is treated as a numerical feature.
numerical_features = df.columns[df.dtypes != 'object']

# Size the subplot grid from the number of features instead of hard-coding 3x3,
# so having more than 9 numerical columns no longer makes plt.subplot() raise.
# At least three rows are kept, so the figure is unchanged for up to 9 columns.
n_cols = 3
n_rows = max(3, -(-len(numerical_features) // n_cols))  # ceiling division
plt.figure(figsize=(20, 10 * n_rows / 3))

# One histogram (with a KDE curve) per feature; enumerate supplies the 1-based
# subplot position.
for plot_num, col in enumerate(numerical_features, start=1):
    plt.subplot(n_rows, n_cols, plot_num)
    sns.histplot(df[col], kde=True)

plt.tight_layout()

In [None]:
## Task 6: Plot the Relation of Factors with Diseases

# Plot how different factors relate to heart disease:
#   1. Select the factors of interest from the dataset.
#   2. Loop over the factors and draw a countplot of each one, split by the disease.

selected_variables = ['General_Health', 'Exercise', 'Sex', 'Age_Category', 'Smoking_History', 'Depression']
disease = 'Heart_Disease'

# Size the subplot grid from the number of selected factors instead of
# hard-coding 3x3, so adding a tenth factor no longer makes plt.subplot() raise.
# At least three rows are kept, so the figure is unchanged for up to 9 factors.
n_cols = 3
n_rows = max(3, -(-len(selected_variables) // n_cols))  # ceiling division
plt.figure(figsize=(20, 10 * n_rows / 3))

# enumerate supplies the 1-based subplot position for each factor.
for plot_num, variable in enumerate(selected_variables, start=1):
    plt.subplot(n_rows, n_cols, plot_num)
    sns.countplot(data=df, x=variable, hue=disease)
plt.tight_layout()

## Task 7: Transform the Categorical Columns

After plotting the distributions, preprocess the data by transforming the categorical data into numerical data. To complete this task, perform the following steps:

Create a copy of the dataset so you don't lose the original data.
Get the column labels of the categorical features in the dataset.
Use the fit_transform() method from the LabelEncoder() class to transform the data.
Print the first five rows of the dataset.

In [None]:
# Encode on a copy so the original DataFrame `df` keeps its string labels.
data = df.copy()
categorical_features = df.columns[df.dtypes == 'object']

# Fit a separate LabelEncoder for each column and keep it. Refitting one shared
# encoder would discard the label -> integer mapping of every column except the
# last, making it impossible to decode the integers later with
# label_encoders[column].inverse_transform(...) or to inspect .classes_.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Show the first five encoded rows as the cell output.
data.head()

In [None]:
## Task 8: Split the Training and Testing Dataset
# The target is the 'Heart_Disease' column; the features are every other column.
y = data['Heart_Disease']
X = data.drop(columns='Heart_Disease')

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)

In [None]:
## Task 9: Build the Classifier
# Build the model with sklearn's RandomForestClassifier; the `n_estimators`
# argument sets the number of trees in the forest.
# random_state is fixed (the same seed the train/test split uses) so repeated
# runs grow the same forest and report the same metrics; n_jobs=-1 lets
# training and prediction use all available CPU cores without changing results.
classifier = RandomForestClassifier(n_estimators=500, random_state=5, n_jobs=-1)

In [None]:
## Task 10: Train the Classifier
# Fit the forest on the training features and their labels.
classifier.fit(X=X_train, y=y_train)

In [None]:
## Task 11: Get Predictions
# Class probabilities for the test rows (one column per class).
prediction_probability = classifier.predict_proba(X=X_test)
# Predicted class labels for the same rows.
predictions = classifier.predict(X=X_test)

In [None]:
## Task 12: Print Confusion Matrix and Accuracy
# The task asks for the accuracy and the confusion matrix, so print both
# before the per-class report.
print("Accuracy: ", accuracy_score(y_test, predictions))
print("Confusion Matrix: \n", confusion_matrix(y_test, predictions))
print('\n')
print("Report: \n", classification_report(y_test, predictions))
print('\n')
# ROC AUC is scored from column 1 of the predicted probabilities (presumably the
# positive 'Heart_Disease' class after label encoding; confirm against
# classifier.classes_).
print("Roc Score: ", roc_auc_score(y_test, prediction_probability[:,1]))