# Home Assignment - Data Science

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, confusion_matrix, classification_report

## Use Random Seed = 42

In [4]:
RANDOM_SEED = 42

## 1. Load the data

In [5]:
data = pd.read_csv('./US_Heart_Patients.csv')

## 2. Perform the exploratory data analysis

### First 10 rows of the data

In [6]:
print("First 10 rows of the data:")
print(data.head(10))

First 10 rows of the data:
   Gender   age  education  currentSmoker  cigsPerDay  BP Meds  \
0    Male  39.0        4.0            0.0         0.0      0.0   
1  Female  46.0        2.0            0.0         0.0      0.0   
2    Male  48.0        1.0            1.0        20.0      0.0   
3  Female  61.0        3.0            1.0        30.0      0.0   
4  Female  46.0        3.0            1.0        23.0      0.0   
5  Female  43.0        2.0            0.0         0.0      0.0   
6  Female  63.0        1.0            0.0         0.0      0.0   
7  Female  45.0        2.0            1.0        20.0      0.0   
8    Male  52.0        1.0            0.0         0.0      0.0   
9    Male  43.0        1.0            1.0        30.0      0.0   

   prevalentStroke  prevalentHyp  diabetes  tot cholesterol  Systolic BP  \
0              0.0           0.0       0.0            195.0        106.0   
1              0.0           0.0       0.0            250.0        121.0   
2              0.0

### 5-point summary

In [7]:
print("\n5-point summary:")
print(data.describe())


5-point summary:
               age    education  currentSmoker   cigsPerDay      BP Meds  \
count  4238.000000  4130.000000    4237.000000  4209.000000  4180.000000   
mean     49.579283     1.979903       0.494218     9.001901     0.029665   
std       8.572875     1.019943       0.500026    11.920742     0.169682   
min      32.000000     1.000000       0.000000     0.000000     0.000000   
25%      42.000000     1.000000       0.000000     0.000000     0.000000   
50%      49.000000     2.000000       0.000000     0.000000     0.000000   
75%      56.000000     3.000000       1.000000    20.000000     0.000000   
max      70.000000     4.000000       1.000000    70.000000     1.000000   

       prevalentStroke  prevalentHyp     diabetes  tot cholesterol  \
count      4231.000000   4238.000000  4238.000000      4180.000000   
mean          0.005909      0.310524     0.025720       236.677273   
std           0.076650      0.462763     0.158316        44.616098   
min           0.0

### Information about the column (data types)

In [8]:
print("\nInformation about the columns:")
print(data.info())


Information about the columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           4232 non-null   object 
 1   age              4238 non-null   float64
 2   education        4130 non-null   float64
 3   currentSmoker    4237 non-null   float64
 4   cigsPerDay       4209 non-null   float64
 5   BP Meds          4180 non-null   float64
 6   prevalentStroke  4231 non-null   float64
 7   prevalentHyp     4238 non-null   float64
 8   diabetes         4238 non-null   float64
 9   tot cholesterol  4180 non-null   float64
 10  Systolic BP      4236 non-null   float64
 11  Diastolic BP     4235 non-null   float64
 12  BMI              4216 non-null   float64
 13  heartRate        4236 non-null   float64
 14  glucose          3849 non-null   float64
 15  Heart-Att        4240 non-null   int64  
dtypes: float64(14), int64(1), ob

### Number of outliers (using IQR method)

In [9]:
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
outliers = ((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).sum().sum()
print(f"\nNumber of outliers: {outliers}")

TypeError: unsupported operand type(s) for -: 'str' and 'str'

### Any missing value

In [None]:
missing_values = data.isnull().sum().sum()
print(f"\nNumber of missing values: {missing_values}")

### Correlation between variables

In [None]:
print("\nCorrelation matrix:")
print(data.corr())

### Distribution of the data

In [None]:
print("\nDistribution of the data:")
print(data.describe(include='all'))

### Draw charts and graphs

In [None]:
plt.figure(figsize=(15, 10))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

plt.figure(figsize=(15, 10))
data.hist(bins=50, figsize=(20, 15))
plt.show()

## 3. Data Preprocessing

### Impute missing values (if any)

In [None]:
data.fillna(data.mean(), inplace=True)

### Outlier treatment (if any)

In [None]:
data = data[~((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).any(axis=1)]

### Encode categorical features if needed

In [None]:
data = pd.get_dummies(data, drop_first=True)

## 4. Split the dataset

In [None]:
X = data.drop('target', axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

## 5. Model preparation and evaluation

### Naive Bayes

In [None]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_train_pred_nb = nb_model.predict(X_train)
y_test_pred_nb = nb_model.predict(X_test)

### Decision Tree

In [None]:
dt_model = DecisionTreeClassifier(random_state=RANDOM_SEED)
dt_model.fit(X_train, y_train)
y_train_pred_dt = dt_model.predict(X_train)
y_test_pred_dt = dt_model.predict(X_test)

### Calculate F1 score

In [None]:
f1_train_nb = f1_score(y_train, y_train_pred_nb)
f1_test_nb = f1_score(y_test, y_test_pred_nb)
f1_train_dt = f1_score(y_train, y_train_pred_dt)
f1_test_dt = f1_score(y_test, y_test_pred_dt)

In [None]:
print(f"Naive Bayes - F1 Score (Train): {f1_train_nb}, F1 Score (Test): {f1_test_nb}")
print(f"Decision Tree - F1 Score (Train): {f1_train_dt}, F1 Score (Test): {f1_test_dt}")

### Pick and explain the best model

In [None]:
best_model = 'Naive Bayes' if f1_test_nb > f1_test_dt else 'Decision Tree'
print(f"\nBest model: {best_model}")

In [None]:
if best_model == 'Naive Bayes':
    y_pred_best = y_test_pred_nb
    model = nb_model
else:
    y_pred_best = y_test_pred_dt
    model = dt_model

### Confusion matrix and classification report

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_best)
class_report = classification_report(y_test, y_pred_best)

In [None]:
print("\nConfusion Matrix:")
print(conf_matrix)

In [None]:
print("\nClassification Report:")
print(class_report)

In [None]:
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title(f"{best_model} - Confusion Matrix")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()