Group 121:
* Juan Manuel Rodriguez
* Vladyslav Horbatenko
* Aryan Mirzazadeh

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [21]:
# Read the SAheart dataset straight from its public URL. The file is a
# comma-separated table with a header row; its first column is used as the index.
df_heart = pd.read_csv(
    "https://hastie.su.domains/ElemStatLearn/datasets/SAheart.data",
    index_col=0,
)

# Column order used for the rest of the notebook: predictors first, then the
# categorical famhist column, and the chd target last.
columns_ordered = [
    "sbp", "ldl", "adiposity", "obesity", "typea", "age",
    "tobacco", "alcohol", "famhist", "chd",
]

# Fail early, listing every absent column, rather than on the first one
# hit during selection.
missing = [name for name in columns_ordered if name not in df_heart.columns]
if len(missing) > 0:
    raise KeyError(f"Missing columns in df_heart: {missing}")

# Keep only the expected columns, in the order declared above, and preview.
df_heart = df_heart[columns_ordered]
df_heart.head()

Unnamed: 0_level_0,sbp,ldl,adiposity,obesity,typea,age,tobacco,alcohol,famhist,chd
row.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,160,5.73,23.11,25.3,49,52,12.0,97.2,Present,1
2,144,4.41,28.61,28.87,55,63,0.01,2.06,Absent,1
3,118,3.48,32.28,29.14,52,46,0.08,3.81,Present,0
4,170,6.41,38.03,31.99,51,58,7.5,24.26,Present,1
5,134,3.5,27.78,25.99,60,49,13.6,57.34,Present,1


In [22]:
# Build the feature matrix X and target Y from df_heart:
#   1. encode famhist as a 0/1 indicator,
#   2. apply log1p to tobacco and alcohol,
#   3. z-score every column except famhist (which stays 0/1).
columns_to_log_transform = ["tobacco", "alcohol"]

# df_heart is updated in place, so the encoding/log steps must run only once:
# a second run of this cell would map the already-numeric famhist values to NaN
# and apply log1p a second time. A numeric famhist column means the cell
# has already been executed on this frame.
if not pd.api.types.is_numeric_dtype(df_heart["famhist"]):
    df_heart["famhist"] = df_heart["famhist"].map({"Present": 1, "Absent": 0})
    for column in columns_to_log_transform:
        # log1p(x) = log(1 + x), so zero values stay finite.
        df_heart[column] = np.log1p(df_heart[column])

# Take the target from the un-standardized frame so it keeps its original values.
Y = df_heart["chd"]

# z-score all columns with the per-column sample mean/std, then restore the
# un-scaled 0/1 famhist indicator.
# NOTE(review): mean/std are computed over all rows, i.e. before the train/test
# split done later in the notebook - confirm this is acceptable for the evaluation.
df_heart_standarized = (df_heart - df_heart.mean()) / df_heart.std()
df_heart_standarized["famhist"] = df_heart["famhist"]

# Features are every standardized column except the target.
X = df_heart_standarized.drop(columns=["chd"])

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# re-running the cell yields the same partition.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [26]:
from sklearn.naive_bayes import GaussianNB


# Fit Gaussian naive Bayes on the training split and report accuracy on the
# held-out split. score() is called with X_test/Y_test, so the printed label
# says "test" rather than "training".
clf = GaussianNB()
clf.fit(X_train, Y_train)
print("GaussianNB test accuracy:", clf.score(X_test, Y_test))

GaussianNB training accuracy: 0.6989247311827957


In [27]:
from sklearn.naive_bayes import BernoulliNB

# Fit Bernoulli naive Bayes on the training split and report accuracy on the
# held-out split. score() is called with X_test/Y_test, so the printed label
# says "test" rather than "training".
# NOTE(review): the inputs here are continuous standardized features; BernoulliNB
# thresholds them via its `binarize` parameter (left at its default) - confirm
# that this thresholding is the intended treatment.
clf = BernoulliNB()
clf.fit(X_train, Y_train)
print("BernoulliNB test accuracy:", clf.score(X_test, Y_test))

BernoulliNB training accuracy: 0.6451612903225806


In [31]:
# Rescale features to [0, 1] for MultinomialNB (requires non-negative values).
# The scaler is fitted on the training split only, so held-out values outside
# the training min/max would otherwise scale below 0 or above 1; clip=True
# keeps the transformed test features inside [0, 1].
scaler = MinMaxScaler(clip=True)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
from sklearn import naive_bayes as nb

# Fit multinomial naive Bayes on the min-max scaled training features and
# report accuracy on the scaled held-out split. score() is called with
# X_test_scaled/Y_test, so the printed label says "test" rather than "training".
clf = nb.MultinomialNB()
clf.fit(X_train_scaled, Y_train)
print("MultinomialNB test accuracy:", clf.score(X_test_scaled, Y_test))

MultinomialNB training accuracy: 0.6344086021505376
