In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Load the training data
try:
    train_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Heart_Attack_training_dataset.csv")
    print("Training data loaded successfully.")
    print("Training data head:")
    display(train_df.head())
    #print("\nTraining data info:")
    #display(train_df.info())
except FileNotFoundError:
    print("Error: train.csv not found. Please make sure the file is in the correct directory.")

# Load the testing data
try:
    test_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Hear_Attack_evaluation_dataset.csv")
    print("\nTesting data loaded successfully.")
    print("Testing data head:")
    display(test_df.head())
    #print("\nTesting data info:")
    #display(test_df.info())
except FileNotFoundError:
    print("Error: test.csv not found. Please make sure the file is in the correct directory.")

Training data loaded successfully.
Training data head:


Unnamed: 0,patient_id,age,sex,chol,bp,hr,diabetes,family_history,smoking,obesity,...,sedentary_hr,income,bmi,triglycerides,phys_act_days,sleep_hr,country,continent,hemisphere,heart_attack_risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0



Testing data loaded successfully.
Testing data head:


Unnamed: 0,patient_id,age,sex,chol,bp,hr,diabetes,family_history,smoking,obesity,...,stress_lvl,sedentary_hr,income,bmi,triglycerides,phys_act_days,sleep_hr,country,continent,hemisphere
0,VRK5064,36,Male,164,118/103,46,1,1,1,0,...,3,5.808121,291026,32.636491,320,3,4,South Korea,Asia,Northern Hemisphere
1,NEN2365,66,Male,355,158/89,49,0,0,1,1,...,9,3.108304,96002,26.396081,709,0,7,United Kingdom,Europe,Northern Hemisphere
2,KXT2493,59,Female,370,172/81,104,0,0,1,0,...,6,8.220722,277303,18.920591,53,4,5,United States,North America,Northern Hemisphere
3,TKO0406,88,Male,296,178/67,53,1,0,1,1,...,6,5.974984,122979,25.688879,86,1,10,New Zealand,Australia,Southern Hemisphere
4,GDP2405,74,Male,294,130/67,105,1,1,1,0,...,1,10.180945,118594,39.937584,138,7,9,Japan,Asia,Northern Hemisphere


## Evaluation and Prediction

## Model Selection and Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the Random Forest Classifier
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train_processed_df, y_train)

print("Model training complete.")

Model training complete.


In [95]:
# Make predictions on the test data
if X_test_processed_df is not None:
    test_predictions = model.predict(X_test_processed_df)
    #print("\nTest Predictions:")
    #display(test_predictions)
else:
    print("\nTest data was not processed. Cannot make predictions.")

In [None]:
# Make predictions on the training data
y_train_pred = model.predict(X_train_processed_df)

# Evaluate the model on the training data
accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred)
recall = recall_score(y_train, y_train_pred)
f1 = f1_score(y_train, y_train_pred)

print("Training Set Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Training Set Evaluation:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000


## Preprocessing

In [None]:
# Check for missing values
print("Missing values in training data:")
display(train_df.isnull().sum())

print("\nMissing values in testing data:")
if 'test_df' in globals():
    display(test_df.isnull().sum())
else:
    print("test_df is not defined. Please load the test data.")

Missing values in training data:


Unnamed: 0,0
patient_id,0
age,0
sex,0
chol,0
bp,0
hr,0
diabetes,0
family_history,0
smoking,0
obesity,0



Missing values in testing data:


Unnamed: 0,0
patient_id,0
age,0
sex,0
chol,0
bp,0
hr,0
diabetes,0
family_history,0
smoking,0
obesity,0


There are no missing values in either dataset, so we can proceed with encoding and scaling.

In [None]:
# Identify categorical and numerical columns
categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Exclude 'patient_id' and 'heart_attack_risk' from the lists
categorical_cols.remove('patient_id')
if 'heart_attack_risk' in numerical_cols:
    numerical_cols.remove('heart_attack_risk')

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

Categorical columns: ['sex', 'bp', 'diet', 'country', 'continent', 'hemisphere']
Numerical columns: ['age', 'chol', 'hr', 'diabetes', 'family_history', 'smoking', 'obesity', 'alcohol', 'exercise_hr_wk', 'prev_heart_prob', 'med_use', 'stress_lvl', 'sedentary_hr', 'income', 'bmi', 'triglycerides', 'phys_act_days', 'sleep_hr']


Let's inspect the unique values in the categorical columns to understand how to encode them.

In [None]:
for col in categorical_cols:
    print(f"\nUnique values in {col}:")
    display(train_df[col].unique())


Unique values in sex:


array(['Male', 'Female'], dtype=object)


Unique values in bp:


array(['158/88', '165/93', '174/99', ..., '137/94', '94/76', '119/67'],
      dtype=object)


Unique values in diet:


array(['Average', 'Unhealthy', 'Healthy'], dtype=object)


Unique values in country:


array(['Argentina', 'Canada', 'France', 'Thailand', 'Japan', 'Brazil',
       'South Africa', 'Vietnam', 'China', 'Italy', 'United States',
       'Spain', 'India', 'Nigeria', 'New Zealand', 'South Korea',
       'Germany', 'Australia', 'Colombia', 'United Kingdom'], dtype=object)


Unique values in continent:


array(['South America', 'North America', 'Europe', 'Asia', 'Africa',
       'Australia'], dtype=object)


Unique values in hemisphere:


array(['Southern Hemisphere', 'Northern Hemisphere'], dtype=object)

- 'sex' and 'diet' appear to be binary and can be label encoded.
- 'bp' is not in a standard numerical format and needs to be split and converted.
- 'country', 'continent', and 'hemisphere' are nominal and can be one-hot encoded.

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd # Import pandas

# Handle 'bp' column: Split into systolic and diastolic pressure
def split_bp(df):
    if 'bp' in df.columns:
        bp_split = df['bp'].str.split('/', expand=True)
        df['systolic_bp'] = pd.to_numeric(bp_split[0])
        df['diastolic_bp'] = pd.to_numeric(bp_split[1])
        df = df.drop('bp', axis=1)
    return df

# Ensure train_df and test_df are loaded before proceeding
if 'train_df' not in globals():
    print("Error: train_df not found. Please load the training data.")
elif 'test_df' not in globals():
    print("Error: test_df not found. Please load the testing data.")
else:
    train_df = split_bp(train_df)
    test_df = split_bp(test_df)

    # Update numerical and categorical columns after splitting 'bp'
    categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if 'patient_id' in categorical_cols:
        categorical_cols.remove('patient_id')
    if 'heart_attack_risk' in numerical_cols:
        numerical_cols.remove('heart_attack_risk')
    if 'systolic_bp' not in numerical_cols:
        numerical_cols.append('systolic_bp')
    if 'diastolic_bp' not in numerical_cols:
        numerical_cols.append('diastolic_bp')


    # Remove duplicates from numerical_cols
    numerical_cols = list(dict.fromkeys(numerical_cols))


    print("Updated categorical columns:", categorical_cols)
    print("Updated numerical columns:", numerical_cols)

    # Separate features and target
    X_train = train_df.drop('heart_attack_risk', axis=1)
    y_train = train_df['heart_attack_risk']
    X_test = test_df.copy() # Test set has no target

    # Apply label encoding for 'sex' and 'diet'
    label_encoder_sex = LabelEncoder()
    label_encoder_diet = LabelEncoder()

    X_train['sex'] = label_encoder_sex.fit_transform(X_train['sex'])
    X_train['diet'] = label_encoder_diet.fit_transform(X_train['diet'])

    X_test['sex'] = label_encoder_sex.transform(X_test['sex'])
    X_test['diet'] = label_encoder_diet.transform(X_test['diet'])


    # Define preprocessing steps for scaling and one-hot encoding
    # Exclude 'sex' and 'diet' from one-hot encoding as they are already label encoded
    # Exclude 'patient_id' as it will be dropped
    features_to_onehot = [col for col in categorical_cols if col not in ['sex', 'diet']]
    features_to_scale = [col for col in numerical_cols if col not in ['sex', 'diet']]

    preprocessor = ColumnTransformer(
        transformers=[
            ('drop_id', 'drop', ['patient_id']),
            ('onehot', OneHotEncoder(handle_unknown='ignore'), features_to_onehot),
            ('scaler', StandardScaler(), features_to_scale)
        ],
        remainder='passthrough' # Keep other columns (like sex and diet)
    )

    # Apply the rest of the preprocessing steps (one-hot and scaling)
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Get feature names after one-hot encoding and scaling
    onehot_cols = preprocessor.named_transformers_['onehot'].get_feature_names_out(features_to_onehot)
    scaled_cols = features_to_scale
    # The remaining columns are 'sex' and 'diet' which were passed through
    remaining_cols = ['sex', 'diet']
    processed_cols = np.concatenate([onehot_cols, scaled_cols, remaining_cols])


    X_train_processed_df = pd.DataFrame(X_train_processed, columns=processed_cols)
    X_test_processed_df = pd.DataFrame(X_test_processed, columns=processed_cols)


    print("\nProcessed training data head:")
    display(X_train_processed_df.head())

    print("\nProcessed testing data head:")
    display(X_test_processed_df.head())

Updated categorical columns: ['sex', 'diet', 'country', 'continent', 'hemisphere']
Updated numerical columns: ['age', 'chol', 'hr', 'diabetes', 'family_history', 'smoking', 'obesity', 'alcohol', 'exercise_hr_wk', 'prev_heart_prob', 'med_use', 'stress_lvl', 'sedentary_hr', 'income', 'bmi', 'triglycerides', 'phys_act_days', 'sleep_hr', 'systolic_bp', 'diastolic_bp']

Processed training data head:


Unnamed: 0,country_Argentina,country_Australia,country_Brazil,country_Canada,country_China,country_Colombia,country_France,country_Germany,country_India,country_Italy,...,sedentary_hr,income,bmi,triglycerides,phys_act_days,sleep_hr,systolic_bp,diastolic_bp,sex,diet
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.184261,1.277669,0.373557,-0.591522,-1.530395,-0.514482,0.868028,0.190418,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.292294,1.579539,-0.268484,-0.819424,-1.091494,-0.011249,1.133997,0.531022,1.0,2.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.006177,0.954017,-0.113113,0.753549,0.22521,-1.520949,1.475957,0.939746,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.482617,-0.404449,1.198766,-0.180404,-0.213691,-1.520949,1.058006,1.007867,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.287403,0.028149,-1.120975,-0.837299,-1.091494,-1.017715,-1.677672,0.190418,1.0,2.0



Processed testing data head:


Unnamed: 0,country_Argentina,country_Australia,country_Brazil,country_Canada,country_China,country_Colombia,country_France,country_Germany,country_India,country_Italy,...,sedentary_hr,income,bmi,triglycerides,phys_act_days,sleep_hr,systolic_bp,diastolic_bp,sex,diet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.048566,1.644686,0.592821,-0.439587,-0.213691,-1.520949,-0.651793,1.212229,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.827602,-0.771664,-0.394936,1.298728,-1.530395,-0.011249,0.868028,0.258539,1.0,2.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.647593,1.474658,-1.578187,-1.632723,0.22521,-1.017715,1.399965,-0.286427,0.0,2.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.000417,-0.437418,-0.506875,-1.485257,-1.091494,1.498451,1.627939,-1.240117,1.0,2.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.213218,-0.491749,1.748468,-1.252886,1.541914,0.995218,-0.195847,-1.240117,1.0,2.0


In [None]:
print(model)

RandomForestClassifier(random_state=42)


In [None]:
import joblib
joblib.dump(model, "heart_attack_model.pkl")
loaded_model = joblib.load("heart_attack_model.pkl")

from google.colab import files
files.download("heart_attack_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>