In [1]:
import pandas as pd
df = pd.read_csv('/content/heart_disease_uci.csv')
print(df.head())

   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4             normal    0  


In [2]:
print("DataFrame Head:")
print(df.head())

print("\nDataFrame Info:")
df.info()

print("\nDescriptive Statistics:")
print(df.describe())

DataFrame Head:
   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4             normal

In [3]:
print("Missing values before handling:")
print(df.isnull().sum())

Missing values before handling:
id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


In [4]:
df = df.drop('id', axis=1)
print("Dropped 'id' column. Current DataFrame columns:")
print(df.columns)

Dropped 'id' column. Current DataFrame columns:
Index(['age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
       'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')


In [5]:
for col in ['trestbps', 'chol', 'thalch', 'oldpeak']:
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col].fillna(median_val, inplace=True)
        print(f"Missing values in '{col}' filled with median: {median_val}")


Missing values in 'trestbps' filled with median: 130.0
Missing values in 'chol' filled with median: 223.0
Missing values in 'thalch' filled with median: 140.0
Missing values in 'oldpeak' filled with median: 0.5


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)


In [6]:
for col in ['trestbps', 'chol', 'thalch', 'oldpeak']:
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)
        print(f"Missing values in '{col}' filled with median: {median_val}")

In [7]:
for col in ['fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']:
    if df[col].isnull().any():
        # Use .mode()[0] to get the first mode in case of multiple modes
        mode_val = df[col].mode()[0]
        df[col] = df[col].fillna(mode_val)
        print(f"Missing values in '{col}' filled with mode: {mode_val}")

Missing values in 'fbs' filled with mode: False
Missing values in 'restecg' filled with mode: normal
Missing values in 'exang' filled with mode: False
Missing values in 'slope' filled with mode: flat
Missing values in 'ca' filled with mode: 0.0
Missing values in 'thal' filled with mode: normal


  df[col] = df[col].fillna(mode_val)


In [8]:
for col in ['fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']:
    if df[col].isnull().any():
        # Use .mode()[0] to get the first mode in case of multiple modes
        mode_val = df[col].mode()[0]
        # Fill missing values and then explicitly infer object types to handle potential downcasting warnings
        df[col] = df[col].fillna(mode_val).infer_objects(copy=False)
        print(f"Missing values in '{col}' filled with mode: {mode_val}")

In [9]:
print("Missing values after handling:")
print(df.isnull().sum())

Missing values after handling:
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64


In [10]:
print("Number of duplicate rows before removal:", df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("Number of duplicate rows after removal:", df.duplicated().sum())

Number of duplicate rows before removal: 2
Number of duplicate rows after removal: 0


In [11]:
print("Cleaned DataFrame Head:")
print(df.head())

print("\nCleaned DataFrame Info:")
df.info()

Cleaned DataFrame Head:
   age     sex    dataset               cp  trestbps   chol    fbs  \
0   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4             normal    0  

Cleaned

In [12]:
import pandas as pd
import os

# Define output directory
output_dir = './output'
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory '{output_dir}' ensured.")

# Load the dataset
df = pd.read_csv('/content/heart_disease_uci.csv')

# Display the first few rows to verify successful loading
print("\nDataFrame loaded successfully. First 5 rows:")
print(df.head())

Output directory './output' ensured.

DataFrame loaded successfully. First 5 rows:
   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  revers

In [13]:
df['target'] = (df['num'] > 0).astype(int)
df = df.drop(['id', 'dataset', 'num'], axis=1)
print("DataFrame after creating 'target' and dropping specified columns:")
print(df.head())

DataFrame after creating 'target' and dropping specified columns:
   age     sex               cp  trestbps   chol    fbs         restecg  \
0   63    Male   typical angina     145.0  233.0   True  lv hypertrophy   
1   67    Male     asymptomatic     160.0  286.0  False  lv hypertrophy   
2   67    Male     asymptomatic     120.0  229.0  False  lv hypertrophy   
3   37    Male      non-anginal     130.0  250.0  False          normal   
4   41  Female  atypical angina     130.0  204.0  False  lv hypertrophy   

   thalch  exang  oldpeak        slope   ca               thal  target  
0   150.0  False      2.3  downsloping  0.0       fixed defect       0  
1   108.0   True      1.5         flat  3.0             normal       1  
2   129.0   True      2.6         flat  2.0  reversable defect       1  
3   187.0  False      3.5  downsloping  0.0             normal       0  
4   172.0  False      1.4    upsloping  0.0             normal       0  


In [14]:
for col in ['trestbps', 'chol', 'thalch', 'oldpeak']:
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)
        print(f"Missing values in '{col}' filled with median: {median_val}")

Missing values in 'trestbps' filled with median: 130.0
Missing values in 'chol' filled with median: 223.0
Missing values in 'thalch' filled with median: 140.0
Missing values in 'oldpeak' filled with median: 0.5


In [15]:
for col in ['fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']:
    if df[col].isnull().any():
        # Use .mode()[0] to get the first mode in case of multiple modes
        mode_val = df[col].mode()[0]
        # Fill missing values and then explicitly infer object types to handle potential downcasting warnings
        df[col] = df[col].fillna(mode_val).infer_objects(copy=False)
        print(f"Missing values in '{col}' filled with mode: {mode_val}")

Missing values in 'fbs' filled with mode: False
Missing values in 'restecg' filled with mode: normal
Missing values in 'exang' filled with mode: False
Missing values in 'slope' filled with mode: flat
Missing values in 'ca' filled with mode: 0.0
Missing values in 'thal' filled with mode: normal


  df[col] = df[col].fillna(mode_val).infer_objects(copy=False)


In [16]:
print("Missing values after handling:")
print(df.isnull().sum())

Missing values after handling:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [17]:
cleaned_filepath = os.path.join(output_dir, 'heart_clean.csv')
df.to_csv(cleaned_filepath, index=False)
print(f"Cleaned DataFrame saved to '{cleaned_filepath}'")

Cleaned DataFrame saved to './output/heart_clean.csv'


In [18]:
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
print("DataFrame after one-hot encoding categorical features:")
print(df.head())

DataFrame after one-hot encoding categorical features:
   age  trestbps   chol  thalch  oldpeak   ca  target  sex_Male  \
0   63     145.0  233.0   150.0      2.3  0.0       0      True   
1   67     160.0  286.0   108.0      1.5  3.0       1      True   
2   67     120.0  229.0   129.0      2.6  2.0       1      True   
3   37     130.0  250.0   187.0      3.5  0.0       0      True   
4   41     130.0  204.0   172.0      1.4  0.0       0     False   

   cp_atypical angina  cp_non-anginal  cp_typical angina  fbs_True  \
0               False           False               True      True   
1               False           False              False     False   
2               False           False              False     False   
3               False            True              False     False   
4                True           False              False     False   

   restecg_normal  restecg_st-t abnormality  exang_True  slope_flat  \
0           False                     False       

In [19]:
from sklearn.preprocessing import StandardScaler

# Identify numerical columns to scale (excluding 'target')
numerical_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']

# Initialize StandardScaler
scaler = StandardScaler()

# Apply scaling to numerical columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

print("DataFrame after scaling numerical features:")
print(df.head())

DataFrame after scaling numerical features:
        age  trestbps      chol    thalch   oldpeak        ca  target  \
0  1.007386  0.705176  0.303643  0.489727  1.368109 -0.361400       0   
1  1.432034  1.518569  0.789967 -1.181478  0.611589  4.411152       1   
2  1.432034 -0.650479  0.266939 -0.345875  1.651804  2.820301       1   
3 -1.752828 -0.108217  0.459634  1.961979  2.502889 -0.361400       0   
4 -1.328180 -0.108217  0.037541  1.365120  0.517024 -0.361400       0   

   sex_Male  cp_atypical angina  cp_non-anginal  cp_typical angina  fbs_True  \
0      True               False           False               True      True   
1      True               False           False              False     False   
2      True               False           False              False     False   
3      True               False            True              False     False   
4     False                True           False              False     False   

   restecg_normal  restecg_st-t abno

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Separate features (X) and target (y)
X = df.drop('target', axis=1)
y = df['target']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Data split into training and testing sets.")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# Initialize and train the RandomForestClassifier model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

print("RandomForestClassifier model trained successfully.")

Data split into training and testing sets.
X_train shape: (736, 18), y_train shape: (736,)
X_test shape: (184, 18), y_test shape: (184,)
RandomForestClassifier model trained successfully.


In [21]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Make predictions on the test data
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("Model Evaluation:")

# Print Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Calculate and print ROC-AUC Score
roc_auc = roc_auc_score(y_test, y_proba)
print(f"\nROC-AUC Score: {roc_auc:.4f}")

# Calculate and print Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.79      0.83        82
           1       0.85      0.91      0.88       102

    accuracy                           0.86       184
   macro avg       0.86      0.85      0.86       184
weighted avg       0.86      0.86      0.86       184


ROC-AUC Score: 0.9198

Confusion Matrix:
[[65 17]
 [ 9 93]]


In [27]:
!streamlit run app.py

/bin/bash: line 1: streamlit: command not found


In [28]:
!streamlit run app.py

/bin/bash: line 1: streamlit: command not found


In [24]:
# To run the Streamlit app, execute the following in your terminal:
#  !streamlit run app.py
# This will provide a local URL and a network URL. Use the network URL to access the app in your browser.

In [26]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib # Often used for saving/loading models/scalers
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import os

# --- Streamlit App Setup ---
st.title("Heart Disease Prediction")
st.write("Enter patient information to predict the likelihood of heart disease.")

# --- Simulate loading of pre-trained model and scaler ---
# In a real application, these objects (model, scaler) and the `X_train_columns` list
# would be saved to files (e.g., using joblib) after training and then loaded here.
# For the purpose of this exercise, we will re-create them based on the previous notebook steps
# to ensure the app is self-contained and runnable.

# 1. Load original data
# Assume '/content/heart_disease_uci.csv' is available as per the environment.
original_df = pd.read_csv('/content/heart_disease_uci.csv')

# 2. Prepare target and drop columns ('id', 'dataset', 'num')
df_temp_for_processing = original_df.copy()
df_temp_for_processing['target'] = (df_temp_for_processing['num'] > 0).astype(int)
df_temp_for_processing = df_temp_for_processing.drop(['id', 'dataset', 'num'], axis=1)

# 3. Handle missing values (Numerical with median, Categorical with mode)
# Define columns based on prior analysis
numerical_cols_for_processing = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
categorical_cols_for_processing = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

for col in numerical_cols_for_processing:
    if df_temp_for_processing[col].isnull().any():
        median_val = df_temp_for_processing[col].median()
        df_temp_for_processing[col] = df_temp_for_processing[col].fillna(median_val)

for col in categorical_cols_for_processing:
    if df_temp_for_processing[col].isnull().any():
        mode_val = df_temp_for_processing[col].mode()[0]
        df_temp_for_processing[col] = df_temp_for_processing[col].fillna(mode_val).infer_objects(copy=False)

# 4. Fit StandardScaler on numerical features *before* one-hot encoding for the entire dataset
scaler = StandardScaler()
scaler.fit(df_temp_for_processing[numerical_cols_for_processing])

# 5. Apply one-hot encoding to categorical features for the full processed dataframe
df_processed_final = pd.get_dummies(df_temp_for_processing, columns=categorical_cols_for_processing, drop_first=True)

# 6. Separate features (X) and target (y) for model training
X_full_for_training = df_processed_final.drop('target', axis=1)
y_full_for_training = df_processed_final['target']

# Store the column names for alignment later
X_train_columns = X_full_for_training.columns.tolist()

# 7. Train the RandomForestClassifier model
# In a real app, this would be `model = joblib.load('random_forest_model.pkl')`
model = RandomForestClassifier(random_state=42)
model.fit(X_full_for_training, y_full_for_training)

# --- End of Simulation for Loaded Objects ---

# Define the numerical and categorical columns that were used in feature engineering
# These lists are used for processing user input consistently with training data
numerical_features_app = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
categorical_features_app = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# --- Streamlit Input Fields ---
with st.sidebar:
    st.header("Patient Data Input")

    age = st.slider("Age", 20, 80, 50)
    sex_input = st.radio("Sex", ['Male', 'Female'])
    cp_input = st.selectbox("Chest Pain Type (cp)", ['typical angina', 'asymptomatic', 'non-anginal', 'atypical angina'])
    trestbps = st.slider("Resting Blood Pressure (trestbps)", 90, 200, 120)
    chol = st.slider("Serum Cholestoral (chol)", 100, 600, 200)
    fbs_input = st.checkbox("Fasting Blood Sugar > 120 mg/dl (fbs)", False)
    restecg_input = st.selectbox("Resting Electrocardiographic Results (restecg)", ['normal', 'st-t abnormality', 'lv hypertrophy'])
    thalch = st.slider("Maximum Heart Rate Achieved (thalch)", 70, 210, 150)
    exang_input = st.checkbox("Exercise Induced Angina (exang)", False)
    oldpeak = st.slider("ST depression induced by exercise relative to rest (oldpeak)", 0.0, 6.0, 1.0, 0.1)
    slope_input = st.selectbox("The slope of the peak exercise ST segment (slope)", ['upsloping', 'flat', 'downsloping'])
    ca = st.slider("Number of major vessels (0-3) colored by flourosopy (ca)", 0, 3, 0)
    thal_input = st.selectbox("Thal", ['normal', 'fixed defect', 'reversable defect'])

    predict_button = st.button("Predict Heart Disease")

# --- Prediction Logic ---
if predict_button:
    # 1. Create a Pandas DataFrame from the user's input values.
    user_input_df = pd.DataFrame({
        'age': [age],
        'sex': [sex_input],
        'cp': [cp_input],
        'trestbps': [float(trestbps)], # Ensure type consistency
        'chol': [float(chol)],         # Ensure type consistency
        'fbs': [fbs_input],
        'restecg': [restecg_input],
        'thalch': [float(thalch)],     # Ensure type consistency
        'exang': [exang_input],
        'oldpeak': [float(oldpeak)],   # Ensure type consistency
        'slope': [slope_input],
        'ca': [float(ca)],             # Ensure type consistency
        'thal': [thal_input]
    })

    st.write("### User Input Data:")
    st.dataframe(user_input_df)

    # 2. Apply one-hot encoding to the categorical features in the user input DataFrame.
    user_input_encoded = pd.get_dummies(user_input_df, columns=categorical_features_app, drop_first=True)

    # 3. Align the columns of the one-hot encoded user input DataFrame with the columns of X_train_columns.
    # This ensures that the user input DataFrame has the same columns in the same order as the training data.
    user_input_aligned = user_input_encoded.reindex(columns=X_train_columns, fill_value=0)

    # 4. & 5. Scale the numerical features in the aligned user input DataFrame.
    user_input_aligned[numerical_features_app] = scaler.transform(user_input_aligned[numerical_features_app])

    st.write("### Preprocessed User Input for Prediction:")
    st.dataframe(user_input_aligned)

    # 6. Use the loaded `model` to make a prediction.
    prediction = model.predict(user_input_aligned)
    prediction_proba = model.predict_proba(user_input_aligned)[:, 1] # Probability of heart disease (class 1)

    # 7. Display the prediction result.
    st.write("---")
    st.subheader("Prediction Result:")
    if prediction[0] == 1:
        st.error(f"The model predicts **Heart Disease** with a probability of {prediction_proba[0]:.2f}")
    else:
        st.success(f"The model predicts **No Heart Disease** with a probability of {1 - prediction_proba[0]:.2f}")


Writing app.py
