## Data preperation

In [1]:
import pandas as pd

file_path = '/home/saroj/Desktop/DSSES/heart_disease.csv'
df = pd.read_csv(file_path)

print(df.head())  # Display the first few rows of the dataframe

# Encoding the categorical values
df['HeartDisease'] = df['HeartDisease'].apply(lambda x: 1 if x == 'Yes' else 0)

# Separate features and target
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# List of numerical and categorical features
numerical_features = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
categorical_features = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']


  HeartDisease    BMI Smoking AlcoholDrinking Stroke  PhysicalHealth  \
0           No  16.60     Yes              No     No             3.0   
1           No  20.34      No              No    Yes             0.0   
2           No  26.58     Yes              No     No            20.0   
3           No  24.21      No              No     No             0.0   
4           No  23.71      No              No     No            28.0   

   MentalHealth DiffWalking     Sex  AgeCategory   Race Diabetic  \
0          30.0          No  Female        55-59  White      Yes   
1           0.0          No  Female  80 or older  White       No   
2          30.0          No    Male        65-69  White      Yes   
3           0.0          No  Female        75-79  White       No   
4           0.0         Yes  Female        40-44  White       No   

  PhysicalActivity  GenHealth  SleepTime Asthma KidneyDisease SkinCancer  
0              Yes  Very good        5.0    Yes            No        Yes  
1       

### Data preprocesiing

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Create Transformers for Preprocessing
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine Transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


### Model Training

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline that combines preprocessing and the model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=10, warm_start=True))
])

# Train the model
print("Training the model...")
model_pipeline.fit(X_train, y_train)
print("Training complete.")

# Evaluate the model
y_pred = model_pipeline.predict(X_test)
y_prob = model_pipeline.predict_proba(X_test)[:, 1]

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Training the model...
Training complete.

Classification Report:

              precision    recall  f1-score   support

           0       0.92      0.97      0.95     58367
           1       0.32      0.13      0.18      5592

    accuracy                           0.90     63959
   macro avg       0.62      0.55      0.57     63959
weighted avg       0.87      0.90      0.88     63959



### Save the model

In [4]:
import pickle

# Save the trained model as a pickle file
with open('heart_disease_model.pkl', 'wb') as f:
    pickle.dump(model_pipeline, f)

print("Model saved as heart_disease_model.pkl")


Model saved as heart_disease_model.pkl


### Load the model 

In [5]:
import pickle

# Load the model from the pickle file
with open('heart_disease_model.pkl', 'rb') as f:
    model = pickle.load(f)


### Provide Input and Make Prediction

In [7]:
new_data = {
    'BMI': 16.6,
    'PhysicalHealth': 3,
    'MentalHealth': 30,
    'SleepTime': 5,
    'Smoking': 'Yes',
    'AlcoholDrinking': 'No',
    'Stroke': 'No',
    'DiffWalking': 'No',
    'Sex': 'Female',
    'AgeCategory': 'Middle-aged',
    'Race': 'White',
    'Diabetic': 'No',
    'PhysicalActivity': 'Yes',
    'GenHealth': 'Good',
    'Asthma': 'No',
    'KidneyDisease': 'No',
    'SkinCancer': 'Yes'
}

# Create a DataFrame from the new data
new_df = pd.DataFrame([new_data])

# Make a prediction
prediction = model.predict(new_df)[0]
probability = model.predict_proba(new_df)[0][1]  # Probability of having heart disease

# Display the prediction
if prediction == 1:
    print("The patient is predicted to have heart disease.")
else:
    print("The patient is predicted to not have heart disease.")

print(f"Probability of having heart disease: {probability:.2f}")


The patient is predicted to not have heart disease.
Probability of having heart disease: 0.20
