## Importing essential libraries for machine learning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# model tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

## Loading CSV file

In [2]:
data = pd.read_csv("cardio_train.csv", sep=";")

In [3]:
data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [4]:
# Assuming 'data' is your DataFrame
# 1. Check for missing values
missing_values = data.isna().sum()
print("Missing values in each column:")
print(missing_values)

# 2. Display rows with missing values
missing_rows = data[data.isna().any(axis=1)]
print("Rows with missing values:")
print(missing_rows)

# 3. Total number of missing values in the DataFrame
total_missing_values = data.isna().sum().sum()
print(f"Total number of missing values in the DataFrame: {total_missing_values}")


Missing values in each column:
id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64
Rows with missing values:
Empty DataFrame
Columns: [id, age, gender, height, weight, ap_hi, ap_lo, cholesterol, gluc, smoke, alco, active, cardio]
Index: []
Total number of missing values in the DataFrame: 0


## Dropping the id column

In [5]:
data.drop("id",axis=1,inplace=True)
data.drop_duplicates(inplace=True)
data

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1


## Making new feature called BMI using feature engineering

In [6]:
data["bmi"] = data["weight"] / (data["height"]/100)**2
out_filter = ((data["ap_hi"]>250) | (data["ap_lo"]>200))
data = data[~out_filter]

#changing the age from days to year format
data.loc[:,"age"] = (data["age"]/365).astype(int)

#filtering out ap_hi and ap_lo
out_filter2 = ((data["ap_hi"] < 0) | (data["ap_lo"] < 0))
data = data[~out_filter2]

## Seperating target for the Training and Testing dataset

In [7]:
target_name = 'cardio'
data_target = data[target_name]
data = data.drop([target_name], axis=1)

# seperating test and training set
train, test, target, target_test = train_test_split(data, data_target, test_size=0.2, random_state=0)

# split training set to validation set
Xtrain, Xval, Ztrain, Zval = train_test_split(train, target, test_size=0.2, random_state=0)

# Model Training Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
import joblib

# Logistic Regression
logreg = LogisticRegression(max_iter=2000)

# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)  # 5 splits, shuffled for randomness
cv_scores = cross_val_score(logreg, train, target, cv=kfold)  # Automatically splits and trains

# Print the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())

# Train the model on the full training set
logreg.fit(train, target)

# Save the model
joblib.dump(logreg, "logreg_model.pkl")

# Predictions
print("Prediction for the first instance:", logreg.predict(train.iloc[[0]]))
model = joblib.load("logreg_model.pkl")
print("Prediction for the second instance:", model.predict(train.iloc[[1]]))

# Training accuracy
training_accuracy = logreg.score(train, target)
print("Training Accuracy:", training_accuracy)


Cross-Validation Scores: [0.72508155 0.72725625 0.72607829 0.72363175 0.72517216]
Mean Cross-Validation Accuracy: 0.7254440014498007
Prediction for the first instance: [1]
Prediction for the second instance: [0]
Training Accuracy: 0.7255708590068866


## Model Training Support Vector Classifier

In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score
import joblib

# Support Vector Machines
svc = SVC()

# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(svc, train, target, cv=kfold)

# Train the model on the full training set
svc.fit(train, target)

# Training accuracy
acc_svc = round(svc.score(train, target) * 100, 2)

# Test accuracy
acc_test_svc = round(svc.score(test, target_test) * 100, 2)

# Save the model
joblib.dump(svc, "svc_model.pkl")

# Predictions
print("Prediction for the first instance:", svc.predict(train.iloc[[0]]))
model = joblib.load("svc_model.pkl")
print("Prediction for the third instance:", model.predict(train.iloc[[2]]))

# Output results
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", round(cv_scores.mean() * 100, 2))
print("Training Accuracy:", acc_svc)
print("Test Accuracy:", acc_test_svc)

Prediction for the first instance: [1]
Prediction for the third instance: [1]
Cross-Validation Scores: [0.72200072 0.72390359 0.72390359 0.71719826 0.7196448 ]
Mean Cross-Validation Accuracy: 72.13
Training Accuracy: 72.23
Test Accuracy: 72.39


## Model Training  K-Nearest Neighbors Classifier

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
import joblib

# k-Nearest Neighbors Algorithm
param_grid = {'n_neighbors': [2, 3]}  # Parameters to tune
knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid, cv=10)  # GridSearch with 10-fold CV

# Fit the model
knn.fit(train, target)

# Best parameters from GridSearch
print("Best Parameters:", knn.best_params_)

# Training Accuracy
acc_knn = round(knn.score(train, target) * 100, 2)
print("Training Accuracy:", acc_knn)

# Test Accuracy
acc_test_knn = round(knn.score(test, target_test) * 100, 2)
print("Test Accuracy:", acc_test_knn)

# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(knn.best_estimator_, train, target, cv=kfold)  # Use best estimator from GridSearch
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", round(cv_scores.mean() * 100, 2))

# Save the model
joblib.dump(knn, "knn_model.pkl")

# Predictions
model = joblib.load("knn_model.pkl")
print("Prediction for the third instance:", model.predict(train.iloc[[2]]))


Best Parameters: {'n_neighbors': 3}
Training Accuracy: 81.32
Test Accuracy: 67.25
Cross-Validation Scores: [0.67162015 0.66853933 0.67225444 0.6734324  0.6705328 ]
Mean Cross-Validation Accuracy: 67.13
Prediction for the third instance: [1]


## Model Training Decision Tree Classifier

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score
import joblib

# Decision Tree Classifier
decision_tree = DecisionTreeClassifier()

# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(decision_tree, train, target, cv=kfold)

# Fit the model on the full training set
decision_tree.fit(train, target)

# Training Accuracy
acc_decision_tree = round(decision_tree.score(train, target) * 100, 2)
print("Training Accuracy:", acc_decision_tree)

# Test Accuracy
acc_test_decision_tree = round(decision_tree.score(test, target_test) * 100, 2)
print("Test Accuracy:", acc_test_decision_tree)

# Cross-Validation Results
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", round(cv_scores.mean() * 100, 2))

# Save the model
joblib.dump(decision_tree, "decision_tree_model.pkl")

# Predictions
model = joblib.load("decision_tree_model.pkl")
print("Prediction for the 7th test instance (loaded model):", model.predict(test.iloc[[7]]))
print("Prediction for the 7th test instance (original model):", decision_tree.predict(test.iloc[[7]]))


Training Accuracy: 97.99
Test Accuracy: 64.14
Cross-Validation Scores: [0.63764045 0.62586082 0.63410656 0.64072128 0.64108373]
Mean Cross-Validation Accuracy: 63.59
Prediction for the 7th test instance (loaded model): [1]
Prediction for the 7th test instance (original model): [1]


## Model Training Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
import joblib

# Random Forest Classifier with GridSearchCV
param_grid = {'n_estimators': [100, 300]}  # Parameters to tune
random_forest = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)

# Fit the model
random_forest.fit(train, target)

# Best parameters from GridSearchCV
print("Best Parameters:", random_forest.best_params_)

# Training Accuracy
acc_random_forest = round(random_forest.score(train, target) * 100, 2)
print("Training Accuracy:", acc_random_forest)

# Test Accuracy
acc_test_random_forest = round(random_forest.score(test, target_test) * 100, 2)
print("Test Accuracy:", acc_test_random_forest)

# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(random_forest.best_estimator_, train, target, cv=kfold)  # Use best estimator
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", round(cv_scores.mean() * 100, 2))

# Save the model
joblib.dump(random_forest, "random_forest_model.pkl")

# Predictions
model = joblib.load("random_forest_model.pkl")
print("Prediction for the 7th test instance (loaded model):", model.predict(test.iloc[[7]]))
print("Prediction for the 7th test instance (original model):", random_forest.predict(test.iloc[[7]]))


Best Parameters: {'n_estimators': 300}
Training Accuracy: 97.99
Test Accuracy: 71.6
Cross-Validation Scores: [0.71049293 0.70750272 0.70677782 0.70777456 0.71040232]
Mean Cross-Validation Accuracy: 70.86
Prediction for the 7th test instance (loaded model): [1]
Prediction for the 7th test instance (original model): [1]


## Model Training XG Boost Classifier

In [13]:
import warnings
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
import joblib

# Suppress UserWarnings
warnings.filterwarnings('ignore', category=UserWarning)

# XGBoost Classifier with GridSearchCV
param_grid = {'n_estimators': [100, 300], 'learning_rate': [0.01, 0.1, 0.2]}  # Parameters to tune
xgboost = GridSearchCV(estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 
                       param_grid=param_grid, 
                       cv=5)

# Fit the model
xgboost.fit(train, target)

# Best parameters from GridSearchCV
print("Best Parameters:", xgboost.best_params_)

# Training Accuracy
acc_xgboost = round(xgboost.score(train, target) * 100, 2)
print("Training Accuracy:", acc_xgboost)

# Test Accuracy
acc_test_xgboost = round(xgboost.score(test, target_test) * 100, 2)
print("Test Accuracy:", acc_test_xgboost)

# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgboost.best_estimator_, train, target, cv=kfold)  # Use best estimator
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", round(cv_scores.mean() * 100, 2))

# Save the model
joblib.dump(xgboost, "xgboost_model.pkl")

# Predictions
model = joblib.load("xgboost_model.pkl")
print("Prediction for the 7th test instance (loaded model):", model.predict(test.iloc[[7]]))
print("Prediction for the 7th test instance (original model):", xgboost.predict(test.iloc[[7]]))


Best Parameters: {'learning_rate': 0.01, 'n_estimators': 300}
Training Accuracy: 74.14
Test Accuracy: 73.66
Cross-Validation Scores: [0.73649873 0.73223994 0.73069953 0.73106198 0.73459587]
Mean Cross-Validation Accuracy: 73.3
Prediction for the 7th test instance (loaded model): [1]
Prediction for the 7th test instance (original model): [1]


## Suggestion Model

In [25]:
# 1. Data Loading and Initial Preprocessing
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load dataset and convert age from days to years
data = pd.read_csv('cardio_train.csv', delimiter=';')
data['age_years'] = (data['age'] / 365).round()

def preprocess_data(data):
    # Helper function to remove outliers using Interquartile Range (IQR) method
    def remove_outliers(df, column):
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        df = df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
        return df
    
    # Clean numerical columns by removing statistical outliers
    numerical_cols = ['weight', 'ap_hi', 'ap_lo']
    for col in numerical_cols:
        data = remove_outliers(data, col)
    
    # Remove physiologically impossible blood pressure readings
    data = data[(data['ap_hi'] > 0) & (data['ap_hi'] < 300) &
                (data['ap_lo'] > 0) & (data['ap_lo'] < 200) &
                (data['ap_hi'] > data['ap_lo'])]
    
    return data

def calculate_optimal_ranges(cluster_data):
    optimal_ranges = {}
    
    # Calculate statistical ranges for numerical attributes (weight, blood pressure)
    numerical_attrs = ['weight', 'ap_hi', 'ap_lo']
    for attr in numerical_attrs:
        mean_val = cluster_data[attr].mean()
        std_val = cluster_data[attr].std()
        
        # Create 95% confidence intervals using 1.96 * standard deviation
        min_val = max(cluster_data[attr].min(), mean_val - 1.96 * std_val)
        max_val = min(cluster_data[attr].max(), mean_val + 1.96 * std_val)
        
        # Store statistical measures for each numerical attribute
        optimal_ranges[attr] = {
            'recommended_range': f"{min_val:.1f} - {max_val:.1f}",
            'mean': mean_val,
            'current_value': cluster_data[attr].median(),
            'percentile_25': cluster_data[attr].quantile(0.25),
            'percentile_75': cluster_data[attr].quantile(0.75)
        }
    
    # Calculate distributions for categorical attributes
    categorical_attrs = ['cholesterol', 'gluc', 'smoke', 'alco', 'active']
    for attr in categorical_attrs:
        # Find most common value and calculate value distributions
        mode_val = cluster_data[attr].mode().iloc[0]
        value_counts = cluster_data[attr].value_counts(normalize=True)
        
        optimal_ranges[attr] = {
            'recommended_value': mode_val,
            'distribution': value_counts.to_dict(),
            'confidence': f"{value_counts.max()*100:.1f}%"
        }
    
    return optimal_ranges

def suggest_adjustments(test_patient, cluster_recommendations, healthy_patients):
    # Create age group for test patient (in 5-year intervals)
    age_bin = pd.cut([test_patient['age_years']], bins=range(30, 70, 5))[0]
    # Create unique cluster identifier based on age group, gender, and height
    cluster_key = (age_bin, test_patient['gender'], test_patient['height'])
    
    if cluster_key in cluster_recommendations:
        recommendations = cluster_recommendations[cluster_key]
        
        # Count how many similar patients exist in the dataset
        cluster_size = len(healthy_patients[
            (healthy_patients['age_bin'] == age_bin) & 
            (healthy_patients['gender'] == test_patient['gender']) & 
            (healthy_patients['height'] == test_patient['height'])
        ])
        
        # Prepare analysis results
        analysis = {
            'cluster_size': cluster_size,
            'confidence_score': calculate_confidence_score(cluster_key, healthy_patients),
            'recommendations': recommendations
        }
        
        return format_recommendations(analysis)
    else:
        # If no exact match found, find the nearest similar cluster
        return find_nearest_cluster(test_patient, cluster_recommendations, healthy_patients)

def calculate_confidence_score(cluster_key, data):
    # Extract individual components from cluster key
    age_bin, gender, height = cluster_key
    
    # Count patients matching all criteria
    cluster_size = len(data[
        (data['age_bin'] == age_bin) & 
        (data['gender'] == gender) & 
        (data['height'] == height)
    ])
    
    # Calculate confidence score (1.0 = 100 or more similar patients)
    return min(cluster_size / 100, 1.0)

def format_recommendations(analysis):
    # Prepare output structure
    formatted_output = {
        'summary': {
            'cluster_size': analysis['cluster_size'],
            'confidence_score': f"{analysis['confidence_score']*100:.1f}%"
        },
        'detailed_recommendations': {}
    }
    
    # Format each recommendation type differently based on whether it's numerical or categorical
    for attr, details in analysis['recommendations'].items():
        if isinstance(details, dict):
            if 'recommended_range' in details:
                # Format numerical attributes (weight, blood pressure)
                formatted_output['detailed_recommendations'][attr] = {
                    'type': 'numerical',
                    'recommendation': details['recommended_range'],
                    'additional_info': {
                        'mean': f"{details['mean']:.1f}",
                        'normal_range': f"{details['percentile_25']:.1f} - {details['percentile_75']:.1f}"
                    }
                }
            else:
                # Format categorical attributes (cholesterol, glucose, etc.)
                formatted_output['detailed_recommendations'][attr] = {
                    'type': 'categorical',
                    'recommendation': f"Recommended level: {details['recommended_value']}",
                    'confidence': details['confidence'],
                    'distribution': details['distribution']
                }
    
    return formatted_output

# Main execution flow
# 1. Filter dataset for healthy patients only
healthy_patients = data[data['cardio'] == 0].copy()
# 2. Create age groups in 5-year intervals
healthy_patients['age_bin'] = pd.cut(healthy_patients['age_years'], bins=range(0, 120, 5))

# 3. Group patients by age, gender, and height
clusters = healthy_patients.groupby(['age_bin', 'gender', 'height'], observed=True)

# 4. Generate recommendations for each cluster
cluster_recommendations = {}
for cluster, cluster_data in clusters:
    cluster_recommendations[cluster] = calculate_optimal_ranges(cluster_data)

# 5. Example usage with a test patient
test_patient = {
    'age_years': 50,  # Age in years
    'gender': 1,      # Gender code
    'height': 170     # Height in cm
}

# 6. Get and display recommendations
recommendations = suggest_adjustments(test_patient, cluster_recommendations, healthy_patients)

# 7. Print formatted results
print("\nHealth Recommendations Analysis:")
print(f"Cluster Size: {recommendations['summary']['cluster_size']} similar patients")
print(f"Confidence Score: {recommendations['summary']['confidence_score']}\n")

print("Detailed Recommendations:")
for attr, details in recommendations['detailed_recommendations'].items():
    print(f"\n{attr.upper()}:")
    print(f"- {details['recommendation']}")
    if details['type'] == 'numerical':
        print(f"- Average value: {details['additional_info']['mean']}")
        print(f"- Normal range: {details['additional_info']['normal_range']}")
    else:
        print(f"- Confidence: {details['confidence']}")


Health Recommendations Analysis:
Cluster Size: 201 similar patients
Confidence Score: 100.0%

Detailed Recommendations:

WEIGHT:
- 50.0 - 96.3
- Average value: 73.1
- Normal range: 66.0 - 77.0

AP_HI:
- 94.8 - 139.6
- Average value: 117.2
- Normal range: 110.0 - 120.0

AP_LO:
- 63.1 - 90.5
- Average value: 76.8
- Normal range: 70.0 - 80.0

CHOLESTEROL:
- Recommended level: 1
- Confidence: 87.1%

GLUC:
- Recommended level: 1
- Confidence: 89.1%

SMOKE:
- Recommended level: 0
- Confidence: 98.5%

ALCO:
- Recommended level: 0
- Confidence: 98.0%

ACTIVE:
- Recommended level: 1
- Confidence: 80.1%
