## Machine Learning
# Project: Diabetes Detection Model
## Chetan Rahane (crr220000)


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import time

In [8]:
# Load the dataset
data = pd.read_csv('diabetese_50000.csv')

# Only the last expression of a cell is rendered, so print the sanity checks
# explicitly; otherwise the preview and the null counts are silently dropped.
print(data.head())
print(data.isnull().sum())

# Split by position: the first column is the target, every other column is a feature.
X = data.iloc[:, 1:]  # Features (all columns except the first one)
y = data.iloc[:, 0]   # Target (first column)

In [3]:
# Show a heading followed by the first ten entries, first for the features and then for the target.
for heading, preview_source in (
    ("First 10 rows of Features (X):", X),
    ("\nFirst 10 rows of Target (y):", y),
):
    print(heading)
    print(preview_source.head(10))

First 10 rows of Features (X):
   HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  HeartDiseaseorAttack  \
0     1.0       0.0        1.0  30.0     1.0     0.0                   0.0   
1     1.0       1.0        1.0  27.0     1.0     0.0                   0.0   
2     0.0       0.0        1.0  22.0     0.0     0.0                   0.0   
3     0.0       0.0        1.0  38.0     0.0     0.0                   0.0   
4     0.0       0.0        1.0  32.0     0.0     0.0                   0.0   
5     0.0       0.0        1.0  30.0     0.0     0.0                   0.0   
6     1.0       1.0        1.0  36.0     1.0     0.0                   1.0   
7     0.0       0.0        1.0  35.0     0.0     0.0                   0.0   
8     0.0       0.0        1.0  19.0     0.0     0.0                   0.0   
9     1.0       1.0        1.0  27.0     1.0     0.0                   0.0   

   PhysActivity  Fruits  Veggies  ...  AnyHealthcare  NoDocbcCost  GenHlth  \
0           1.0     0.0      1.0

## Code for Decision Tree


In [4]:
def run_decision_tree(param_grid, folds=10):
    """Tune and score a DecisionTreeClassifier with K-fold cross-validation.

    For every outer fold a ``GridSearchCV`` (3-fold, accuracy-scored) is fitted
    on the training part, and its best estimator is scored on the held-out
    part. Parameters, accuracy and confusion matrix are printed per fold,
    followed by a summary (mean/std accuracy, accuracy on the full data,
    elapsed time, reported parameters).

    The model and parameters reported in the summary come from the fold with
    the highest held-out accuracy (the first such fold on ties), rather than
    from whichever fold happened to run last.

    Reads the notebook-level ``X`` (features) and ``y`` (target).

    Args:
        param_grid: Parameter grid handed to ``GridSearchCV``.
        folds: Number of outer ``KFold`` splits (default 10).

    Returns:
        None. All results are printed.
    """
    start_time = time.time()
    # Outer cross-validation; shuffled with a fixed seed so runs are repeatable.
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)

    # Held-out accuracy of every outer fold.
    fold_accuracies = []

    # Best fold seen so far (by held-out accuracy) and what it produced.
    best_fold_accuracy = None
    best_params_overall = None
    final_model = None

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Inner 3-fold grid search runs on the training part only.
        clf_en = DecisionTreeClassifier(random_state=0)
        grid_search = GridSearchCV(clf_en, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        print(f"Fold {fold}: Best parameters: {best_params}")

        # Score the tuned model on the part of the data it has not seen.
        y_pred_en = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred_en)
        fold_accuracies.append(accuracy)

        # Keep the model/parameters of the best-scoring fold instead of
        # overwriting them unconditionally on every iteration.
        if best_fold_accuracy is None or accuracy > best_fold_accuracy:
            best_fold_accuracy = accuracy
            best_params_overall = best_params
            final_model = best_model

        print(f"Fold {fold}: Accuracy: {accuracy * 100:.2f}%")
        cm = confusion_matrix(y_test, y_pred_en)
        print(f"Fold {fold}: Confusion Matrix:\n{cm}\n")

    # Aggregate the per-fold held-out accuracies.
    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)

    # NOTE: final_model was fitted on the training part of its fold, so most of
    # X was seen during training; this figure is optimistic compared with the
    # cross-validated mean above.
    final_accuracy = accuracy_score(y, final_model.predict(X))
    end_time = time.time()
    execution_time = end_time - start_time

    print(f"Final mean accuracy over {folds} folds: {mean_accuracy * 100:.2f}%")
    print(f"Final standard deviation of accuracy over {folds} folds: {std_accuracy * 100:.2f}%")
    print(f"Final accuracy of the model on the entire dataset: {final_accuracy * 100:.2f}%")
    print(f"Total time taken for execution: {execution_time:.2f} seconds")
    # Parameters selected by the grid search in the best-scoring fold.
    print(f"\nBest parameters for the model: {best_params_overall}")

    # Full parameter set (including defaults) of that same model.
    chosen_parameters = final_model.get_params()
    print(f"\nChosen parameters for the best model: {chosen_parameters}")

## Code for Random Forest

In [5]:
def run_random_forest(param_grid, folds=10):
    """Tune and score a RandomForestClassifier with K-fold cross-validation.

    For every outer fold a ``GridSearchCV`` (3-fold, accuracy-scored) is fitted
    on the training part, and its best estimator is scored on the held-out
    part. Parameters, accuracy and confusion matrix are printed per fold,
    followed by a summary (mean/std accuracy, accuracy on the full data,
    elapsed time, reported parameters).

    The model and parameters reported in the summary come from the fold with
    the highest held-out accuracy (the first such fold on ties), rather than
    from whichever fold happened to run last.

    Reads the notebook-level ``X`` (features) and ``y`` (target).

    Args:
        param_grid: Parameter grid handed to ``GridSearchCV``.
        folds: Number of outer ``KFold`` splits (default 10).

    Returns:
        None. All results are printed.
    """
    start_time = time.time()
    # Outer cross-validation; shuffled with a fixed seed so runs are repeatable.
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)

    # Held-out accuracy of every outer fold.
    fold_accuracies = []

    # Best fold seen so far (by held-out accuracy) and what it produced.
    best_fold_accuracy = None
    best_params_overall = None
    final_model = None

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Inner 3-fold grid search runs on the training part only.
        clf_en = RandomForestClassifier(random_state=42)
        grid_search = GridSearchCV(clf_en, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        print(f"Fold {fold}: Best parameters: {best_params}")

        # Score the tuned model on the part of the data it has not seen.
        y_pred_en = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred_en)
        fold_accuracies.append(accuracy)

        # Keep the model/parameters of the best-scoring fold instead of
        # overwriting them unconditionally on every iteration.
        if best_fold_accuracy is None or accuracy > best_fold_accuracy:
            best_fold_accuracy = accuracy
            best_params_overall = best_params
            final_model = best_model

        print(f"Fold {fold}: Accuracy: {accuracy * 100:.2f}%")
        cm = confusion_matrix(y_test, y_pred_en)
        print(f"Fold {fold}: Confusion Matrix:\n{cm}\n")

    # Aggregate the per-fold held-out accuracies.
    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)

    # NOTE: final_model was fitted on the training part of its fold, so most of
    # X was seen during training; this figure is optimistic compared with the
    # cross-validated mean above.
    final_accuracy = accuracy_score(y, final_model.predict(X))
    end_time = time.time()
    execution_time = end_time - start_time

    print(f"Final mean accuracy over {folds} folds: {mean_accuracy * 100:.2f}%")
    print(f"Final standard deviation of accuracy over {folds} folds: {std_accuracy * 100:.2f}%")
    print(f"Final accuracy of the model on the entire dataset: {final_accuracy * 100:.2f}%")
    print(f"Total time taken for execution: {execution_time:.2f} seconds")
    # Parameters selected by the grid search in the best-scoring fold.
    print(f"\nBest parameters for the model: {best_params_overall}")

    # Full parameter set (including defaults) of that same model.
    chosen_parameters = final_model.get_params()
    print(f"\nChosen parameters for the best model: {chosen_parameters}")

## Runs of Decision Tree

In [13]:

# Baseline decision-tree search space: entropy splits combined with depth,
# split-size, leaf-size and cost-complexity pruning candidates.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    ccp_alpha=[0.0, 0.01, 0.1],
)
# First pass uses the function's default fold count; the second pass uses 20 folds.
run_decision_tree(param_grid)
print('With 20 folds:')
run_decision_tree(param_grid, folds=20)


Fold 1: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Fold 1: Accuracy: 72.14%
Fold 1: Confusion Matrix:
[[1671  864]
 [ 529 1936]]

Fold 2: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
Fold 2: Accuracy: 73.60%
Fold 2: Confusion Matrix:
[[1790  746]
 [ 574 1890]]

Fold 3: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
Fold 3: Accuracy: 74.28%
Fold 3: Confusion Matrix:
[[1866  639]
 [ 647 1848]]

Fold 4: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Fold 4: Accuracy: 74.08%
Fold 4: Confusion Matrix:
[[1693  781]
 [ 515 2011]]

Fold 5: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Fold 5: Accuracy: 75.08

In [14]:
# Same idea as the previous grid, with a deeper depth option and a
# max_features dimension (all features, sqrt, log2) added.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    ccp_alpha=[0.0, 0.01, 0.1],
    max_features=[None, 'sqrt', 'log2'],
)

# First pass uses the function's default fold count; the second pass uses 20 folds.
run_decision_tree(param_grid)
print('With 20 folds:')
run_decision_tree(param_grid, folds=20)


Fold 1: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Fold 1: Accuracy: 72.14%
Fold 1: Confusion Matrix:
[[1671  864]
 [ 529 1936]]

Fold 2: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Fold 2: Accuracy: 73.60%
Fold 2: Confusion Matrix:
[[1790  746]
 [ 574 1890]]

Fold 3: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
Fold 3: Accuracy: 73.88%
Fold 3: Confusion Matrix:
[[1875  630]
 [ 676 1819]]

Fold 4: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
Fold 4: Accuracy: 73.92%
Fold 4: Confusion Matrix:
[[1701  773]
 [ 531 1995]]

Fold 5: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', '

In [16]:
# Finer-grained grid: more depth, split-size, leaf-size and pruning candidates.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[None, 4, 6, 8, 12],
    min_samples_split=[2, 3, 5, 8],
    min_samples_leaf=[1, 2, 3, 5],
    ccp_alpha=[0.0, 0.005, 0.01, 0.1],
    max_features=[None, 'sqrt', 'log2'],
)
# First pass uses the function's default fold count; the second pass uses 20 folds.
run_decision_tree(param_grid)
print('With 20 folds:')
run_decision_tree(param_grid, folds=20)


Fold 1: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 8}
Fold 1: Accuracy: 73.40%
Fold 1: Confusion Matrix:
[[1820  715]
 [ 615 1850]]

Fold 2: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 8}
Fold 2: Accuracy: 74.10%
Fold 2: Confusion Matrix:
[[1854  682]
 [ 613 1851]]

Fold 3: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 8}
Fold 3: Accuracy: 74.40%
Fold 3: Confusion Matrix:
[[1881  624]
 [ 656 1839]]

Fold 4: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2}
Fold 4: Accuracy: 74.78%
Fold 4: Confusion Matrix:
[[1819  655]
 [ 606 1920]]

Fold 5: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max

In [17]:
# Grid restricted to feature-subsampled splits (sqrt / log2 only) with small
# pruning strengths.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[5, 7, 9, None],
    min_samples_split=[2, 4, 6],
    min_samples_leaf=[1, 3, 5],
    ccp_alpha=[0.0, 0.002, 0.005, 0.1],
    max_features=['sqrt', 'log2'],
)
# First pass uses the function's default fold count; the second pass uses 20 folds.
run_decision_tree(param_grid)
print('With 20 folds:')
run_decision_tree(param_grid, folds=20)


Fold 1: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
Fold 1: Accuracy: 70.96%
Fold 1: Confusion Matrix:
[[1680  855]
 [ 597 1868]]

Fold 2: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4}
Fold 2: Accuracy: 72.70%
Fold 2: Confusion Matrix:
[[1726  810]
 [ 555 1909]]

Fold 3: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 6}
Fold 3: Accuracy: 74.18%
Fold 3: Confusion Matrix:
[[1764  741]
 [ 550 1945]]

Fold 4: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2}
Fold 4: Accuracy: 71.66%
Fold 4: Confusion Matrix:
[[1635  839]
 [ 578 1948]]

Fold 5: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entrop

In [18]:
# Widest decision-tree grid: six depth options, five split-size and five
# leaf-size options, four pruning strengths, three max_features settings.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[None, 4, 5, 6, 8, 10],
    min_samples_split=[2, 3, 4, 5, 6],
    min_samples_leaf=[1, 2, 3, 4, 5],
    ccp_alpha=[0.0, 0.001, 0.005, 0.02],
    max_features=[None, 'sqrt', 'log2'],
)
# First pass uses the function's default fold count; the second pass uses 20 folds.
run_decision_tree(param_grid)
print('With 20 folds:')
run_decision_tree(param_grid, folds=20)


Fold 1: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
Fold 1: Accuracy: 73.36%
Fold 1: Confusion Matrix:
[[1816  719]
 [ 613 1852]]

Fold 2: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Fold 2: Accuracy: 74.10%
Fold 2: Confusion Matrix:
[[1854  682]
 [ 613 1851]]

Fold 3: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Fold 3: Accuracy: 74.42%
Fold 3: Confusion Matrix:
[[1881  624]
 [ 655 1840]]

Fold 4: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2}
Fold 4: Accuracy: 74.78%
Fold 4: Confusion Matrix:
[[1819  655]
 [ 606 1920]]

Fold 5: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max

In [19]:
# Compact grid: no pruning or max_features dimension, only depth and two
# options each for the split and leaf sizes.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[None, 3, 7, 9],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 3],
)
# First pass uses the function's default fold count; the second pass uses 20 folds.
run_decision_tree(param_grid)
print('With 20 folds:')
run_decision_tree(param_grid, folds=20)

## This parameter grid gives the best result with the Decision Tree

Fold 1: Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 2}
Fold 1: Accuracy: 73.42%
Fold 1: Confusion Matrix:
[[1788  747]
 [ 582 1883]]

Fold 2: Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 4}
Fold 2: Accuracy: 73.98%
Fold 2: Confusion Matrix:
[[1789  747]
 [ 554 1910]]

Fold 3: Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 2}
Fold 3: Accuracy: 74.38%
Fold 3: Confusion Matrix:
[[1869  636]
 [ 645 1850]]

Fold 4: Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 2}
Fold 4: Accuracy: 74.64%
Fold 4: Confusion Matrix:
[[1732  742]
 [ 526 2000]]

Fold 5: Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 2}
Fold 5: Accuracy: 74.94%
Fold 5: Confusion Matrix:
[[1731  699]
 [ 554 2016]]

Fold 6: Best parameters: {'criterion': '

In [20]:
# Depth sweep from 6 to 10 with the same two split-size and leaf-size options.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[6, 7, 8, 9, 10],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 3],
)
# First pass uses the function's default fold count; the second pass uses 20 folds.
run_decision_tree(param_grid)
print('With 20 folds:')
run_decision_tree(param_grid, folds=20)


Fold 1: Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 2}
Fold 1: Accuracy: 73.42%
Fold 1: Confusion Matrix:
[[1788  747]
 [ 582 1883]]

Fold 2: Best parameters: {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 2}
Fold 2: Accuracy: 74.10%
Fold 2: Confusion Matrix:
[[1854  682]
 [ 613 1851]]

Fold 3: Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 2}
Fold 3: Accuracy: 74.38%
Fold 3: Confusion Matrix:
[[1869  636]
 [ 645 1850]]

Fold 4: Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 2}
Fold 4: Accuracy: 74.64%
Fold 4: Confusion Matrix:
[[1732  742]
 [ 526 2000]]

Fold 5: Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 2}
Fold 5: Accuracy: 74.94%
Fold 5: Confusion Matrix:
[[1731  699]
 [ 554 2016]]

Fold 6: Best parameters: {'criterion': '

## Runs with Random Forest

In [6]:
# Baseline random-forest search space: entropy splits with depth, split-size,
# leaf-size and pruning candidates. Evaluated with the default fold count only.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    ccp_alpha=[0.0, 0.01, 0.1],
)
run_random_forest(param_grid)


Fold 1: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Fold 1: Accuracy: 74.24%
Fold 1: Confusion Matrix:
[[1761  774]
 [ 514 1951]]

Fold 2: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Fold 2: Accuracy: 74.62%
Fold 2: Confusion Matrix:
[[1795  741]
 [ 528 1936]]

Fold 3: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Fold 3: Accuracy: 75.72%
Fold 3: Confusion Matrix:
[[1831  674]
 [ 540 1955]]

Fold 4: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Fold 4: Accuracy: 75.12%
Fold 4: Confusion Matrix:
[[1757  717]
 [ 527 1999]]

Fold 5: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Fold 5: Accuracy: 75.

In [7]:
# Random-forest grid with deeper trees (10, 20, unlimited) and an extra small
# pruning strength. Evaluated with the default fold count only.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    ccp_alpha=[0.0, 0.01, 0.1, 0.001],
)
run_random_forest(param_grid)


Fold 1: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Fold 1: Accuracy: 74.24%
Fold 1: Confusion Matrix:
[[1761  774]
 [ 514 1951]]

Fold 2: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Fold 2: Accuracy: 74.62%
Fold 2: Confusion Matrix:
[[1795  741]
 [ 528 1936]]

Fold 3: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Fold 3: Accuracy: 75.72%
Fold 3: Confusion Matrix:
[[1831  674]
 [ 540 1955]]

Fold 4: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Fold 4: Accuracy: 75.12%
Fold 4: Confusion Matrix:
[[1757  717]
 [ 527 1999]]

Fold 5: Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Fold 5: Accuracy: 75.

In [9]:
# Random-forest grid without pruning: deeper trees plus larger split-size and
# leaf-size candidates, evaluated with 20 outer folds.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 20, 10],
    min_samples_leaf=[1, 8, 4],
)
run_random_forest(param_grid, folds=20)




Fold 1: Best parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Fold 1: Accuracy: 73.92%
Fold 1: Confusion Matrix:
[[876 388]
 [264 972]]





Fold 2: Best parameters: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 8, 'min_samples_split': 2}
Fold 2: Accuracy: 74.40%
Fold 2: Confusion Matrix:
[[879 392]
 [248 981]]

Fold 3: Best parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Fold 3: Accuracy: 75.04%
Fold 3: Confusion Matrix:
[[895 361]
 [263 981]]

Fold 4: Best parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 20}
Fold 4: Accuracy: 74.80%
Fold 4: Confusion Matrix:
[[905 375]
 [255 965]]

Fold 5: Best parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Fold 5: Accuracy: 76.80%
Fold 5: Confusion Matrix:
[[957 329]
 [251 963]]

Fold 6: Best parameters: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 8, 'min_samples_split': 20}
Fold 6: Accuracy: 75.32%
Fold 6: Confusion Matrix:
[[ 882  337]
 [ 280 1001]]

Fold 7: Best parameters: {'criterion': 'entropy

## Result
Judging from the runs above, Random Forest gives better results than the Decision Tree. Therefore, I will use the best-accuracy configuration from the Random Forest runs to build the final model.

In [5]:
import pickle

# Fixed hyperparameters for the final random forest (single values, not a search grid).
param_grid = dict(
    criterion='entropy',
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=4,
)

# Build the classifier by unpacking the chosen settings; the seed is fixed for repeatability.
rf_model = RandomForestClassifier(random_state=42, **param_grid)

# Train on the complete feature matrix X and target y.
rf_model.fit(X, y)

# Persist the fitted model to disk as a pickle file.
model_filename = 'result_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(rf_model, file)

print(f"Model saved to {model_filename}")


NameError: name 'X' is not defined

## Testing

In [9]:
# Reload the persisted model and score it on a separate labelled CSV.
model_filename = 'result_model.pkl'
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load model files that were produced by this notebook / a trusted source.
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)

test_data = pd.read_csv('test kunal.csv')

# Separate the label column 'Has_diabetes' from the feature columns.
X_test = test_data.drop('Has_diabetes', axis=1)  # Features
y_test = test_data['Has_diabetes']  # Target column

y_pred_test = loaded_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Accuracy of the loaded model on the test set: {test_accuracy * 100:.2f}%")

# confusion_matrix is already imported in the notebook's import cell, so it is
# not re-imported here.
cm = confusion_matrix(y_test, y_pred_test)
print(f"Confusion Matrix on Test Data:\n{cm}")

Accuracy of the loaded model on the test set: 0.00%
Confusion Matrix on Test Data:
[[0 0]
 [1 0]]
