# Final project

# Project Title: Diabetes Prediction Using AI and Machine Learning Algorithms


### Utilities


In [None]:
def load_pima_diabetes_data(url: str, column_names: list) -> pd.DataFrame:
    """Read the Pima Indians Diabetes CSV at ``url`` into a DataFrame.

    Parameters:
        url: Location of the CSV, handed straight to ``pd.read_csv``.
        column_names: Column labels applied through ``names=``.

    Returns:
        The parsed DataFrame, or an empty DataFrame when reading raised any
        ``Exception`` (the error is printed instead of being propagated).
    """
    try:
        frame = pd.read_csv(url, names=column_names)
    except Exception as err:
        # Best-effort loader: report the problem and hand back an empty
        # frame so the caller can test ``.empty`` rather than catch errors.
        print(f"Failed to load dataset: {err}")
        return pd.DataFrame()

    print("Dataset loaded successfully.")
    return frame

In [None]:
def plot_feature_correlation_heatmap(dataframe, figsize=(10, 8), cmap='coolwarm'):
    """Draw an annotated heatmap of the pairwise correlations between columns.

    Every cell is coloured according to the value ``dataframe.corr()`` reports
    for that pair of columns and is labelled with the value to two decimals,
    which makes the strength of the relationships easy to compare at a glance.

    Parameters:
        dataframe: Object providing ``.corr()`` (e.g. a pandas DataFrame).
        figsize: Figure size passed to ``plt.figure``.
        cmap: Colour map passed to ``sns.heatmap``.

    Returns:
        None. ``plt.show()`` is not called here; the figure is left on the
        current matplotlib state for the caller to display or save.
    """
    # Compute the correlations first so that a failure here does not leave an
    # empty figure behind.
    corr_values = dataframe.corr()

    # Fixed cell styling: numeric labels with two decimals, thin separators
    # between cells, and square cells.
    cell_style = {"annot": True, "fmt": ".2f", "linewidths": 0.5, "square": True}

    plt.figure(figsize=figsize)
    sns.heatmap(corr_values, cmap=cmap, **cell_style)

    plt.title("Feature Correlation Heatmap", fontsize=14)
    # Slanted, right-aligned x tick labels; horizontal y tick labels.
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

## Load data and preprocessing

## Principal Model - build model, train&test, performance

## Additional Model 1 - build model, train&test, performance

## Additional Model 2 - build model, train&test, performance

In [None]:
# The train_knn_classifier function is like our personal coach for building a K-Nearest Neighbors (KNN) model.
# It takes our training data and teaches the model how to recognize patterns
# — so later, it can predict whether someone has diabetes based on their health features.

def train_knn_classifier(X_train, y_train, n_neighbors=5):
    """
    Build a K-Nearest Neighbors classifier and fit it to the training data.

    Parameters:
        X_train (array-like): Scaled training features — the model inputs,
            e.g. glucose level, age, BMI.
        y_train (array-like): Training labels — the known answers, i.e.
            whether each person has diabetes (1) or not (0).
        n_neighbors (int): How many of the closest neighbors the model
            consults when making a prediction (default is 5).

    Returns:
        KNeighborsClassifier: The fitted KNN model.
    """
    # ``fit`` hands back the estimator itself, so building and training can
    # be expressed as a single chained expression.
    return KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)

In [None]:
# The display_classification_report function is like our model's performance summary sheet.
# After our model makes predictions, it prints out a detailed report showing how well it did —
# not just overall, but for each class (like diabetic vs non-diabetic).
def display_classification_report(y_true, y_pred, target_names=None):
    """
    Print a formatted classification report for a set of predictions.

    The report is the text table produced by ``classification_report`` and
    lists, for each class:

    * Precision: how often the model was right when it predicted the class.
    * Recall: how many of the actual cases of the class the model identified.
    * F1-score: a balance between precision and recall.
    * Support: how many examples of the class were present.

    Parameters:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels.
        target_names (list, optional): Readable names that replace the default
            0/1 class labels (e.g., ['No Diabetes', 'Diabetes']).

    Returns:
        None. The report is written to standard output.
    """
    # The header emoji had been stored as mojibake (its UTF-8 bytes decoded
    # with the wrong codec); the intended clipboard character is restored so
    # the printed heading is readable.
    print("\n📋 Classification Report:\n")
    report = classification_report(y_true, y_pred, target_names=target_names)
    print(report)

In [None]:
# Quick look at the loaded data: the summary table from ``describe()``,
# followed by how many rows fall under each value of the 'Outcome' column.
# NOTE(review): ``df`` is not created in the cells visible here — confirm the
# data-loading cell runs before this one.
print(df.describe())
print(df['Outcome'].value_counts())

In [None]:
# Train on the scaled training split and keep the result for the cells below.
# NOTE(review): train_logistic_regression is defined outside this section —
# presumably it returns a fitted estimator; confirm against its definition.
logistic_regression_model = train_logistic_regression(X_train_scaled, y_train)

In [None]:
# Run the trained model on the scaled test split; the helper's result is
# unpacked into two values.
# NOTE(review): make_predictions is defined elsewhere — y_pred / y_prob are
# presumably predicted labels and predicted probabilities; confirm.
y_pred, y_prob = make_predictions(logistic_regression_model, X_test_scaled)

In [None]:
# Score the predictions against the held-out test labels.
# NOTE(review): evaluate_model_performance is defined elsewhere; which metrics
# it reports (and whether it returns anything) is not visible from this cell.
evaluate_model_performance(y_test, y_pred, y_prob)

In [None]:
# Print the per-class report for the predictions above, with readable class
# names in place of the raw labels.
# Assumes label 0 means "no diabetes" and label 1 means "diabetes" — TODO confirm.
display_classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes'])