In [1]:
# Mount Google Drive at /content/drive so that files stored there can be read
# through ordinary filesystem paths by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Download the Pima Indians Diabetes Dataset:
Available from sources like Kaggle.
Contains columns such as:
Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
DiabetesPedigreeFunction, Age, Outcome (whether the patient has
diabetes or not).
Regression Task:
Predict the Blood Pressure of the patients based on other features.
Use Linear Regression model from Scikit-learn.
Classification Task:
Predict whether the patient has diabetes (target column: Outcome).
Use Logistic Regression or K-Nearest Neighbors (KNN) model.
Once built, evaluate each model appropriately.

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset from the mounted Google Drive folder.
df = pd.read_csv('/content/drive/MyDrive/Concepts and Technologies of AI./diabetes.csv')  # Replace with the correct file path

# Inspect dtypes / non-null counts and basic statistics of the raw data.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

# Columns in which a literal 0 is treated as a missing measurement.
# Only the columns actually present in the frame are processed, so a CSV with
# different headers produces a warning instead of a KeyError.
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
valid_columns = [col for col in columns_to_replace if col in df.columns]

if len(valid_columns) < len(columns_to_replace):
    print(f"Some columns were not found: {set(columns_to_replace) - set(valid_columns)}")

# Replace zeros in the valid columns with NaN so they can be imputed below.
if valid_columns:
    df[valid_columns] = df[valid_columns].replace(0, np.nan)

# Fill every missing value with its column median.
# Reassigning (instead of inplace=True) keeps the step explicit, and
# numeric_only=True keeps the median well-defined if the CSV ever carries a
# non-numeric column.
# NOTE(review): the medians are computed over the full dataset, i.e. before the
# train/test splits performed in the later cells, and 'BloodPressure' (the
# regression target) is imputed as well - confirm this is acceptable.
df = df.fillna(df.median(numeric_only=True))




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std    

Regression Task: Predicting Blood Pressure

In [11]:
# Column the linear regression model is asked to predict.
target_column = 'BloodPressure'

if target_column not in df.columns:
    # Without the target column there is nothing to fit.
    print(f"Column '{target_column}' not found in the dataset.")
else:
    # Target vector, and feature matrix made of every other column.
    y_reg = df[target_column]
    X_reg = df.drop(columns=[target_column])

    # Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
        X_reg, y_reg, test_size=0.2, random_state=42
    )

    # Learn the scaling parameters on the training rows only, then apply them
    # to both partitions.
    scaler = StandardScaler().fit(X_train_reg)
    X_train_reg = scaler.transform(X_train_reg)
    X_test_reg = scaler.transform(X_test_reg)

    # Fit ordinary least squares on the standardized training features.
    reg_model = LinearRegression().fit(X_train_reg, y_train_reg)

    # Score the model on the held-out rows.
    y_pred_reg = reg_model.predict(X_test_reg)
    print("Regression Task Evaluation:")
    print("Mean Squared Error:", mean_squared_error(y_test_reg, y_pred_reg))
    print("R2 Score:", r2_score(y_test_reg, y_pred_reg))



Regression Task Evaluation:
Mean Squared Error: 112.13352966536159
R2 Score: 0.2297687968567813


Classification Task: Predicting Diabetes Outcome

In [12]:
# Classification needs the 'Outcome' label column; bail out early if it is absent.
if 'Outcome' not in df.columns:
    print("Column 'Outcome' not found in the dataset.")
else:
    # Label vector, and feature matrix made of every other column.
    y_clf = df['Outcome']
    X_clf = df.drop(columns=['Outcome'])

    # Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
    X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
        X_clf, y_clf, test_size=0.2, random_state=42
    )

    # Learn the scaling parameters on the training rows only, then apply them
    # to both partitions.
    scaler = StandardScaler().fit(X_train_clf)
    X_train_clf = scaler.transform(X_train_clf)
    X_test_clf = scaler.transform(X_test_clf)

    # Logistic regression, with the iteration cap raised to 1000.
    log_reg = LogisticRegression(max_iter=1000).fit(X_train_clf, y_train_clf)
    y_pred_log = log_reg.predict(X_test_clf)

    # K-nearest neighbours with k=5.
    knn_clf = KNeighborsClassifier(n_neighbors=5).fit(X_train_clf, y_train_clf)
    y_pred_knn = knn_clf.predict(X_test_clf)

    # Report accuracy and the per-class metrics for each model, in the same
    # order as they were trained.
    for heading, predictions in (
        ("Logistic Regression Evaluation:", y_pred_log),
        ("KNN Evaluation:", y_pred_knn),
    ):
        print(heading)
        print("Accuracy:", accuracy_score(y_test_clf, predictions))
        print(classification_report(y_test_clf, predictions))


Logistic Regression Evaluation:
Accuracy: 0.7532467532467533
              precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154

KNN Evaluation:
Accuracy: 0.7207792207792207
              precision    recall  f1-score   support

           0       0.80      0.75      0.77        99
           1       0.60      0.67      0.63        55

    accuracy                           0.72       154
   macro avg       0.70      0.71      0.70       154
weighted avg       0.73      0.72      0.72       154

