# Diabetes Prediction Model with PCA

## Part 1: Import Libraries and Load Data

In [8]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv('diabetes_prediction_dataset.csv')

# Display the first few rows of the dataset
print(df.head())


   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


## Part 2: Data Preprocessing

In [9]:

# Handle missing data
df = df.dropna()  # Dropping rows with missing values for simplicity

# Encode categorical variables
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df['hypertension'] = le.fit_transform(df['hypertension'])
df['heart_disease'] = le.fit_transform(df['heart_disease'])
df['smoking_history'] = le.fit_transform(df['smoking_history'])

# Prepare data for classification
X = df.drop(['diabetes', 'blood_glucose_level'], axis=1)  # Exclude both target and blood glucose level for features
y = df['diabetes']  # Target variable

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Data preprocessing completed.")


Data preprocessing completed.


## Part 3: Applying PCA

In [10]:

# Reduce to 2 principal components for simplicity, or choose a number based on variance explained
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Display the explained variance ratio of the PCA components
print("Explained Variance Ratio of PCA components:", pca.explained_variance_ratio_)


Explained Variance Ratio of PCA components: [0.25094205 0.16036498]


## Part 4: Train-Test Split with PCA-transformed Data

In [11]:

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

print("Train-test split with PCA completed.")


Train-test split with PCA completed.


## Part 5: Model Training

In [12]:

# Train classification model
clf = LogisticRegression()
clf.fit(X_train, y_train)

print("Model training completed.")


Model training completed.


## Part 6: Model Evaluation

In [13]:

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the classification model
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.96     18292
           1       0.54      0.14      0.22      1708

    accuracy                           0.92     20000
   macro avg       0.73      0.56      0.59     20000
weighted avg       0.89      0.92      0.89     20000

Accuracy Score: 0.9165


## Part 7: Predict for New Data

In [14]:

# Prepare new data for prediction (without the blood glucose level)
new_data = pd.DataFrame({
    'gender': [1],  # Male
    'age': [45],
    'hypertension': [1],  # Yes
    'heart_disease': [0],  # No
    'smoking_history': [0],  # No
    'bmi': [27.5],
    'HbA1c_level': [6.2]
})

# Preprocess and scale the new data using the same scaler
new_data_scaled = scaler.transform(new_data)

# Apply PCA transformation to the new scaled data
new_data_pca = pca.transform(new_data_scaled)

# Predict diabetes for the new data
diabetes_prediction = clf.predict(new_data_pca)
print("Diabetes Prediction (1=Yes, 0=No):", diabetes_prediction)


Diabetes Prediction (1=Yes, 0=No): [0]


## Additional Prediction: High-Risk Case

In [15]:

# Example case where diabetes is likely to be 1
new_data_high_risk = pd.DataFrame({
    'gender': [1],  # Male
    'age': [60],
    'hypertension': [1],  # Yes
    'heart_disease': [1],  # Yes
    'smoking_history': [0],  # No
    'bmi': [30.5],  # Obese
    'HbA1c_level': [7.5]  # Poor blood sugar control
})

# Preprocess and scale the new high-risk data using the same scaler
new_data_high_risk_scaled = scaler.transform(new_data_high_risk)

# Apply PCA transformation to the high-risk data
new_data_high_risk_pca = pca.transform(new_data_high_risk_scaled)

# Predict diabetes for the new high-risk data
diabetes_prediction_high_risk = clf.predict(new_data_high_risk_pca)
print("Diabetes Prediction for High-Risk Case (1=Yes, 0=No):", diabetes_prediction_high_risk)


Diabetes Prediction for High-Risk Case (1=Yes, 0=No): [1]
