In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the dataset
df = pd.read_csv('diabetes_prediction_dataset.csv')

# Handle missing data: any row containing a missing value is dropped for simplicity
df = df.dropna()

# Encode categorical variables.
# One LabelEncoder is fitted per column and stored in `encoders`, so the
# label -> code mapping of every column remains available after this cell
# (e.g. `encoders['gender'].classes_`, or `.transform` / `.inverse_transform`
# for new samples). Re-fitting a single shared encoder would keep only the
# mapping of the last column.
# LabelEncoder numbers the values in sorted order; a column that is already
# 0/1 therefore keeps its values, and a column with more than two categories
# receives more than two codes.
# NOTE(review): inspect `classes_` to confirm what each code stands for.
encoders = {}
for column in ['gender', 'hypertension', 'heart_disease', 'smoking_history']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le
# `le` is left bound to the last fitted encoder ('smoking_history').

# Split the data into features (X) and target (y)
X = df.drop('diabetes', axis=1)  # All features except the target
y = df['diabetes']  # Target is 'diabetes'

# Scale features
# NOTE: the scaler is fitted on every row of X. If X_scaled is later divided
# into train/test sets, the test rows have already influenced the mean and
# variance used for scaling; fitting on the training rows only avoids that.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [2]:
# Split into training and testing sets.
# The split is made on the unscaled features and the scaler is then fitted on
# the training rows only, so the test set's mean and variance never influence
# the preprocessing. Splitting an array that was standardised on all rows
# would leak test-set statistics into training.
# The same random_state on the same number of rows yields the same partition
# as splitting the pre-scaled array would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Build a logistic-regression classifier with default settings and fit it on
# the training split (fit() returns the estimator itself).
clf = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for the held-out rows
y_pred = clf.predict(X_test)

# Report per-class precision/recall/F1 and the overall accuracy
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Classification Report:\n", report)
print("Accuracy Score:", accuracy)


Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98     18292
           1       0.86      0.61      0.72      1708

    accuracy                           0.96     20000
   macro avg       0.91      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000

Accuracy Score: 0.9587


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Regression task: estimate 'blood_glucose_level' from the remaining columns.
# Both the regression target and the 'diabetes' label are removed from the inputs.
X_reg = df.drop(columns=['diabetes', 'blood_glucose_level'])
y_reg = df['blood_glucose_level']

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares model on the training rows
reg = LinearRegression().fit(X_train_reg, y_train_reg)

# Score the held-out rows and report the mean squared error
y_pred_reg = reg.predict(X_test_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
print("Mean Squared Error (MSE):", mse)


Mean Squared Error (MSE): 1572.5656499043525


In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error

# Load the dataset
df = pd.read_csv('diabetes_prediction_dataset.csv')

# Handle missing data: any row containing a missing value is dropped for simplicity
df = df.dropna()

# Encode categorical variables.
# One LabelEncoder per column is kept in `encoders` so that the label -> code
# mapping of every column can be inspected (`encoders[col].classes_`) and
# reused for new samples. LabelEncoder numbers the values in sorted order.
encoders = {}
for column in ['gender', 'hypertension', 'heart_disease', 'smoking_history']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le
# `le` is left bound to the last fitted encoder ('smoking_history').

# Features shared by both models: everything except the two targets
X = df.drop(['diabetes', 'blood_glucose_level'], axis=1)
y = df['diabetes']  # Classification target
y_reg = df['blood_glucose_level']  # Regression target
X_reg = X  # The regression uses exactly the same feature columns

# Split once for both tasks. Passing both targets to a single call yields the
# same row partition as two separate calls with the same random_state.
X_train_raw, X_test_raw, y_train, y_test, y_train_reg, y_test_reg = train_test_split(
    X, y, y_reg, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into the preprocessing, then apply it to both splits.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_train_reg, X_test_reg = X_train, X_test

# Full scaled matrix, kept under its original names for any later use.
# It is produced with the train-fitted scaler and is not used for fitting.
X_scaled = scaler.transform(X)
X_reg_scaled = X_scaled

# Train classification model
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Train regression model
reg = LinearRegression()
reg.fit(X_train_reg, y_train_reg)

# Make predictions
y_pred_reg = reg.predict(X_test_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
print("Mean Squared Error (MSE):", mse)

# Predict for new data.
# The categorical fields below are integer codes as produced by the encoders
# above, not raw labels; `encoders[<column>].classes_[code]` gives the label
# that each code stands for.
# NOTE(review): the previous inline comments described 0 as both "Yes" and
# "No" (and 1 as "No"), so they could not all be correct. The values are left
# unchanged; confirm the intended patient profile against `classes_`.
new_data = pd.DataFrame({
    'gender': [0],
    'age': [67],
    'hypertension': [0],
    'heart_disease': [1],
    'smoking_history': [0],
    'bmi': [27.19],
    'HbA1c_level': [6.7]
})

# Put the columns in the exact order the scaler and models were fitted with
new_data = new_data[X.columns]

# Preprocess and scale the new data using the same scaler
new_data_scaled = scaler.transform(new_data)

# Predict diabetes
diabetes_prediction = clf.predict(new_data_scaled)
print("Diabetes Prediction:", diabetes_prediction)

# Predict blood glucose level
blood_glucose_prediction = reg.predict(new_data_scaled)
print("Predicted Blood Glucose Level:", blood_glucose_prediction)


Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     18292
           1       0.84      0.43      0.57      1708

    accuracy                           0.94     20000
   macro avg       0.90      0.71      0.77     20000
weighted avg       0.94      0.94      0.94     20000

Accuracy Score: 0.94465
Mean Squared Error (MSE): 1572.5656499043523
Diabetes Prediction: [0]
Predicted Blood Glucose Level: [153.72222803]
