In [8]:
# diabetes_regression.ipynb

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Step 1: Load the dataset from the CSV file into a DataFrame
DATA_FILE = 'diabetes_updated.csv'
data = pd.read_csv(DATA_FILE)

# Step 2: Separate the dependent variable (target column) from the independent variables
TARGET = 'Outcome'
Y = data[TARGET]
X = data.drop(columns=[TARGET])

# Step 3: Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Standardize the features
# For simplicity every column is assumed to be numeric, so all of them are scaled.
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Fit a multiple linear regression on the standardized training features
# NOTE(review): the captured output suggests the target may be a 0/1 label; if so,
# a classifier would be a better fit than linear regression - confirm.
model = LinearRegression().fit(X_train_scaled, Y_train)

# Step 6: Report the fitted intercept and coefficients
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

# Step 7: Predict the target for the standardized test features
Y_pred = model.predict(X_test_scaled)

# Only the features were standardized; the target `Outcome` was never scaled.
# The predictions are therefore already on the same scale as Y_test, so no inverse transform is needed before comparing them.

# Step 8: Score the test-set predictions with R-squared
r2 = r2_score(y_true=Y_test, y_pred=Y_pred)
print("R-squared:", r2)

# Additional: place the actual values next to the predictions and show the first five rows
comparison = pd.DataFrame(
    {
        'Actual': Y_test,
        'Predicted': Y_pred,
    }
)
print(comparison.head(5))


Intercept: 0.34690553745928343
Coefficients: [ 0.03465559  0.1803234  -0.04219339  0.00820563 -0.03230381  0.11631364
  0.03744793  0.07425473]
R-squared: 0.2550028117674178
     Actual  Predicted
668       0   0.335500
324       0   0.238099
624       0   0.151052
690       0   0.240136
473       0   0.481424
