In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os
from google.cloud import bigquery

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings

In [4]:
# Point the Google client libraries at the credentials file, then create the
# BigQuery client used by the query cells.
# NOTE(review): the path is relative, so it is resolved against the kernel's
# working directory; presumably a service-account key file - keep it out of
# version control.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='biquerykey.json')
client = bigquery.Client()

In [5]:
# SQL text (despite the variable name, this is a query string, not a table
# object) selecting every column of the patients dimension table.
dim_patients_table = (
    "SELECT * "
    "FROM `bigqueryimdb.healthcare.dim_patients`;"
)

In [6]:
# SQL text (a query string, not a table object) selecting every column of the
# visits fact table.
fact_visits_table = (
    "SELECT * "
    "FROM `bigqueryimdb.healthcare.fact_visits`;"
)

In [7]:
# Submit the patients query to BigQuery, then materialise its result set as a
# pandas DataFrame.
patients_query_job = client.query(dim_patients_table)
df_patients = patients_query_job.to_dataframe()

In [8]:
# Submit the visits query to BigQuery, then materialise its result set (the
# patient visits fact table) as a pandas DataFrame.
visits_query_job = client.query(fact_visits_table)
df_visits = visits_query_job.to_dataframe()

In [10]:
# Preview the first five patient rows.
# NOTE(review): the rendered preview includes identifier/contact columns
# (aadhar_number, mobile_number, email, names); clear or redact the output
# before sharing if this is not synthetic data.
df_patients.head(n=5)

Unnamed: 0,patient_id,date_of_birth,aadhar_number,mobile_number,email,first_name,last_name,full_name,gender,height,weight,bmi,area,latitude,longitude
0,1053,1973-04-24,317320301691,917108801293,frose@example.net,Pandian,Govindan,Mr. Pandian Govindan,Male,169,57,20.0,Mandaveli,13.029,80.259
1,1111,1990-06-25,228343284543,914826294247,danielleanderson@example.com,Chandrasekar,Bhagavathi,Mr. Chandrasekar Bhagavathi,Male,157,65,26.4,Mandaveli,13.029,80.259
2,1128,1949-01-06,819907684377,915974958012,jerryfrank@example.net,Naveen,Rathinam,Mr. Naveen Rathinam,Male,151,57,25.0,Mandaveli,13.029,80.259
3,1132,1960-01-13,612453976088,918818220132,moodyryan@example.com,Harish,Senthil,Mr. Harish Senthil,Male,169,96,33.6,Mandaveli,13.029,80.259
4,1212,1967-04-17,384827959428,915647948599,scottjones@example.com,Chandrasekar,Palanisamy,Mr. Chandrasekar Palanisamy,Male,155,75,31.2,Mandaveli,13.029,80.259


In [11]:
# Preview the first five visit rows (one row per patient visit with the
# recorded sugar, hba1c and blood-pressure readings).
df_visits.head(n=5)

Unnamed: 0,patient_id,visited_date,sugar,hba1c,systolic_pressure,diastolic_pressure
0,1967,2024-03-13,85,5.0,96,80
1,2172,2023-11-16,114,5.0,108,80
2,2239,2023-11-12,61,5.0,134,80
3,2749,2023-04-29,94,5.0,102,80
4,3635,2023-08-07,83,5.0,90,80


In [12]:


# Purpose: predict hba1c from the visit-level columns with an ElasticNet
# regression whose hyper-parameters are tuned by a 5-fold grid search, then
# report RMSE / MAE / R2 on a held-out 25% test split.

# NOTE(review): this silences *every* warning for the rest of the kernel
# session (including scikit-learn convergence warnings). Kept for backward
# compatibility with later cells; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


# --- Feature engineering ---------------------------------------------------
# Parse the visit date once into a standalone Series.
visited_date = pd.to_datetime(df_visits['visited_date'])

# Build the modelling frame as a brand-new DataFrame with .assign() rather than
# writing new columns into a column-subset of df_visits: in-place writes on
# such a subset are what pandas reports as SettingWithCopyWarning.
# Resulting column order: patient_id, sugar, hba1c, year, month, day.
df = df_visits[['patient_id', 'sugar', 'hba1c']].assign(
    year=visited_date.dt.year,
    month=visited_date.dt.month,
    day=visited_date.dt.day,
)

# --- Features and target ---------------------------------------------------
# NOTE(review): patient_id is an identifier but is kept as a numeric feature,
# as in the original analysis - confirm this is intended.
# NOTE(review): features are not standardised; ElasticNet's penalty is
# scale-sensitive, so scaling may change which hyper-parameters win.
X = df.drop(columns=['hba1c'])
y = df['hba1c']

# Hold out 25% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)

# --- Hyper-parameter search ------------------------------------------------
model = ElasticNet()

# 4 x 4 = 16 candidate (alpha, l1_ratio) combinations.
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 5.0],
    'l1_ratio': [0.1, 0.5, 0.7, 0.9]
}

# 5-fold CV on the training split only; scikit-learn maximises the score, hence
# the negated MSE. n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

grid_search.fit(train_x, train_y)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# No separate training step is needed: GridSearchCV (refit=True by default)
# has already refit the best configuration on the whole training split.
best_model = grid_search.best_estimator_

# --- Evaluation on the held-out test split ---------------------------------
predicted_values = best_model.predict(test_x)

rmse = np.sqrt(mean_squared_error(test_y, predicted_values))
mae = mean_absolute_error(test_y, predicted_values)
r2 = r2_score(test_y, predicted_values)

print(f"Best ElasticNet model (alpha={best_params['alpha']}, l1_ratio={best_params['l1_ratio']}):")
print(f"  RMSE: {rmse}")
print(f"  MAE: {mae}")
print(f"  R2: {r2}")


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters found:  {'alpha': 0.5, 'l1_ratio': 0.7}
Best ElasticNet model (alpha=0.5, l1_ratio=0.7):
  RMSE: 2.0378528536041216
  MAE: 1.678142557398358
  R2: 0.41368725788614247


## Result of Grid Search

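* Fitting 5 folds for each of 16 candidates, totalling 80 fits
* Best parameters found: `{'alpha': 0.5, 'l1_ratio': 0.7}`
* Best ElasticNet model (alpha=0.5, l1_ratio=0.7), evaluated on the held-out 25% test split:
  * RMSE: 2.0378528536041216
  * MAE: 1.678142557398358
  * R2: 0.41368725788614247
* Interpretation: an R2 of about 0.41 means the model explains roughly 41% of the variance in HbA1c on the test split.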