In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings

In [3]:
# Read the patient table from CSV into `df_patients`.
# NOTE(review): the preview of this frame shows an `Unnamed: 0` column, which is
# presumably an index that was written out when the CSV was saved. Consider
# `index_col=0` once it is confirmed that no other cell relies on that column.
df_patients = pd.read_csv(filepath_or_buffer="../Data/dim_patients_final_rev01.csv")

In [4]:
# Read the visit table from CSV into `df_visits`; this is the frame the
# modelling cell draws `patient_id`, `visited_date`, `sugar` and `hba1c` from.
# NOTE(review): the preview of this frame shows an `Unnamed: 0` column, which is
# presumably a saved index. Consider `index_col=0` after confirming nothing
# else depends on it.
df_visits = pd.read_csv(filepath_or_buffer="../Data/fact_visits_final_rev01.csv")

In [5]:
# Preview the first five patient rows to check the columns that were loaded.
# NOTE(review): this preview includes identifier and contact columns
# (aadhar_number, mobile_number, email, names). Confirm the data is synthetic,
# or clear this output, before sharing the notebook.
df_patients.head(n=5)

Unnamed: 0,patient_id,date_of_birth,aadhar_number,mobile_number,email,first_name,last_name,full_name,gender,height,weight,bmi,area,latitude,longitude
0,1000,1957-07-02,254734595276,912998937864,achang@example.org,Eswar,Rajendran,Mr. Eswar Rajendran,Male,165,94,34.5,Alwarpet,13.033,80.251
1,1001,1980-09-16,450990877668,913770928150,tammy76@example.com,Indra,Chokkalingam,Ms. Indra Chokkalingam,Female,151,74,32.5,Guindy,13.005,80.219
2,1002,1969-12-04,585965453521,911664078988,nhoward@example.net,Swathi,Iyyappan,Ms. Swathi Iyyappan,Female,162,88,33.5,Adyar,13.006,80.257
3,1003,1979-11-19,842467933153,911027314078,juancampos@example.net,Gopal,Palanisamy,Mr. Gopal Palanisamy,Male,179,93,29.0,Egmore,13.078,80.258
4,1004,1997-01-04,665339009112,918904122631,vanessa89@example.org,Fathima,Thirunavukkarasu,Ms. Fathima Thirunavukkarasu,Female,151,99,43.4,Nungambakkam,13.059,80.239


In [6]:
# Preview the first five visit rows; a patient_id can appear on several rows
# (one row per visit date).
df_visits.head(n=5)

Unnamed: 0,patient_id,visited_date,sugar,hba1c,systolic_pressure,diastolic_pressure
0,1000,2023-04-19,262,12.0,126,96
1,1000,2023-08-14,301,13.3,124,83
2,1001,2023-04-13,157,6.8,102,99
3,1002,2023-04-09,445,13.5,127,97
4,1002,2023-07-24,550,12.2,105,99


In [7]:


# Purpose: predict `hba1c` from the visit table with an ElasticNet regressor,
# tuning `alpha` and `l1_ratio` by 5-fold grid search on a 75% training split
# and reporting RMSE / MAE / R2 on the remaining 25%.

# NOTE(review): this silences every warning raised in this process for the rest
# of the kernel session. It is kept so that later cells behave as before, but
# consider removing it or narrowing it to a specific warning category.
warnings.filterwarnings("ignore")


# --- Feature preparation ----------------------------------------------------
# Build the modelling frame as one non-mutating chain. The original code took a
# column subset of `df_visits` and then assigned new columns onto it, which is
# the pattern that triggers pandas' SettingWithCopyWarning (hidden by the
# filter above). `.assign` always returns a new frame, so `df_visits` is never
# touched and re-running this cell gives the same result.
df = (
    df_visits[['patient_id', 'visited_date', 'sugar', 'hba1c']]
    # Parse the visit date so the `.dt` accessors below are available.
    .assign(visited_date=lambda frame: pd.to_datetime(frame['visited_date']))
    # Split the date into numeric calendar parts the linear model can consume.
    .assign(
        year=lambda frame: frame['visited_date'].dt.year,
        month=lambda frame: frame['visited_date'].dt.month,
        day=lambda frame: frame['visited_date'].dt.day,
    )
    # The raw datetime column is not a numeric feature; drop it.
    .drop(columns=['visited_date'])
)

# Features are every remaining column except the target.
# NOTE(review): `patient_id` is kept as a numeric feature to preserve the
# reported results; an identifier is presumably not a meaningful regressor --
# confirm whether it should be dropped.
X = df.drop(columns=['hba1c'])
y = df['hba1c']

# --- Train / test split -----------------------------------------------------
# Fixed seed so the split (and therefore the reported metrics) is reproducible.
# NOTE(review): the split is row-level while patients can have several visits,
# so the same patient may land in both train and test -- confirm that this is
# acceptable, or switch to a group-aware split.
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)

# --- Hyper-parameter search -------------------------------------------------
model = ElasticNet()

# 4 x 4 = 16 candidate (alpha, l1_ratio) combinations.
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 5.0],
    'l1_ratio': [0.1, 0.5, 0.7, 0.9]
}

# 5-fold CV over the grid; candidates are ranked by negative MSE (higher is
# better), using all available cores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

grid_search.fit(train_x, train_y)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# No extra training happens here: with GridSearchCV's default `refit=True`,
# `best_estimator_` is already fitted on the whole training split using the
# best parameters.
best_model = grid_search.best_estimator_

# --- Evaluation on the held-out test split ----------------------------------
predicted_values = best_model.predict(test_x)

rmse = np.sqrt(mean_squared_error(test_y, predicted_values))
mae = mean_absolute_error(test_y, predicted_values)
r2 = r2_score(test_y, predicted_values)

print(f"Best ElasticNet model (alpha={best_params['alpha']}, l1_ratio={best_params['l1_ratio']}):")
print(f"  RMSE: {rmse}")
print(f"  MAE: {mae}")
print(f"  R2: {r2}")


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters found:  {'alpha': 0.5, 'l1_ratio': 0.7}
Best ElasticNet model (alpha=0.5, l1_ratio=0.7):
  RMSE: 2.0109321397445847
  MAE: 1.6713187671935066
  R2: 0.4214615482514946


## Result of Grid Search

* Fitting 5 folds for each of 16 candidates, totalling 80 fits
* Best parameters found: `{'alpha': 0.5, 'l1_ratio': 0.7}`
* Best ElasticNet model (`alpha=0.5`, `l1_ratio=0.7`), evaluated on the 25% hold-out test set:
  * RMSE: 2.0109321397445847
  * MAE: 1.6713187671935066
  * R2: 0.4214615482514946