<a href="https://colab.research.google.com/github/ryeleap/HealthTrackerLLM/blob/main/healthyDay.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
import pickle
import gspread
from google.colab import auth
from google.auth import default

# Authenticate and create the client
# Run the Colab authentication helper first; presumably this triggers the
# interactive Google sign-in for the notebook user (TODO confirm).
auth.authenticate_user()
# `default()` returns a tuple; only the first element (the credentials) is
# kept, the second is discarded.
credentials, _ = default()
# gspread client used further down to open and update the spreadsheet.
gc = gspread.authorize(credentials)

# Generate balanced simulated data
# Seed NumPy's global RNG so the simulated data set is reproducible.
np.random.seed(42)
n_per_class = 500  # rows drawn for each of the two simulated groups

# The draw order (healthy group first; steps, sleep_hours, water_liters within
# each group) is kept fixed so the seeded random stream produces the same raw
# samples on every run.
#
# A normal distribution has unbounded tails, so a raw draw can be negative
# (most likely for the low-activity step counts). Negative steps, hours or
# litres are physically impossible, so every feature is clipped at 0.
data_healthy = {
    'steps': np.clip(
        np.random.normal(loc=10000, scale=2000, size=n_per_class), 0, None
    ).astype(int),
    'sleep_hours': np.clip(
        np.random.normal(loc=8, scale=0.5, size=n_per_class), 0, None
    ),
    'water_liters': np.clip(
        np.random.normal(loc=2.5, scale=0.5, size=n_per_class), 0, None
    ),
}
data_unhealthy = {
    'steps': np.clip(
        np.random.normal(loc=5000, scale=2000, size=n_per_class), 0, None
    ).astype(int),
    'sleep_hours': np.clip(
        np.random.normal(loc=6, scale=1.5, size=n_per_class), 0, None
    ),
    'water_liters': np.clip(
        np.random.normal(loc=1.5, scale=0.5, size=n_per_class), 0, None
    ),
}
df_healthy = pd.DataFrame(data_healthy)
df_unhealthy = pd.DataFrame(data_unhealthy)
# Provisional per-group labels: 1 for the healthy group, 0 for the unhealthy one.
df_healthy['healthy'] = 1
df_unhealthy['healthy'] = 0

# Stack both groups into one frame with a fresh 0..N-1 index.
df = pd.concat([df_healthy, df_unhealthy], ignore_index=True)

# Define the classification function
def classify_day(row):
    """Label one day's record as healthy (1) or not healthy (0).

    A day is healthy only when all three rules hold: at least 6000 steps,
    between 7 and 9 hours of sleep (inclusive), and between 2 and 3 litres
    of water (inclusive).

    Args:
        row: Mapping-like record indexed by the keys 'steps',
            'sleep_hours' and 'water_liters' (e.g. a DataFrame row).

    Returns:
        1 if every rule is satisfied, otherwise 0.
    """
    # Checks run in a fixed order and stop at the first failure, so later
    # keys are only read when the earlier rules passed.
    if not row['steps'] >= 6000:
        return 0
    if not 7 <= row['sleep_hours'] <= 9:
        return 0
    if not 2 <= row['water_liters'] <= 3:
        return 0
    return 1

# Apply the classification function
# Relabel every simulated row with the rule-based classifier. This overwrites
# the per-group 'healthy' labels assigned when the two groups were generated,
# so the target is a deterministic function of the three features.
df['healthy'] = df.apply(classify_day, axis=1)

# Split the data into features and target
X = df[['steps', 'sleep_hours', 'water_liters']]
y = df['healthy']

# Split the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fine-tune the Random Forest model
# 3 x 3 x 3 x 3 = 81 hyper-parameter combinations, each scored with 5-fold
# cross-validation on the training split.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters and the final model
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
# GridSearchCV refits by default (refit=True): after the search it trains a
# clone of `rf` (same random_state) with the best parameters on the whole
# training split. Reusing that estimator avoids fitting an identical model a
# second time.
rf_final = grid_search.best_estimator_

# Predict on the testing set
y_pred_rf = rf_final.predict(X_test)

# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
cm_rf = confusion_matrix(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)

# Print evaluation metrics for the Random Forest model
print(f'Random Forest Accuracy: {accuracy_rf:.2f}')
print("Random Forest Confusion Matrix:\n", cm_rf)
print(f'Random Forest Precision: {precision_rf:.2f}')
print(f'Random Forest Recall: {recall_rf:.2f}')

# Check feature importance
feature_importances = rf_final.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
# Most important feature first.
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importances:")
print(importance_df)

# Save the trained Random Forest model
# NOTE: a pickle file must only be loaded from a trusted source; unpickling
# can execute arbitrary code.
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(rf_final, model_file)

print("Random Forest model saved successfully.")

# Load data from Google Sheet
# The gspread client `gc` authorised at the top of the script is reused here;
# authorising a second time with the same credentials is unnecessary.
worksheet = gc.open('healthyDaySheet').sheet1
rows = worksheet.get_all_values()
# First sheet row is the header; the remaining rows are the records.
data = pd.DataFrame.from_records(rows[1:], columns=rows[0])

# Display the loaded data to check for any issues
print("Loaded data from Google Sheets:")
print(data)

# Convert the three feature columns to numbers; the other columns are left
# untouched. Cells that cannot be parsed become NaN (errors='coerce').
data['steps'] = pd.to_numeric(data['steps'], errors='coerce')
data['sleep_hours'] = pd.to_numeric(data['sleep_hours'], errors='coerce')
data['water_liters'] = pd.to_numeric(data['water_liters'], errors='coerce')

# Show the resulting dtypes so failed conversions are easy to spot
print("Data types after conversion:")
print(data.dtypes)

# Drop every row that contains a NaN (i.e. an unparseable feature value)
print("Data with NaN values removed:")
data = data.dropna()
print(data)

# Manually classify the sample data
data['manual_classification'] = data.apply(classify_day, axis=1)

# Debugging: Verify manual classification against criteria
print("Manual classification and data:")
print(data[['steps', 'sleep_hours', 'water_liters', 'manual_classification']])

# Predict using the Random Forest model on the three feature columns only
X_real = data[['steps', 'sleep_hours', 'water_liters']]
real_predictions = rf_final.predict(X_real)
data['results'] = real_predictions.astype(int)

# Debugging: Print data to verify the results column is updated
print("Data with predictions and manual classifications:")
print(data[['steps', 'sleep_hours', 'water_liters', 'results', 'manual_classification']])

# Convert DataFrame back to a list of lists (header row first) for updating
# the Google Sheet
updated_data = [data.columns.values.tolist()] + data.values.tolist()
# Keyword arguments are used because gspread changed the positional order of
# `update` (values first, range second) and warns about the old
# `update('A1', values)` call; the keyword form is valid under both orders.
# NOTE(review): rows removed by dropna() make `updated_data` shorter than the
# sheet, so trailing sheet rows would presumably keep their old contents --
# confirm whether that is acceptable.
worksheet.update(range_name='A1', values=updated_data)

print("Google Sheet updated successfully.")


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Accuracy: 1.00
Random Forest Confusion Matrix:
 [[142   0]
 [  0  58]]
Random Forest Precision: 1.00
Random Forest Recall: 1.00
Feature Importances:
        Feature  Importance
2  water_liters    0.553505
1   sleep_hours    0.227556
0         steps    0.218939
Random Forest model saved successfully.
Loaded data from Google Sheets:
            Timestamp steps sleep_hours water_liters results  \
0              SAMPLE  5000         8.2            2       0   
1              SAMPLE  9000         7.5            3       1   
2              SAMPLE  8500           8          3.2       0   
3              SAMPLE  8500           8          2.8       1   
4  5/22/2024 19:15:29  8200         8.2          2.8           

  manual_classification  
0                     0  
1                     1  
2               

  worksheet.update('A1', updated_data)


Google Sheet updated successfully.
