In [11]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [12]:
df = pd.read_csv('data/csv/user_data.csv') 
df.columns

Index(['id', 'age', 'gender', 'race_ethnicity', 'socio_economic_status',
       'urban_rural', 'parental_involvement', 'hobbies', 'Openness',
       'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism',
       'average_positive_probability_before', 'count_id_before',
       'average_positive_probability_after', 'count_id_after', 'diff',
       'mental_health_score_before', 'mental_health_score_after',
       'score_change', 'gratitude', 'journals', 'sunny', 'cloudy', 'rainy',
       'snowy', 'windy', 'exercise', 'movie_tv', 'gaming', 'reading',
       'instrument', 'walk', 'music', 'drawing', 'class', 'study', 'homework',
       'exam', 'sleep'],
      dtype='object')

In [13]:

# Load your data into a DataFrame
# df = pd.read_csv('your_data.csv')

# Drop unnecessary columns for modeling
removed_cols = ['id', 'hobbies', 'average_positive_probability_before', 
                'average_positive_probability_after', 'diff','score_change', 
                'count_id_after','count_id_before','urban_rural','race_ethnicity']
df = df.drop(columns=removed_cols)
df.shape

(300, 31)

In [14]:
df = df.apply(pd.to_numeric, errors='coerce')  # Convert all columns to numeric
df = df.fillna(df.mean())  # Impute missing values with mean

# Define categorical columns
categorical_columns = ['gender', 'socio_economic_status', 'parental_involvement', 'Openness',
                       'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']

# Convert categorical columns to one-hot encoding
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Define features (X) and target (y)
X = df.drop(columns=['mental_health_score_after'])
y = df['mental_health_score_after']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Initialize XGBoost with default parameters
default_xgb = XGBRegressor(random_state=42)
# Fit the model to the training data
default_xgb.fit(X_train, y_train)


In [18]:
# Predict on the test data
y_pred = default_xgb.predict(X_test)

# Calculate Mean Squared Error and R^2 score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R-squared (R2): {r2:.2f}')

Mean Squared Error (MSE): 131.81
R-squared (R2): 0.78


In [19]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'gamma': [0, 0.1, 0.3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=XGBRegressor(random_state=42),
                           param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best parameters found by GridSearchCV: {best_params}')
print(f'Best score (negative MSE): {best_score:.2f}')

# Get the best estimator
best_xgb = grid_search.best_estimator_

# Predict using the best estimator
y_pred_tuned = best_xgb.predict(X_test)

# Evaluate tuned model performance
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
print(f'Tuned model MSE: {mse_tuned:.2f}')

Fitting 5 folds for each of 6561 candidates, totalling 32805 fits
Best parameters found by GridSearchCV: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 0.8}
Best score (negative MSE): -90.73
Tuned model MSE: 108.31


In [20]:
# Predict on the test data
y_pred = best_xgb.predict(X_test)

# Calculate Mean Squared Error and R^2 score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R-squared (R2): {r2:.2f}')

Mean Squared Error (MSE): 108.31
R-squared (R2): 0.82
