In [1]:
!pip install numpy pandas scikit-learn xgboost



In [18]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint, uniform
from sklearn.metrics import mean_squared_error

### Regression

In [19]:
# Final features from feature selection
features = ['circumplex.valence','appCat.weather','circumplex.arousal','appCat.entertainment',
 'appCat.travel','appCat.utilities','appCat.builtin','appCat.communication','sms','screen','activity']
N_LAGS = 5  # number of previous rows (per id) of every feature exposed to the model

# Read data; 'id' and 'time' are turned back into ordinary columns right after loading
df = pd.read_csv('data/data_imputed.csv', index_col=['id', 'time']).reset_index()
df['time'] = pd.to_datetime(df['time'])

# groupby(...).shift(...) below, and the positional train/test split and TimeSeriesSplit
# used later, all rely on rows being in chronological order. Enforce it explicitly; the
# stable sort leaves an already-ordered file (and the relative order of ties) untouched.
df = df.sort_values('time', kind='stable')

# Shift the 'mood' column up within each id to get the target: the mood of the following row
# (assumes one row per id per day, so "following row" means "next day")
df['mood_next_day'] = df.groupby('id')['mood'].shift(-1)
# The last row of every id has no following mood and therefore no target
df = df.dropna(subset=['mood_next_day'])

# Build the lag columns (feature values 1..N_LAGS rows earlier within the same id).
# They are collected first and attached in a single concat instead of being inserted
# one by one, which avoids fragmenting the DataFrame.
grouped_by_id = df.groupby('id')
lagged = {
    f'{feature}_lag_{days}': grouped_by_id[feature].shift(days)
    for feature in features
    for days in range(1, N_LAGS + 1)
}
new_features = list(lagged)  # same order as before: feature-major, then lag
df = pd.concat([df, pd.DataFrame(lagged, index=df.index)], axis=1)

# Define features and target; only the lag columns are used as model inputs.
# The first N_LAGS rows of each id keep NaN in some lag columns (no history yet).
X = df[new_features]
y = df['mood_next_day']

In [20]:
# Inspect the lagged feature matrix. Rows at the start of an id's history have no
# earlier rows to shift from, so their lag columns are NaN.
X

Unnamed: 0,circumplex.valence_lag_1,circumplex.valence_lag_2,circumplex.valence_lag_3,circumplex.valence_lag_4,circumplex.valence_lag_5,appCat.weather_lag_1,appCat.weather_lag_2,appCat.weather_lag_3,appCat.weather_lag_4,appCat.weather_lag_5,...,screen_lag_1,screen_lag_2,screen_lag_3,screen_lag_4,screen_lag_5,activity_lag_1,activity_lag_2,activity_lag_3,activity_lag_4,activity_lag_5
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1231,-0.2,1.000000,1.000000,1.000000,0.00,16.395500,19.76160,0.0000,0.0,8.066,...,141.854550,92.197711,96.959106,88.374311,59.147956,0.027136,0.054654,0.129498,0.138811,0.104035
1232,0.4,0.000000,0.333333,0.250000,0.00,18.078550,19.76160,0.0000,0.0,8.066,...,113.607750,151.376919,81.583167,157.610275,189.994972,0.012704,0.115201,0.161470,0.014286,0.020587
1233,0.5,0.666667,1.000000,1.000000,0.80,16.395500,19.76160,0.0000,0.0,0.000,...,127.306139,90.136279,54.919488,185.398568,55.417750,0.098223,0.064399,0.002735,0.045963,0.011448
1234,-0.4,-0.200000,1.000000,1.000000,1.00,15.360083,16.39550,19.7616,0.0,0.000,...,136.143300,141.854550,92.197711,96.959106,88.374311,0.079299,0.027136,0.054654,0.129498,0.138811


In [21]:
# Positional hold-out: the first 80% of the rows are used for training,
# the remaining 20% for testing (no shuffling).
split_idx = int(0.8 * len(X))

X_train, y_train = X.iloc[:split_idx], y.iloc[:split_idx]
X_test, y_test = X.iloc[split_idx:], y.iloc[split_idx:]

# Standardise the features; the statistics are learned from the training rows only
# and then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [24]:
# Cross-validation scheme and base estimator for the hyperparameter search
tscv = TimeSeriesSplit(n_splits=5)
model = xgb.XGBRegressor()

# Sampling distributions for the search.
# scipy's randint(low, high) draws integers from [low, high) - the upper bound is excluded;
# uniform(loc, scale) draws floats from [loc, loc + scale].
param_dist = dict(
    max_depth=randint(3, 10),            # integers 3..9
    learning_rate=uniform(0.01, 0.2),    # floats in [0.01, 0.21]
    n_estimators=randint(50, 300),       # integers 50..299
    subsample=uniform(0.6, 0.4),         # floats in [0.6, 1.0]
    colsample_bytree=uniform(0.6, 0.4),  # floats in [0.6, 1.0]
)

# Randomized search: 100 sampled candidates, each scored by negative MSE on the CV folds
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=100,
    cv=tscv,
    scoring='neg_mean_squared_error',
    random_state=42,
    verbose=1,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

# Score the best candidate on the held-out test rows
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("MSE: ", mse)

print("Best parameters found: ", random_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
MSE:  0.46415740417465473
Best parameters found:  {'colsample_bytree': 0.6122000999756197, 'learning_rate': 0.017469637749842885, 'max_depth': 8, 'n_estimators': 64, 'subsample': 0.7440762565645052}


Output 1:  
tscv = TimeSeriesSplit(n_splits=3)
Fitting 3 folds for each of 100 candidates, totalling 300 fits
MSE:  0.46415740417465473
Best parameters found:  {'colsample_bytree': 0.6122000999756197, 'learning_rate': 0.017469637749842885, 'max_depth': 8, 'n_estimators': 64, 'subsample': 0.7440762565645052}

Output 2:
Fitting 5 folds for each of 100 candidates, totalling 500 fits
MSE:  0.46415740417465473
Best parameters found:  {'colsample_bytree': 0.6122000999756197, 'learning_rate': 0.017469637749842885, 'max_depth': 8, 'n_estimators': 64, 'subsample': 0.7440762565645052}