In [1]:
!pip install numpy pandas scikit-learn xgboost



In [6]:
import pandas as pd
import xgboost as xgb
from scipy.stats import randint, uniform
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [5]:
# Names of the columns selected from the data frame as model inputs.
features = [
    'circumplex.valence',
    'appCat.weather',
    'circumplex.arousal',
    'appCat.entertainment',
    'appCat.travel',
    'appCat.utilities',
    'appCat.builtin',
    'appCat.communication',
    'sms',
    'screen',
    'activity',
]

# Load the imputed data set. `id` and `time` are read as index levels and then
# restored as ordinary leading columns; `time` is parsed to datetime.
df = (
    pd.read_csv('data/data_imputed.csv', index_col=['id', 'time'])
    .reset_index()
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    # The target below is built with a per-id shift, which pairs each row with the
    # row that FOLLOWS it. That is only "the next observation" if rows are in
    # chronological order within each id, so enforce the order explicitly.
    # A stable sort leaves already-ordered data untouched.
    .sort_values(['id', 'time'], kind='stable')
)

# Target: the mood recorded in the following row of the same id.
# NOTE(review): this equals "next day's mood" only if the file holds exactly one row
# per id per consecutive day — confirm against the imputation step.
df['mood_next_day'] = df.groupby('id')['mood'].shift(-1)

# The last row of every id has no following row, hence no target; discard those rows.
df = df.dropna(subset=['mood_next_day'])

# Define features and target
X = df[features]
y = df['mood_next_day']

# Split train and test data (80 % / 20 %, shuffled with a fixed seed).
# NOTE(review): a shuffled split can place neighbouring rows of the same id on both
# sides of the split; consider a time- or id-based split if that matters here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: statistics are fitted on the training part only and then
# applied unchanged to the test part, so no test information reaches the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Base estimator; the hyperparameters listed below are chosen by the random search.
model = xgb.XGBRegressor()

# Sampling distributions for the search. scipy conventions:
#   randint(low, high)  -> integers in [low, high)   (upper bound is EXCLUDED)
#   uniform(loc, scale) -> floats in [loc, loc + scale]
param_dist = {
    'max_depth': randint(3, 10),            # integers 3..9
    'learning_rate': uniform(0.01, 0.2),    # floats in [0.01, 0.21]
    'n_estimators': randint(50, 300),       # integers 50..299
    'subsample': uniform(0.6, 0.4),         # floats in [0.6, 1.0]
    'colsample_bytree': uniform(0.6, 0.4),  # floats in [0.6, 1.0]
}

# Configure RandomizedSearchCV: 100 sampled candidates, 3-fold cross-validation,
# ranked by negative MSE (scikit-learn maximises scores, hence the sign).
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
    verbose=1,
)
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

# Evaluate the selected model on the held-out test data.
# `mean_squared_error` is imported in the notebook's import cell.
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("MSE: ", mse)

print("Best parameters found: ", random_search.best_params_)


Fitting 3 folds for each of 100 candidates, totalling 300 fits
MSE:  0.5734243774467896
Best parameters found:  {'colsample_bytree': 0.7272013899887455, 'learning_rate': 0.03201038490553535, 'max_depth': 3, 'n_estimators': 76, 'subsample': 0.7708431154505025}
