## Load data

In [10]:
# Read the three pickled frames in the order 8, 7, 6 and bind them to
# df8, df7 and df6 respectively.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
_loaded_frames = []
for _pickle_path in ('merged_df_8.pkl', 'merged_df_7.pkl', 'merged_df_6.pkl'):
    with open(_pickle_path, 'rb') as file:
        _loaded_frames.append(pickle.load(file))
df8, df7, df6 = _loaded_frames

In [11]:
def createdataset(df, timebucket):
    """Aggregate rows into demand counts per (hex_id, date, time bucket).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``hour_of_day``, ``date``, ``hex_id`` and the weather
        columns listed in ``weather_columns`` below. An existing ``demand``
        column, if present, is replaced. The frame is not modified.
    timebucket : int
        Bucket width in hours; used as the step of the bin edges
        ``-1, -1 + timebucket, ...`` (all edges below 25).

    Returns
    -------
    pandas.DataFrame
        One row per distinct (hex_id, date, time_bucket) combination (the
        first such row of the input is kept), with the weather columns
        replaced by their mean over (time_bucket, date), a ``demand`` column
        holding the number of input rows in that combination, and without
        the ``hour_of_day`` column. The index is not reset.
    """
    weather_columns = [
        'temperature',
        'dew_point',
        'humidity',
        'wind_speed',
        'wind_gust',
        'pressure',
        'precipitation_rate',
    ]
    cell_keys = ['hex_id', 'date', 'time_bucket']

    # Work on a copy so the caller's frame is left untouched.
    df_copy = df.copy()

    # Intervals are right-closed, so starting the edges at -1 puts hour 0 in
    # bucket 0. Hours beyond the last edge (possible when timebucket does not
    # divide 24) get a NaN bucket.
    bin_edges = list(range(-1, 25, timebucket))
    df_copy['time_bucket'] = pd.cut(df_copy['hour_of_day'], bins=bin_edges, labels=False)

    # Average the weather readings within each (time bucket, date) group,
    # computing the grouping once for all weather columns.
    bucket_means = df_copy.groupby(['time_bucket', 'date'])[weather_columns].transform('mean')
    for column in weather_columns:
        df_copy[column] = bucket_means[column]

    # Demand is the number of rows per (date, time bucket, hexagon).
    demand = df_copy.groupby(['date', 'time_bucket', 'hex_id']).size().reset_index(name='demand')

    # Replace any pre-existing demand column with the aggregated one, then
    # keep a single row per (hexagon, date, time bucket).
    df_copy = df_copy.drop(columns=['demand'], errors='ignore')
    df_copy = df_copy.merge(demand, on=cell_keys, how='left')
    df_copy = df_copy.drop_duplicates(subset=cell_keys)

    # After de-duplication hour_of_day only reflects the first row of each
    # bucket, so drop it (the original discarded the result of this drop).
    df_copy = df_copy.drop(columns=['hour_of_day'])
    return df_copy

## Training

In [9]:
def trainsvmmodel(X, y, maxiter, splitratio, kernel, random_state=None):
    """Grid-search an SVR pipeline and evaluate the best model on a hold-out set.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. ``hex_id`` is one-hot encoded; every other column
        except ``date`` is standard-scaled.
    y : array-like
        Regression target aligned with ``X``.
    maxiter : int
        ``max_iter`` passed to ``SVR``.
    splitratio : float
        ``test_size`` passed to ``train_test_split``.
    kernel : str
        SVR kernel name. ``'lin'`` is accepted as an alias for ``'linear'``.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``; ``None`` keeps the split
        unseeded, as before.

    Returns
    -------
    tuple
        ``(y_pred, y_test, r2, mse, mae, best_params)`` where the metrics are
        computed on the hold-out set and ``best_params`` is the grid-search
        result.
    """
    # SVR only knows 'linear'; callers in this file label that kernel 'lin'.
    if kernel == 'lin':
        kernel = 'linear'

    # 1) train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=splitratio, random_state=random_state
    )

    # 2) scale and encoder
    scaled_columns = list(X.columns.difference(['hex_id', 'date']))
    column_transformer = ColumnTransformer(
        transformers=[
            ('hex', OneHotEncoder(handle_unknown='ignore'), ['hex_id']),
            ('scale', StandardScaler(), scaled_columns),
        ],
        remainder='drop',
    )
    svm_reg = make_pipeline(column_transformer, SVR(kernel=kernel, max_iter=maxiter))

    # 3) Grid search for best parameters (gamma is not tuned for the linear kernel)
    param_grid = {
        'svr__C': [0.1, 1, 10, 100, 1000],
        'svr__epsilon': [0.0001, 0.001, 0.1, 0.2, 0.5, 0.9],
    }
    if kernel != 'linear':
        param_grid['svr__gamma'] = ['auto', 'scale']

    grid_search = GridSearchCV(svm_reg, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_

    # 4) + 5) best model with optimal parameters.
    # With the default refit=True, GridSearchCV has already refit the whole
    # pipeline (encoder + scaler + SVR) on X_train using best_params. Using it
    # keeps preprocessing attached to the model; the original built a bare SVR
    # and fit it on an undefined X_train_scaled.
    best_model = grid_search.best_estimator_

    # 6) evaluate model
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
    r2 = r2_score(y_test, y_pred)  # R-squared
    mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error

    print(f'Mean Squared Error (MSE): {mse:.4f}')
    print(f'R-squared (R2): {r2:.4f}')
    print(f'Mean Absolute Error (MAE): {mae:.4f}')

    return y_pred, y_test, r2, mse, mae, best_params

In [None]:
iterations = 1000

# Result stores, one dict per metric and kernel, keyed by the run label
# 'h:<time_bucket>_res:<resolution>(_<kernel>)'.
pred = {}
true = {}
mse = {}
mae = {}
r2 = {}
bp = {}

pred_lin = {}
true_lin = {}
mse_lin = {}
mae_lin = {}
r2_lin = {}
bp_lin = {}

pred_poly = {}
true_poly = {}
mse_poly = {}
mae_poly = {}
r2_poly = {}
bp_poly = {}

kernels = ['rbf', 'lin', 'poly']

# Source frame per hexagon resolution.
frames_by_resolution = {6: df6, 7: df7, 8: df8}

# Stores per kernel label, in the order (pred, true, mse, mae, r2, best params).
stores_by_kernel = {
    'rbf': (pred, true, mse, mae, r2, bp),
    'lin': (pred_lin, true_lin, mse_lin, mae_lin, r2_lin, bp_lin),
    'poly': (pred_poly, true_poly, mse_poly, mae_poly, r2_poly, bp_poly),
}

for kernel in kernels:
    # 'lin' is only the label used in the result keys; SVR names this kernel 'linear'.
    svr_kernel = 'linear' if kernel == 'lin' else kernel
    for time_bucket in [24, 6, 2, 1]:
        for resolution in [6, 7, 8]:
            columnname = f'h:{time_bucket}_res:{resolution}(_{kernel})'
            print(columnname)
            taxi_df = createdataset(frames_by_resolution[resolution], timebucket=time_bucket)

            y = taxi_df['demand']
            # Drop the target as well: leaving 'demand' in X leaks the label
            # into the scaled feature set.
            X = taxi_df.drop(['date', 'time_bucket', 'demand'], axis=1)

            y_p, y_t, r2_v, mse_v, mae_v, best_params_v = trainsvmmodel(
                X, y, maxiter=iterations, splitratio=0.2, kernel=svr_kernel
            )

            # Store this run's scalar metrics (mse_v / mae_v, not the rbf dicts).
            run_values = (y_p, y_t, mse_v, mae_v, r2_v, best_params_v)
            for store, value in zip(stores_by_kernel[kernel], run_values):
                store[columnname] = value
        

In [None]:
# Collect one summary row per (kernel, time bucket, resolution) run.
data = []

# Result dicts per kernel label, in the order (r2, mse, mae, predictions, truth).
summary_stores = {
    'lin': (r2_lin, mse_lin, mae_lin, pred_lin, true_lin),
    'rbf': (r2, mse, mae, pred, true),
    'poly': (r2_poly, mse_poly, mae_poly, pred_poly, true_poly),
}

# Loop through the kernels
for kernel in kernels:
    r2_store, mse_store, mae_store, pred_store, true_store = summary_stores[kernel]
    # Loop through time buckets
    for time_bucket in [24, 6, 2, 1]:
        for resolution in [6, 7, 8]:
            columnname = f'h:{time_bucket}_res:{resolution}(_{kernel})'

            # Per-run values get their own names: assigning to `mse`, `mae`
            # or `r2_score` here would overwrite the result dicts and the
            # r2_score function name used by trainsvmmodel.
            run_r2 = r2_store[columnname]
            run_mse = mse_store[columnname]
            run_mae = mae_store[columnname]
            test_mean = np.mean(pred_store[columnname])  # mean of the predictions
            true_mean = np.mean(true_store[columnname])  # mean of the ground truth
            test_var = np.var(pred_store[columnname])  # variance of the predictions
            true_var = np.var(true_store[columnname])  # variance of the ground truth

            data.append({
                'Kernel': kernel,
                'Hexagon resolution': f'H3-{resolution}',
                'Timebucket': time_bucket,
                'R2-score': run_r2,
                'MSE': run_mse,
                'MAE': run_mae,
                'Test_mean': test_mean,
                'Truth_mean': true_mean,
                'Test_variance': test_var,
                'Truth_var': true_var,
            })

# Create the DataFrame
svmresults = pd.DataFrame(data)


# Display the DataFrame
svmresults