## Load data

In [15]:
pip install tqdm

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting tqdm
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m483.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading tqdm-4.66.4-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m644.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tqdm
[33m  DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import GridSearchCV
import pickle
import warnings
from tqdm import tqdm

# Filter out annoying warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [10]:
# Datasets are created in Preprocessing; one pickle file per hexagon resolution.
def _read_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


df8 = _read_pickle('merged_df_8.pkl')
df7 = _read_pickle('merged_df_7.pkl')
df6 = _read_pickle('merged_df_6.pkl')

In [11]:
# create datasets for different time buckets
def createdataset(df, timebucket):
    """Return a copy of ``df`` aggregated into ``timebucket``-hour time slots.

    For ``timebucket == 1`` the rows are kept as they are and the values of
    ``hour_of_day`` are copied into ``time_bucket``.  For larger buckets the
    hours are grouped into consecutive slots of ``timebucket`` hours (with a
    shorter final slot when ``timebucket`` does not divide 24), the weather
    columns are averaged per ``(time_bucket, date)``, ``demand`` is summed per
    ``(date, time_bucket, hex_id)`` and the first row of each such group is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``hour_of_day``, ``date``, ``hex_id``, ``demand`` and
        (for ``timebucket > 1``) the weather columns listed in
        ``weather_columns`` below.  The frame itself is not modified.
    timebucket : int
        Width of a time slot in hours, at least 1.

    Returns
    -------
    pandas.DataFrame
        New frame without ``hour_of_day`` and ``date`` and with a
        ``time_bucket`` column.

    Raises
    ------
    ValueError
        If ``timebucket`` is smaller than 1.
    """
    if timebucket < 1:
        raise ValueError(f'timebucket must be at least 1 hour, got {timebucket!r}')

    if timebucket == 1:
        # Hourly resolution: nothing to aggregate, the hour itself is the bucket.
        return (
            df.assign(time_bucket=df['hour_of_day'])
              .drop(columns=['hour_of_day', 'date'])
        )

    weather_columns = [
        'temperature', 'dew_point', 'humidity', 'wind_speed',
        'wind_gust', 'pressure', 'precipitation_rate',
    ]

    # Right-closed bins (-1, tb-1], (tb-1, 2*tb-1], ...  The explicit last edge
    # at 23 makes sure the late hours still get a bucket when `timebucket`
    # does not divide 24 (otherwise they would turn into NaN and break astype).
    bin_edges = list(range(-1, 23, timebucket)) + [23]

    df_copy = df.copy()
    df_copy['time_bucket'] = pd.cut(df_copy['hour_of_day'], bins=bin_edges, labels=False)
    df_copy = df_copy.drop(columns=['hour_of_day'])
    df_copy['time_bucket'] = df_copy['time_bucket'].astype(int)

    # Take mean of weather data by time bucket (same value for every hexagon)
    df_copy[weather_columns] = (
        df_copy.groupby(['time_bucket', 'date'])[weather_columns].transform('mean')
    )

    # Aggregate the demand per hexagon and keep one row per (hex, date, bucket)
    df_copy['demand'] = (
        df_copy.groupby(['date', 'time_bucket', 'hex_id'])['demand'].transform('sum')
    )
    # Chained, non-inplace drops: avoids mutating the slice returned by
    # drop_duplicates (a source of SettingWithCopyWarning).
    return (
        df_copy.drop_duplicates(subset=['hex_id', 'date', 'time_bucket'])
               .drop(columns=['date'])
    )


## Training
### Support Vector Regression without kernel

In [21]:
def trainsimple(X, y, test_size=0.2, random_state=42):
    """Fit a default ``LinearSVR`` on a train split and evaluate it on the test split.

    ``hex_id`` is one-hot encoded (categories unseen during fitting are
    ignored) and every other column of ``X`` is standardized before the
    regressor is fitted.  The three evaluation metrics are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must contain a ``hex_id`` column.
    y : array-like
        Target values aligned with ``X``.
    test_size : float, default 0.2
        Share of the rows held out for evaluation.
    random_state : int or None, default 42
        Seed used for the train/test split and for ``LinearSVR`` so that
        repeated runs give the same numbers.

    Returns
    -------
    tuple
        ``(y_pred, y_test, r2_value, mse_value, mae_value)``.
    """
    # 1) train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # 2) scale and encoder: everything except hex_id is treated as numeric
    numeric_columns = X.columns.difference(['hex_id'])
    column_transformer = ColumnTransformer(
        transformers=[
            ('hex', OneHotEncoder(handle_unknown='ignore'), ['hex_id']),
            ('scale', StandardScaler(), numeric_columns),
        ],
        remainder='drop',
    )

    # 3) Create the full pipeline with preprocessing and SVR
    pipeline = Pipeline([
        ('preprocessor', column_transformer),
        # all defaults except the seed, which makes the solver reproducible
        ('svr', LinearSVR(random_state=random_state)),
    ])

    # 4) Fit model
    pipeline.fit(X_train, y_train)

    # 5) Predict on the test set using the pipeline
    y_pred = pipeline.predict(X_test)

    # 6) Evaluate the model
    mse_value = mean_squared_error(y_test, y_pred)  # Mean Squared Error
    r2_value = r2_score(y_test, y_pred)  # R-squared
    mae_value = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error

    print(f'Mean Squared Error (MSE): {mse_value:.4f}')
    print(f'R-squared (R2): {r2_value:.4f}')
    print(f'Mean Absolute Error (MAE): {mae_value:.4f}')

    return y_pred, y_test, r2_value, mse_value, mae_value

In [22]:
# Result containers for the model without kernel, filled per run label
pred_nk, true_nk, mse_nk, mae_nk, r2_nk = {}, {}, {}, {}, {}

In [23]:
warnings.filterwarnings('ignore', 'Solver terminated early.*')

# trainsimple is called without extra arguments below, so these two settings
# are not consumed by this cell.
iterations = 1000
splitratio = 0.2

# Plain string label: a list here would end up verbatim in every run label
# (e.g. "h:24_res:6(_['noKernel'])") and in the 'Kernel' column of the table.
kernel = 'noKernel'

time_buckets = [24, 6, 2, 1]
resolutions = [6, 7, 8]
# Source frame for each hexagon resolution
frames_by_resolution = {6: df6, 7: df7, 8: df8}

total_iterations = len(time_buckets) * len(resolutions)

# Initialize tqdm progress bar
with tqdm(total=total_iterations, desc="Processing") as pbar:
    for time_bucket in time_buckets:
        for resolution in resolutions:
            columnname = f'h:{time_bucket}_res:{resolution}(_{kernel})'
            print(columnname)

            taxi_df = createdataset(frames_by_resolution[resolution], timebucket=time_bucket)

            y = taxi_df['demand']
            X = taxi_df.drop(['demand'], axis=1)

            y_p, y_t, r2_v, mse_v, mae_v = trainsimple(X, y)

            pred_nk[columnname] = y_p
            true_nk[columnname] = y_t
            mse_nk[columnname] = mse_v
            mae_nk[columnname] = mae_v
            r2_nk[columnname] = r2_v

            # Update progress bar
            pbar.update(1)

Processing:   0%|                                        | 0/12 [00:00<?, ?it/s]

h:24_res:6(_['noKernel'])


Processing:   8%|██▋                             | 1/12 [00:00<00:04,  2.27it/s]

Mean Squared Error (MSE): 3580184.3013
R-squared (R2): 0.0949
Mean Absolute Error (MAE): 252.3953
h:24_res:7(_['noKernel'])


Processing:  17%|█████▎                          | 2/12 [00:01<00:08,  1.16it/s]

Mean Squared Error (MSE): 591144.1725
R-squared (R2): 0.0905
Mean Absolute Error (MAE): 88.4934
h:24_res:8(_['noKernel'])


Processing:  25%|████████                        | 3/12 [00:02<00:09,  1.06s/it]

Mean Squared Error (MSE): 210487.2310
R-squared (R2): 0.0479
Mean Absolute Error (MAE): 71.0369
h:6_res:6(_['noKernel'])


Processing:  33%|██████████▋                     | 4/12 [00:03<00:06,  1.15it/s]

Mean Squared Error (MSE): 416904.5925
R-squared (R2): 0.0633
Mean Absolute Error (MAE): 82.1194
h:6_res:7(_['noKernel'])


Processing:  42%|█████████████▎                  | 5/12 [00:06<00:11,  1.64s/it]

Mean Squared Error (MSE): 69599.3025
R-squared (R2): 0.0644
Mean Absolute Error (MAE): 31.1490
h:6_res:8(_['noKernel'])


Processing:  50%|████████████████                | 6/12 [00:11<00:16,  2.80s/it]

Mean Squared Error (MSE): 21377.0985
R-squared (R2): 0.0601
Mean Absolute Error (MAE): 23.6118
h:2_res:6(_['noKernel'])


Processing:  58%|██████████████████▋             | 7/12 [00:15<00:15,  3.20s/it]

Mean Squared Error (MSE): 59545.0004
R-squared (R2): 0.0459
Mean Absolute Error (MAE): 33.3412
h:2_res:7(_['noKernel'])


Processing:  67%|█████████████████████▎          | 8/12 [00:32<00:30,  7.62s/it]

Mean Squared Error (MSE): 9880.4648
R-squared (R2): 0.0539
Mean Absolute Error (MAE): 13.0999
h:2_res:8(_['noKernel'])


Processing:  75%|████████████████████████        | 9/12 [00:54<00:36, 12.19s/it]

Mean Squared Error (MSE): 3950.5418
R-squared (R2): 0.0565
Mean Absolute Error (MAE): 11.0922
h:1_res:6(_['noKernel'])


Processing:  83%|█████████████████████████▊     | 10/12 [01:02<00:21, 10.84s/it]

Mean Squared Error (MSE): 14854.9475
R-squared (R2): 0.0486
Mean Absolute Error (MAE): 17.8951
h:1_res:7(_['noKernel'])


Processing:  92%|████████████████████████████▍  | 11/12 [01:43<00:19, 19.98s/it]

Mean Squared Error (MSE): 3615.9651
R-squared (R2): 0.0547
Mean Absolute Error (MAE): 8.7071
h:1_res:8(_['noKernel'])


Processing: 100%|███████████████████████████████| 12/12 [02:28<00:00, 12.35s/it]

Mean Squared Error (MSE): 1257.4846
R-squared (R2): 0.0605
Mean Absolute Error (MAE): 7.2505





In [26]:
# Collect one summary row per (time bucket, hexagon resolution) run.
# The metric values are read straight from the result dicts: binding them to a
# variable called `r2_score` would overwrite the imported
# sklearn.metrics.r2_score and break every later call of the training functions.
dataNK = []
for time_bucket in [24, 6, 2, 1]:
    for resolution in [6, 7, 8]:
        columnname = f'h:{time_bucket}_res:{resolution}(_{kernel})'
        predicted = pred_nk[columnname]
        observed = true_nk[columnname]
        dataNK.append({
            'Kernel': kernel,
            'Hexagon resolution': f'H3-{resolution}',
            'Time resolution': time_bucket,
            'R2-score': r2_nk[columnname],
            'MSE': mse_nk[columnname],
            'MAE': mae_nk[columnname],
            'Test_mean': np.mean(predicted),   # mean of the predictions
            'True_mean': np.mean(observed),    # mean of the held-out targets
            'Test_variance': np.var(predicted),
            'True_variance': np.var(observed),
        })

# Create the DataFrame
svmevalNK = pd.DataFrame(dataNK)


# Display the DataFrame
svmevalNK

Unnamed: 0,Kernel,Hexagon resolution,Time resolution,R2-score,MSE,MAE,Test_mean,True_mean,Test_variance,True_variance
0,[noKernel],H3-6,24,0.094888,3580184.0,252.395294,135.835221,350.254454,60142.589116,3955515.0
1,[noKernel],H3-7,24,0.090503,591144.2,88.493354,46.322433,117.614672,9787.117392,649968.5
2,[noKernel],H3-8,24,0.047897,210487.2,71.03686,32.155239,92.692197,2160.652756,221076.0
3,[noKernel],H3-6,6,0.063284,416904.6,82.119443,37.401069,98.230137,4351.854532,445070.2
4,[noKernel],H3-7,6,0.064433,69599.3,31.148972,13.614486,37.514489,747.405788,74392.65
5,[noKernel],H3-8,6,0.060074,21377.1,23.611751,12.495655,29.32911,487.801215,22743.39
6,[noKernel],H3-6,2,0.045902,59545.0,33.341213,13.746483,39.412219,404.031391,62409.72
7,[noKernel],H3-7,2,0.053928,9880.465,13.099885,6.131071,15.9081,91.581956,10443.67
8,[noKernel],H3-8,2,0.056499,3950.542,11.092191,6.182696,13.872935,78.689863,4187.111
9,[noKernel],H3-6,1,0.048621,14854.95,17.895093,8.216803,21.582058,121.415742,15614.13


### Complex Models with different kernels


In [19]:
def trainsvmmodel(X, y, maxiter, splitratio, kernel, random_state=42, n_jobs=None):
    """Grid-search an ``SVR`` with the given kernel and evaluate the best model.

    ``hex_id`` is one-hot encoded (categories unseen during fitting are
    ignored) and every other column of ``X`` is standardized.  ``C`` and
    ``epsilon`` (plus ``gamma`` for non-linear kernels) are tuned with 5-fold
    cross-validation on the training split; the refitted best estimator is
    scored on the held-out split and the three metrics are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must contain a ``hex_id`` column.
    y : array-like
        Target values aligned with ``X``.
    maxiter : int
        ``max_iter`` passed to ``SVR``.
    splitratio : float
        Share of the rows held out for evaluation.
    kernel : str
        Kernel name passed to ``SVR``; ``'linear'`` uses a grid without ``gamma``.
    random_state : int or None, default 42
        Seed of the train/test split, matching the seed used by ``trainsimple``.
    n_jobs : int or None, default None
        Forwarded to ``GridSearchCV`` to run the search in parallel.

    Returns
    -------
    tuple
        ``(y_pred, y_test, r2_value, mse_value, mae_value)``.
    """
    # 1) train test split (seeded so that repeated runs are comparable)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=splitratio, random_state=random_state
    )

    # 2) scale and encoder: everything except hex_id is treated as numeric
    numeric_columns = X.columns.difference(['hex_id'])
    column_transformer = ColumnTransformer(
        transformers=[
            ('hex', OneHotEncoder(handle_unknown='ignore'), ['hex_id']),
            ('scale', StandardScaler(), numeric_columns),
        ],
        remainder='drop',
    )

    # 3) Create the full pipeline with preprocessing and SVR
    pipeline = Pipeline([
        ('preprocessor', column_transformer),
        ('svr', SVR(kernel=kernel, max_iter=maxiter)),
    ])

    # 4) Grid search for best parameters; gamma only matters for non-linear kernels
    param_grid = {
        'svr__C': [0.1, 1, 10, 100, 1000],
        'svr__epsilon': [0.0001, 0.001, 0.1, 0.2, 0.5, 0.9],
    }
    if kernel != 'linear':
        param_grid['svr__gamma'] = ['auto', 'scale']

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)

    # 5) best model with optimal parameters (refitted on the full training split)
    best_model = grid_search.best_estimator_

    # 6) evaluate model
    y_pred = best_model.predict(X_test)
    mse_value = mean_squared_error(y_test, y_pred)  # Mean Squared Error
    r2_value = r2_score(y_test, y_pred)  # R-squared
    mae_value = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error

    print(f'Mean Squared Error (MSE): {mse_value:.4f}')
    print(f'R-squared (R2): {r2_value:.4f}')
    print(f'Mean Absolute Error (MAE): {mae_value:.4f}')

    return y_pred, y_test, r2_value, mse_value, mae_value

In [8]:
# Result containers, one group of five dicts per kernel, filled per run label
pred_rbf, true_rbf, mse_rbf, mae_rbf, r2_rbf = {}, {}, {}, {}, {}

pred_lin, true_lin, mse_lin, mae_lin, r2_lin = {}, {}, {}, {}, {}

pred_poly, true_poly, mse_poly, mae_poly, r2_poly = {}, {}, {}, {}, {}

In [20]:
warnings.filterwarnings('ignore', 'Solver terminated early.*')

iterations = 1000
splitratio = 0.2

kernels = ['rbf', 'linear', 'poly']

total_iterations = len(kernels) * len([24, 6, 2, 1]) * len([6, 7, 8])
count = 1

# kernel name -> (predictions, ground truth, MSE, MAE, R2) containers
result_stores = {
    'rbf': (pred_rbf, true_rbf, mse_rbf, mae_rbf, r2_rbf),
    'linear': (pred_lin, true_lin, mse_lin, mae_lin, r2_lin),
    'poly': (pred_poly, true_poly, mse_poly, mae_poly, r2_poly),
}

# One grid search per kernel / time bucket / hexagon resolution combination
with tqdm(total=total_iterations, desc="Processing") as pbar:
    for kernel in kernels:
        for time_bucket in [24, 6, 2, 1]:
            for resolution in [6, 7, 8]:
                columnname = f'h:{time_bucket}_res:{resolution}(_{kernel})'
                print(columnname)

                # pick the source frame that belongs to the hexagon resolution
                source_df = df6 if resolution == 6 else (df7 if resolution == 7 else df8)
                taxi_df = createdataset(source_df, timebucket=time_bucket)

                y = taxi_df['demand']
                X = taxi_df.drop(['demand'], axis=1)

                y_p, y_t, r2_v, mse_v, mae_v = trainsvmmodel(
                    X, y, maxiter=iterations, splitratio=splitratio, kernel=kernel
                )

                if kernel in result_stores:
                    preds, truths, mses, maes, r2s = result_stores[kernel]
                    preds[columnname] = y_p
                    truths[columnname] = y_t
                    mses[columnname] = mse_v
                    maes[columnname] = mae_v
                    r2s[columnname] = r2_v

                # Update progress bar
                pbar.update(1)

Processing:   0%|                                        | 0/36 [00:00<?, ?it/s]

h:1_res:7(_poly)


Processing:  97%|█████████████████████████▎| 35/36 [10:33:10<18:05, 1085.44s/it]

Mean Squared Error (MSE): 27589.6947
R-squared (R2): -6.5134
Mean Absolute Error (MAE): 162.5069
h:1_res:8(_poly)


Processing: 100%|██████████████████████████| 36/36 [13:21:42<00:00, 1336.17s/it]

Mean Squared Error (MSE): 4239.1603
R-squared (R2): -2.3724
Mean Absolute Error (MAE): 60.8165





In [21]:
# Initialize a list to store data for the DataFrame
data = []

kernels = ['rbf', 'linear', 'poly']

# kernel name -> (predictions, ground truth, MSE, MAE, R2) containers
results_by_kernel = {
    'rbf': (pred_rbf, true_rbf, mse_rbf, mae_rbf, r2_rbf),
    'linear': (pred_lin, true_lin, mse_lin, mae_lin, r2_lin),
    'poly': (pred_poly, true_poly, mse_poly, mae_poly, r2_poly),
}

# Loop through the kernels, time and resolution.
# The metric values are read straight from the result dicts: binding them to a
# variable called `r2_score` would overwrite the imported
# sklearn.metrics.r2_score and break every later call of the training functions.
for kernel in kernels:
    preds, truths, mses, maes, r2s = results_by_kernel[kernel]
    for time_bucket in [24, 6, 2, 1]:
        for resolution in [6, 7, 8]:
            columnname = f'h:{time_bucket}_res:{resolution}(_{kernel})'
            predicted = preds[columnname]
            observed = truths[columnname]

            data.append({
                'Kernel': kernel,
                'Hexagon resolution': f'H3-{resolution}',
                'Time resolution': time_bucket,
                'R2-score': r2s[columnname],
                'MSE': mses[columnname],
                'MAE': maes[columnname],
                'Test_mean': np.mean(predicted),   # mean of the predictions
                'True_mean': np.mean(observed),    # mean of the held-out targets
                'Test_variance': np.var(predicted),
                'True_variance': np.var(observed),
            })

# Create the DataFrame
svmeval = pd.DataFrame(data)


# Display the DataFrame
svmeval

Unnamed: 0,Kernel,Hexagon resolution,Time resolution,R2-score,MSE,MAE,Test_mean,True_mean,Test_variance,True_variance
0,poly,H3-7,1,-6.513447,27589.694722,162.506889,167.409618,10.4891,242.593875,3672.042029
1,poly,H3-8,1,-2.372423,4239.160256,60.816532,65.542544,9.053392,84.609527,1257.007461


In [23]:
# Read the stored evaluation table (the file is not written by any of the
# cells shown here) and display it.
evaluation_csv = 'svm_evaluation_final.csv'
sveval = pd.read_csv(evaluation_csv)
sveval

Unnamed: 0,Kernel,Hexagon resolution,Time resolution,R2-score,MSE,MAE,Test_mean,True_mean,Test_variance,True_variance
0,poly,H3-7,1,-6.513447,27589.69,162.506889,167.409618,10.4891,242.5939,3672.042
1,poly,H3-8,1,-2.372423,4239.16,60.816532,65.542544,9.053392,84.60953,1257.007
2,linear,H3-6,1,-0.441903,22426.46,86.672924,83.021627,21.15861,5846.758,15553.37
3,linear,H3-7,1,-8.208999,35499.98,177.836046,182.865346,10.866327,3100.374,3854.923
4,linear,H3-8,1,-2.840778,5185.903,58.367503,63.342441,9.292351,1451.877,1350.222
5,poly,H3-6,24,0.578056,1418193.0,625.32192,-128.633608,310.205457,3427125.0,3361090.0
6,poly,H3-7,24,-0.012481,694712.5,639.309404,-501.901919,117.241571,157648.3,686148.9
7,poly,H3-8,24,0.170739,147683.2,96.471119,68.040611,83.536416,3911.769,178090.0
8,poly,H3-6,6,-0.003124,414059.5,455.955917,-347.090046,93.017846,91064.71,412769.9
9,poly,H3-7,6,-0.560551,111930.2,241.756263,-204.508138,36.465978,3097.041,71724.8
