In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

import warnings
# Silence every warning category for the rest of the notebook session.
# NOTE(review): this also hides warnings raised by pandas and scikit-learn
# in the cells below, so problems they report will not be visible.
warnings.filterwarnings('ignore')

In [3]:
# Load the gym crowdedness CSV from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/crowdedness-at-the-campus-gym/data.csv')
# Bare last expression: render the loaded frame as the cell output.
data

Unnamed: 0,number_people,date,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour
0,37,2015-08-14 17:00:11-07:00,61211,4,0,0,71.76,0,0,8,17
1,45,2015-08-14 17:20:14-07:00,62414,4,0,0,71.76,0,0,8,17
2,40,2015-08-14 17:30:15-07:00,63015,4,0,0,71.76,0,0,8,17
3,44,2015-08-14 17:40:16-07:00,63616,4,0,0,71.76,0,0,8,17
4,45,2015-08-14 17:50:17-07:00,64217,4,0,0,71.76,0,0,8,17
...,...,...,...,...,...,...,...,...,...,...,...
62179,23,2017-03-18 18:42:28-07:00,67348,5,1,0,61.07,0,1,3,18
62180,21,2017-03-18 18:52:35-07:00,67955,5,1,0,61.07,0,1,3,18
62181,25,2017-03-18 19:02:40-07:00,68560,5,1,0,56.71,0,1,3,19
62182,18,2017-03-18 19:12:47-07:00,69167,5,1,0,56.71,0,1,3,19


In [18]:
def preprocess_inputs(df, train_size = 0.7, random_state = 1):
    """Turn the raw frame into scaled train/test features and targets.

    Steps:
      1. Derive ``month``, ``day``, ``hour`` and ``minute`` from the local
         wall-clock time in the ``date`` column, then drop ``date``.
      2. Use ``number_people`` as the target and everything else as features.
      3. Shuffle and split into train and test sets.
      4. Standardize the features with a scaler fitted on the training
         split only; the target is left unscaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column and a ``number_people`` column.
        The frame is copied, so the caller's object is not modified.
    train_size : float, default 0.7
        Passed to ``train_test_split`` as the training fraction.
    random_state : int, default 1
        Seed for the shuffled split.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features that keep the original row index and columns.
    y_train, y_test : pandas.Series
        Unscaled ``number_people`` targets for the matching rows.
    """
    df = df.copy()

    ## Extract date features
    # Strip a trailing "+HH:MM" / "-HH:MM" UTC offset before parsing. Rows
    # whose offsets differ (e.g. across a DST change) then parse into one
    # uniform naive datetime column instead of relying on pandas' mixed-offset
    # handling, which warns and later raises unless utc=True. The calendar
    # fields are the local wall-clock values, the same ones a per-row
    # Timestamp.month / .hour lookup returns.
    local_time = pd.to_datetime(
        df['date'].astype(str).str.replace(r'[+-]\d{2}:\d{2}$', '', regex = True)
    )
    df['month'] = local_time.dt.month
    df['day'] = local_time.dt.day
    df['hour'] = local_time.dt.hour
    df['minute'] = local_time.dt.minute
    df = df.drop('date', axis = 1)

    ## Split the df into X and y
    y = df['number_people']
    X = df.drop('number_people', axis = 1)

    ## Train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size = train_size, shuffle = True, random_state = random_state
    )

    ## Scale X ## Mean of zero and variance of 1
    # Fit on the training split only so no test-set statistics leak into the
    # transformation; the same fitted scaler is then applied to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns = X_test.columns)

    return X_train, X_test, y_train, y_test

In [19]:
# Build the standardized train/test features and the unscaled targets.
X_train, X_test, y_train, y_test = preprocess_inputs(data)


In [20]:
# Display the standardized training features.
X_train

Unnamed: 0,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,day,minute
23552,-0.997621,-0.989840,-0.623864,-0.049641,-0.565892,-0.292751,0.719964,-1.000469,-0.926864,-0.436112,-1.668369
3026,-0.278600,-1.491776,-0.623864,-0.049641,1.069425,-0.292751,0.719964,0.452734,-0.331144,-0.209158,1.227967
16668,0.837126,-1.491776,-0.623864,-0.049641,-1.043125,-0.292751,0.719964,-1.581751,0.860295,-1.684358,-0.509835
13838,0.762901,1.519844,1.602914,-0.049641,-0.833143,-0.292751,-1.388959,-1.872391,0.711366,-1.457404,1.227967
6459,1.357938,-1.491776,-0.623864,-0.049641,1.109194,-0.292751,0.719964,0.743375,1.307085,0.358226,1.227967
...,...,...,...,...,...,...,...,...,...,...,...
50057,-1.329483,1.017907,1.602914,-0.049641,0.547651,-0.292751,0.719964,0.743375,-1.373653,1.492995,0.996260
32511,1.308345,0.515970,-0.623864,-0.049641,0.429933,-0.292751,-1.388959,-0.419188,1.307085,0.925610,0.069433
5192,1.581108,-0.989840,-0.623864,-0.049641,0.631962,-0.292751,0.719964,0.743375,1.604945,-1.116974,-0.509835
12172,1.283424,0.014034,-0.623864,-0.049641,-1.035171,-0.292751,0.719964,1.324656,1.307085,0.131272,-0.509835


In [23]:
# Summary statistics of the training features, to check the scaling
# (each column should show a mean near 0 and a standard deviation near 1).
X_train.describe()

Unnamed: 0,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,day,minute
count,43528.0,43528.0,43528.0,43528.0,43528.0,43528.0,43528.0,43528.0,43528.0,43528.0,43528.0
mean,2.7260760000000003e-17,1.053294e-16,-6.953942000000001e-17,-9.631047e-18,-8.560205e-16,3.264762e-17,2.179228e-17,9.892228000000001e-17,1.043091e-16,3.3627050000000005e-17,-2.930124e-17
std,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011
min,-1.891004,-1.491776,-0.623864,-0.04964117,-3.24794,-0.2927505,-1.388959,-1.872391,-1.820443,-1.684358,-1.668369
25%,-0.7915195,-0.9898398,-0.623864,-0.04964117,-0.5595293,-0.2927505,-1.388959,-0.7098288,-0.7779337,-0.8900198,-0.7415414
50%,0.02534473,0.01403366,-0.623864,-0.04964117,-0.02980095,-0.2927505,0.7199639,0.1620932,-0.0332841,0.01779529,0.06943276
75%,0.8617983,1.017907,1.602914,-0.04964117,0.5921924,-0.2927505,0.7199639,0.7433745,0.8602954,0.8121335,0.7645535
max,1.679675,1.519844,1.602914,20.14457,4.551633,3.415877,0.7199639,1.324656,1.604945,1.719949,1.749308


In [24]:
# Candidate regressors, keyed by the label used when printing results.
# MLPRegressor (weight initialisation, batch shuffling) and
# RandomForestRegressor (bootstrap samples, feature sub-sampling) are
# stochastic, so both get a fixed seed to make a re-run reproduce its scores.
models = {
    "Linear Regression (Ridge)": Ridge(),
    "Neural Network": MLPRegressor(random_state = 1),
    "Random Forest": RandomForestRegressor(random_state = 1)
}

# Fit every model on the same standardized training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

Linear Regression (Ridge) trained.
Neural Network trained.
Random Forest trained.


In [32]:
def get_rmse(y_test, y_pred):
    """Root-mean-squared error between true and predicted values.

    Both inputs are converted to float arrays before subtracting, so the
    samples are paired by position. Subtracting two pandas objects directly
    would instead align them on index labels and could silently mispair or
    drop samples whose labels differ.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_test``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_test - y_pred) ** 2))``.
    """
    residuals = np.asarray(y_test, dtype = float) - np.asarray(y_pred, dtype = float)
    rmse = np.sqrt(np.mean(residuals ** 2))
    return rmse

def get_r2(y_test, y_pred):
    """Coefficient of determination (R^2) of predictions against true values.

    Computed as ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum of
    squared residuals and ``SS_tot`` is the sum of squared deviations of
    ``y_test`` from its own mean.

    Both inputs are converted to float arrays first, so the samples are
    paired by position rather than by pandas index label, and plain lists
    are accepted as well.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_test``.

    Returns
    -------
    numpy.float64
        The R^2 score. When ``y_test`` is constant, ``SS_tot`` is zero and
        the division yields ``nan`` or ``-inf``.
    """
    actual = np.asarray(y_test, dtype = float)
    predicted = np.asarray(y_pred, dtype = float)
    ss_res = np.sum((actual - predicted) ** 2)
    ss_tot = np.sum((actual - actual.mean()) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    return r2


# Score each fitted model on the held-out split and print its RMSE and R^2,
# followed by blank lines to separate the models in the output.
for name, model in models.items():
    y_pred = model.predict(X_test)
    rmse, r2 = get_rmse(y_test, y_pred), get_r2(y_test, y_pred)
    print(name + " RMSE: {:.2f}".format(rmse))
    print(name + " R_squared: {:.5f}".format(r2))
    print('\n')

Linear Regression (Ridge) RMSE: 16.04
Linear Regression (Ridge) R_squared: 0.50533


Neural Network RMSE: 12.39
Neural Network R_squared: 0.70481


Random Forest RMSE: 6.68
Random Forest R_squared: 0.91415


