In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.model_selection import train_test_split

# Loading and PreProcessing Data

In [2]:
df = pd.read_csv("Daily_attendance.csv")

In [3]:
df = df.drop("snow_1h", axis = 1)

In [4]:
datetime.strptime(df.iloc[0,0], '%m/%d/%Y').year

2019

In [5]:
df["Month"] = 0 
df["DOW"] = 0 
df["Year"] = 0

In [6]:
for i in range(0, len(df)):
    date = datetime.strptime(df.iloc[i,0], '%m/%d/%Y')
    df.loc[i, "Month"] = date.month
    df.loc[i, "DOW"] = date.weekday()
    df.loc[i, "Year"] = date.year

In [7]:
df["Month"].unique()

array([ 5,  6,  7,  8,  9, 10], dtype=int64)

In [8]:
df["DOW"].unique()

array([2, 3, 4, 5, 6, 0, 1], dtype=int64)

In [9]:
# month_part = 360/6
# dow_part = 360/7

In [10]:
# # df["DOW_cos"] = 0 
# # df["DOW_sin"] =0 
# df["Month_cos"] = 0
# df["Month_sin"]  = 0

In [11]:
# for i in range(0, len(df)):
#     month_mult = df.loc[i, "Month"] - 4
# #     dow_mult = df.loc[i, "DOW"] + 1
    
# #     df.loc[i, "DOW_cos"] = math.cos(dow_part*dow_mult)
# #     df.loc[i, "DOW_sin"] = math.sin(dow_part*dow_mult)

#     df.loc[i, "Month_cos"] = math.cos(month_part*month_mult)
#     df.loc[i, "Month_sin"] = math.sin(month_part*month_mult)

In [12]:
#into numeric binary:
for i in range(0, len(df)):
    if df.loc[i, "school_holiday_flag"] == "False":
        df.loc[i, "school_holiday_flag"] = 0 
    else:
        df.loc[i, "school_holiday_flag"] = 1
    
    if df.loc[i, "federal_holiday_flag"] == "False":
        df.loc[i, "federal_holiday_flag"] = 0 
    else:
        df.loc[i, "federal_holiday_flag"] = 1    

In [13]:
df["Tuesday"] = 0
df["Wednesday"] = 0
df["Thursday"] =0 
df["Friday"] =0 
df["Saturday"] =0 
df["Sunday"] = 0

In [14]:
for i in range(0, len(df)):
    if df.loc[i, "DOW"] == 0:
        df.loc[i, "Sunday"] =1
    elif df.loc[i, "DOW"] ==2:
        df.loc[i, "Tuesday"] = 1
    elif df.loc[i, "DOW"] ==3:
        df.loc[i, "Wednesday"] = 1
    elif df.loc[i, "DOW"] ==4:
        df.loc[i, "Thursday"] = 1
    elif df.loc[i, "DOW"] ==5:
        df.loc[i, "Friday"] = 1
    elif df.loc[i, "DOW"] ==6:
        df.loc[i, "Tuesday"] = 1

In [15]:
df["June"] = 0
df["July"] = 0
df["August"] =0 
df["September"] =0 
df["October"] =0 

In [16]:
for i in range(0, len(df)):
    if df.loc[i, "Month"] == 6:
        df.loc[i, "June"] =1
    elif df.loc[i, "Month"] ==7:
        df.loc[i, "July"] = 1
    elif df.loc[i, "Month"] ==8:
        df.loc[i, "August"] = 1
    elif df.loc[i, "Month"] ==9:
        df.loc[i, "September"] = 1
    elif df.loc[i, "Month"] ==10:
        df.loc[i, "October"] = 1

In [17]:
#Find and replace NaN values:
is_NaN = df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df[row_has_NaN]

In [18]:
rows_with_NaN

Unnamed: 0,Full_Date,value,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_gust,rain_1h,...,Wednesday,Thursday,Friday,Saturday,Sunday,June,July,August,September,October
59,6/29/2019,1494,58.19,50.49,72.43,1014.583333,82.541667,0.0,,1.0,...,0,0,1,0,0,1,0,0,0,0
334,6/28/2021,1290,72.325,61.09,90.68,1017.083333,86.833333,19.807917,,0.0,...,0,0,0,0,1,1,0,0,0,0
336,6/30/2021,1167,74.545,60.51,90.09,1012.875,84.291667,10.487083,,0.0,...,0,0,0,0,0,1,0,0,0,0


In [19]:
#replace with average wind gust
df.loc[59, "wind_gust"] = np.mean(df["wind_gust"])
df.loc[334, "wind_gust"] = np.mean(df["wind_gust"])
df.loc[336, "wind_gust"] = np.mean(df["wind_gust"])

In [107]:
#normalization:
# df["normalized_value"] = df["value"]/np.linalg.norm(df["value"], ord=np.inf, axis=0, keepdims=True)

In [110]:
# #scaling:
# target_max = max(df["value"])
# target_min =  min(df["value"])
# df["scaled_value"] = [(i- target_min)/(target_max - target_min) for i in df["value"]]

### Splitting Data

In [20]:
columns_for_modeling = ['temp', 'temp_min', 'temp_max', 'pressure',
       'humidity', 'wind_speed', 'wind_gust', 'rain_1h',
       'federal_holiday_flag', 'school_holiday_flag', 'Year',
    "Sunday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
                       "June", "July", "August","September", "October"]

In [23]:
x_train, x_test, y_train, y_test = train_test_split(df[columns_for_modeling], df["value"], test_size=0.33)

## SVM:

In [24]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [25]:
regr = make_pipeline(StandardScaler(), SVR(kernel="poly", C=100, gamma="auto", degree=3, epsilon=0.1, coef0=1))

In [26]:
regr.fit(x_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(C=100, coef0=1, gamma='auto', kernel='poly'))])

In [27]:
pred = regr.predict(x_test)

In [28]:
mean_squared_error(y_test, pred, squared=False)

328.52689983137947

In [29]:
svr_rbf = SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1)
svr_rbf.fit(x_train, y_train)
pred_rbf = svr_rbf.predict(x_test)
mean_squared_error(y_test, pred_rbf, squared=False)

497.8776970342663

In [30]:
svr_lin = SVR(kernel="linear", C=100, gamma="auto")
svr_lin.fit(x_train, y_train)
pred_lin = svr_lin.predict(x_test)
mean_squared_error(y_test, pred_lin, squared=False)

418.11008552803514