Now we will revise the event prediction model we made earlier. There are a few key points in my head that, once implemented, should improve the model's accuracy to a great extent:
- The replacement of the old taxi model with the latest taxi & subway model
- The train/test split Laura suggested

The format of the target feature will also be changed from a normalized score to an absolute score, to better display the difference in busyness between different zones.

In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
import calendar
import contextily as ctx
import glob
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBRegressor
import pickle

In [2]:
# Load the taxi-zone lookup table and keep only the rows whose borough is Manhattan.
taxi_zones = pd.read_csv("C:/Users/user/Desktop/Summer Project/taxi_zones/taxi_zones.csv")
manhattan_zones = taxi_zones.loc[taxi_zones['borough'].eq('Manhattan')]

In [3]:
# Display the Manhattan subset of the taxi-zone table.
manhattan_zones

Unnamed: 0,OBJECTID,Shape_Leng,the_geom,Shape_Area,zone,LocationID,borough
3,4,0.043567,MULTIPOLYGON (((-73.97177410965318 40.72582128...,0.000112,Alphabet City,4,Manhattan
9,24,0.047000,MULTIPOLYGON (((-73.95953658899997 40.79871852...,0.000061,Bloomingdale,24,Manhattan
12,12,0.036661,MULTIPOLYGON (((-74.01565756599994 40.70483308...,0.000042,Battery Park,12,Manhattan
13,13,0.050281,MULTIPOLYGON (((-74.01244109299991 40.71905767...,0.000149,Battery Park City,13,Manhattan
40,41,0.052793,MULTIPOLYGON (((-73.94773985499985 40.80959972...,0.000143,Central Harlem,41,Manhattan
...,...,...,...,...,...,...,...
246,244,0.080569,MULTIPOLYGON (((-73.94068822000003 40.85131543...,0.000360,Washington Heights South,244,Manhattan
248,246,0.069467,MULTIPOLYGON (((-74.00439976203513 40.76267135...,0.000281,West Chelsea/Hudson Yards,246,Manhattan
251,249,0.036384,MULTIPOLYGON (((-74.00250642399995 40.72901638...,0.000072,West Village,249,Manhattan
261,261,0.027120,MULTIPOLYGON (((-74.01332610899988 40.70503078...,0.000034,World Trade Center,261,Manhattan


In [4]:
# Zone IDs used below to restrict trips to Manhattan drop-offs.
manhattan_zones.loc[:, 'LocationID']

3        4
9       24
12      12
13      13
40      41
      ... 
246    244
248    246
251    249
261    261
262    262
Name: LocationID, Length: 69, dtype: int64

In [5]:
# Trip-record columns that are not needed for the drop-off counts; they are
# removed from every monthly trip file right after the date features are derived.
columns_to_drop = [
    'VendorID',
    'passenger_count',
    'trip_distance',
    'RatecodeID',
    'store_and_fwd_flag',
    'PULocationID',
    'payment_type',
    'fare_amount',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount',
    'congestion_surcharge',
    'airport_fee',
]

# Model Training & Testing

Details will be explained for the first event below (Thanksgiving Day Parade). Since the same logic applies to all 8 events, understanding one event is enough to know how the whole thing runs. Since the majority of the process is the same as in the previous version, only the changed/added sections will be explained further.

# Thanksgiving Day Parade

## event day busyness

In [6]:
# Thanksgiving Day Parade: load the November 2018 yellow-taxi trips and keep the
# trips dropped off in a Manhattan zone on the event day (2018-11-22).
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-11.parquet")

# All calendar features are derived from the pickup timestamp.
# NOTE(review): date/hour come from the pickup time although drop-off zones are
# counted below - confirm this is intended.
pickup = df['tpep_pickup_datetime']
df['date'] = pickup.dt.normalize()
df['month'] = pickup.dt.month
df['time'] = pickup.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every unused trip column, then incomplete rows.
# PULocationID is among the dropped columns, so only the drop-off zone is filtered on.
raw_timestamps = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
df = df.drop(columns=raw_timestamps + columns_to_drop).dropna()

in_manhattan = df['DOLocationID'].isin(manhattan_zones['LocationID'])
on_event_day = df['time'].between(0, 23) & (df['date'] == "2018-11-22")
df = df[in_manhattan & on_event_day]
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
4147083,107,2018-11-22,11,10,Thursday
4147084,162,2018-11-22,11,10,Thursday
4147085,209,2018-11-22,11,10,Thursday
4147086,148,2018-11-22,11,11,Thursday
4147087,186,2018-11-22,11,11,Thursday
...,...,...,...,...,...
8152170,238,2018-11-22,11,22,Thursday
8152175,90,2018-11-22,11,22,Thursday
8152178,74,2018-11-22,11,22,Thursday
8152182,116,2018-11-22,11,22,Thursday


In [7]:
# Count the drop-offs per (zone, hour) on the event day.
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# One DataFrame per drop-off zone, keyed by DOLocationID. Iterating the groupby
# performs the split in a single vectorised pass; the previous iterrows() loop
# collected rows one by one and rebound the notebook-wide name `df`.
dfs = {category: zone_rows for category, zone_rows in df_agg.groupby('DOLocationID')}

In [8]:
# Make sure every zone in dfs has a row for each hour 0-23: hours without any
# recorded drop-off are added with num_dropoffs = 0 (a zero count, not a null).
all_hours = set(range(24))
for zone_id in list(dfs):
    zone_df = dfs[zone_id]
    absent_hours = all_hours - set(zone_df['time'])
    if not absent_hours:
        # Zone already covers the whole day; leave its frame untouched.
        continue
    filler = pd.DataFrame(
        [{'time': hour, 'DOLocationID': zone_id, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows and restore chronological order.
    dfs[zone_id] = pd.concat([zone_df, filler], ignore_index=True).sort_values('time')

In [9]:
# Stack the per-zone frames into one event-day table with a single concat.
# Growing a frame inside a loop re-copies all previous rows on every iteration and
# starts from an empty frame; an empty dfs still yields an empty DataFrame as before.
df_event = pd.concat(list(dfs.values()), axis=0) if dfs else pd.DataFrame()

In [10]:
# Renumber the rows and label the counts as event-day values. The counts stay
# absolute; no min-max scaling is applied to them.
df_event = (
    df_event
    .reset_index(drop=True)
    .rename(columns={'num_dropoffs': 'num_dropoffs_event'})
)
df_event

Unnamed: 0,DOLocationID,time,num_dropoffs_event
0,4,0,55
1,4,1,60
2,4,2,23
3,4,3,36
4,4,4,22
...,...,...,...
1579,263,19,204
1580,263,20,242
1581,263,21,248
1582,263,22,211


## baseline busyness

In [11]:
# Baseline for the Thanksgiving Day Parade: the same monthly file, restricted to
# Manhattan drop-offs on 2018-11-15 and 2018-11-08.
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-11.parquet")

# Calendar features are derived from the pickup timestamp.
pickup = df['tpep_pickup_datetime']
df['date'] = pickup.dt.normalize()
df['month'] = pickup.dt.month
df['time'] = pickup.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every unused trip column, then incomplete rows.
raw_timestamps = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
df = df.drop(columns=raw_timestamps + columns_to_drop).dropna()
df = df[df['DOLocationID'].isin(manhattan_zones['LocationID'])]

# Pool the two baseline days into one frame (first day listed comes first).
valid_hour = df['time'].between(0, 23)
df1 = df[valid_hour & (df['date'] == "2018-11-15")]
df2 = df[valid_hour & (df['date'] == "2018-11-08")]
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
0,140,2018-11-15,11,10,Thursday
1,237,2018-11-15,11,10,Thursday
2,170,2018-11-15,11,10,Thursday
3,164,2018-11-15,11,11,Thursday
4,237,2018-11-15,11,14,Thursday
...,...,...,...,...,...
469298,209,2018-11-08,11,21,Thursday
469299,75,2018-11-08,11,21,Thursday
469300,230,2018-11-08,11,22,Thursday
469301,113,2018-11-08,11,22,Thursday


In [12]:
# Count the drop-offs per (zone, hour) over the pooled baseline days.
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# One DataFrame per drop-off zone, keyed by DOLocationID. Iterating the groupby
# performs the split in a single vectorised pass; the previous iterrows() loop
# collected rows one by one and rebound the notebook-wide name `df`.
dfs = {category: zone_rows for category, zone_rows in df_agg.groupby('DOLocationID')}

In [13]:
# Make sure every zone in dfs has a row for each hour 0-23: hours without any
# recorded drop-off are added with num_dropoffs = 0 (a zero count, not a null).
all_hours = set(range(24))
for zone_id in list(dfs):
    zone_df = dfs[zone_id]
    absent_hours = all_hours - set(zone_df['time'])
    if not absent_hours:
        # Zone already covers the whole day; leave its frame untouched.
        continue
    filler = pd.DataFrame(
        [{'time': hour, 'DOLocationID': zone_id, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows and restore chronological order.
    dfs[zone_id] = pd.concat([zone_df, filler], ignore_index=True).sort_values('time')

In [14]:
# Stack the per-zone baseline frames with a single concat (instead of growing a
# frame inside a loop) and renumber the rows from 0.
df_bl = pd.concat(list(dfs.values()), axis=0) if dfs else pd.DataFrame()
df_bl = df_bl.reset_index(drop=True)

# The counts pool two baseline days, so halve them to get a per-day figure.
# Floor division keeps integer counts (an odd total is rounded down).
df_bl["num_dropoffs"] = df_bl["num_dropoffs"] // 2
df_bl

Unnamed: 0,DOLocationID,time,num_dropoffs
0,4,0,47
1,4,1,29
2,4,2,21
3,4,3,16
4,4,4,8
...,...,...,...
1579,263,19,350
1580,263,20,361
1581,263,21,373
1582,263,22,341


In [15]:
# Put baseline and event-day counts side by side, matched on (zone, hour).
# A positional concat(axis=1) only lines up when both tables contain exactly the
# same zones in the same order; a zone seen on only one of the days would shift
# every following row. Merging on the keys is safe in both cases, and a zone/hour
# missing on one side is counted as 0 drop-offs.
count_cols = ["num_dropoffs", "num_dropoffs_event"]
df_compare = (
    df_bl
    .merge(
        df_event[["DOLocationID", "time", "num_dropoffs_event"]],
        on=["DOLocationID", "time"],
        how="outer",
    )
    .fillna({col: 0 for col in count_cols})
    .astype({col: "int64" for col in count_cols})
    .sort_values(["DOLocationID", "time"])
    .reset_index(drop=True)
)
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event
0,4,0,47,55
1,4,1,29,60
2,4,2,21,23
3,4,3,16,36
4,4,4,8,22
...,...,...,...,...
1579,263,19,350,204
1580,263,20,361,242
1581,263,21,373,248
1582,263,22,341,211


In [16]:
# Left-closed hour blocks: [0,6) Night, [6,12) Morning, [12,17) Afternoon,
# [17,22) Evening, [22,24) Night. "Night" labels two blocks, hence ordered=False.
bins = [0, 6, 12, 17, 22, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening', 'Night']

# Attach the block name of each row's hour as a categorical column.
df_compare = df_compare.assign(
    TimeOfDay=pd.cut(
        df_compare['time'],
        bins=bins,
        labels=labels,
        right=False,
        include_lowest=True,
        ordered=False,
    )
)
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay
0,4,0,47,55,Night
1,4,1,29,60,Night
2,4,2,21,23,Night
3,4,3,16,36,Night
4,4,4,8,22,Night
...,...,...,...,...,...
1579,263,19,350,204,Evening
1580,263,20,361,242,Evening
1581,263,21,373,248,Evening
1582,263,22,341,211,Night


In [17]:
# Tag every row with this event's identifier (1) and put the identifying
# columns first, followed by the two count columns.
ordered_cols = ["Event_ID", "DOLocationID", "time", "TimeOfDay", "num_dropoffs", "num_dropoffs_event"]
df_compare = (
    df_compare
    .assign(Event_ID=1)
    .loc[:, ordered_cols]
    .reset_index(drop=True)
)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay,num_dropoffs,num_dropoffs_event
0,1,4,0,Night,47,55
1,1,4,1,Night,29,60
2,1,4,2,Night,21,23
3,1,4,3,Night,16,36
4,1,4,4,Night,8,22
...,...,...,...,...,...,...
1579,1,263,19,Evening,350,204
1580,1,263,20,Evening,361,242
1581,1,263,21,Evening,373,248
1582,1,263,22,Night,341,211


In [18]:
# One-hot encode TimeOfDay and replace the original column with the indicators.
# The dtype is pinned to uint8 so the indicators are 0/1 on every pandas version;
# the default became bool in pandas 2.0, which would be written as true/false
# when the table is exported to JSON.
dummies = pd.get_dummies(df_compare['TimeOfDay'], prefix='TimeOfDay', dtype='uint8')
df_compare = pd.concat([df_compare.drop(columns='TimeOfDay'), dummies], axis=1)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night
0,1,4,0,47,55,0,0,0,1
1,1,4,1,29,60,0,0,0,1
2,1,4,2,21,23,0,0,0,1
3,1,4,3,16,36,0,0,0,1
4,1,4,4,8,22,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1579,1,263,19,350,204,0,1,0,0
1580,1,263,20,361,242,0,1,0,0
1581,1,263,21,373,248,0,1,0,0
1582,1,263,22,341,211,0,0,0,1


In [19]:
# Final column layout: identifiers, time-of-day indicators, then the baseline
# and event-day counts.
final_cols = [
    "Event_ID", "DOLocationID", "time",
    "TimeOfDay_Afternoon", "TimeOfDay_Evening", "TimeOfDay_Morning", "TimeOfDay_Night",
    "num_dropoffs", "num_dropoffs_event",
]
df_compare = df_compare.loc[:, final_cols]
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event
0,1,4,0,0,0,0,1,47,55
1,1,4,1,0,0,0,1,29,60
2,1,4,2,0,0,0,1,21,23
3,1,4,3,0,0,0,1,16,36
4,1,4,4,0,0,0,1,8,22
...,...,...,...,...,...,...,...,...,...
1579,1,263,19,0,1,0,0,350,204
1580,1,263,20,0,1,0,0,361,242
1581,1,263,21,0,1,0,0,373,248
1582,1,263,22,0,0,0,1,341,211


In [20]:
# Write the Thanksgiving comparison table to disk as a JSON list of row records.
thanksgiving_json = 'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/Thanksgiving.json'
df_compare.to_json(thanksgiving_json, orient='records')

# Halloween Parade

## event day busyness

In [41]:
# Halloween Parade: load the October 2018 yellow-taxi trips and keep the trips
# dropped off in a Manhattan zone on 2018-10-28.
# NOTE(review): 2018-10-28 is a Sunday (see the output below) - confirm this is
# the intended event date for the parade.
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-10.parquet")

# All calendar features are derived from the pickup timestamp.
pickup = df['tpep_pickup_datetime']
df['date'] = pickup.dt.normalize()
df['month'] = pickup.dt.month
df['time'] = pickup.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every unused trip column, then incomplete rows.
# PULocationID is among the dropped columns, so only the drop-off zone is filtered on.
raw_timestamps = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
df = df.drop(columns=raw_timestamps + columns_to_drop).dropna()

in_manhattan = df['DOLocationID'].isin(manhattan_zones['LocationID'])
on_event_day = df['time'].between(0, 23) & (df['date'] == "2018-10-28")
df = df[in_manhattan & on_event_day]
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
1963989,237,2018-10-28,10,16,Sunday
7725484,13,2018-10-28,10,0,Sunday
7726733,4,2018-10-28,10,0,Sunday
7726734,148,2018-10-28,10,0,Sunday
7727519,234,2018-10-28,10,0,Sunday
...,...,...,...,...,...
8833042,74,2018-10-28,10,19,Sunday
8833045,42,2018-10-28,10,20,Sunday
8833048,234,2018-10-28,10,20,Sunday
8833051,48,2018-10-28,10,20,Sunday


In [42]:
# Count the drop-offs per (zone, hour) on the event day.
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# One DataFrame per drop-off zone, keyed by DOLocationID. Iterating the groupby
# performs the split in a single vectorised pass; the previous iterrows() loop
# collected rows one by one and rebound the notebook-wide name `df`.
dfs = {category: zone_rows for category, zone_rows in df_agg.groupby('DOLocationID')}

In [43]:
# Make sure every zone in dfs has a row for each hour 0-23: hours without any
# recorded drop-off are added with num_dropoffs = 0 (a zero count, not a null).
all_hours = set(range(24))
for zone_id in list(dfs):
    zone_df = dfs[zone_id]
    absent_hours = all_hours - set(zone_df['time'])
    if not absent_hours:
        # Zone already covers the whole day; leave its frame untouched.
        continue
    filler = pd.DataFrame(
        [{'time': hour, 'DOLocationID': zone_id, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows and restore chronological order.
    dfs[zone_id] = pd.concat([zone_df, filler], ignore_index=True).sort_values('time')

In [44]:
# Stack the per-zone frames into one event-day table with a single concat.
# Growing a frame inside a loop re-copies all previous rows on every iteration and
# starts from an empty frame; an empty dfs still yields an empty DataFrame as before.
df_event = pd.concat(list(dfs.values()), axis=0) if dfs else pd.DataFrame()

In [45]:
# Renumber the rows and label the counts as event-day values. The counts stay
# absolute; no min-max scaling is applied to them.
df_event = (
    df_event
    .reset_index(drop=True)
    .rename(columns={'num_dropoffs': 'num_dropoffs_event'})
)
df_event

Unnamed: 0,DOLocationID,time,num_dropoffs_event
0,4,0,132
1,4,1,134
2,4,2,118
3,4,3,104
4,4,4,66
...,...,...,...
1579,263,19,324
1580,263,20,328
1581,263,21,237
1582,263,22,180


## baseline busyness

In [46]:
# Baseline for the Halloween Parade: the same monthly file, restricted to
# Manhattan drop-offs on 2018-10-21 and 2018-10-14.
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-10.parquet")

# Calendar features are derived from the pickup timestamp.
pickup = df['tpep_pickup_datetime']
df['date'] = pickup.dt.normalize()
df['month'] = pickup.dt.month
df['time'] = pickup.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every unused trip column, then incomplete rows.
raw_timestamps = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
df = df.drop(columns=raw_timestamps + columns_to_drop).dropna()
df = df[df['DOLocationID'].isin(manhattan_zones['LocationID'])]

# Pool the two baseline days into one frame (first day listed comes first).
valid_hour = df['time'].between(0, 23)
df1 = df[valid_hour & (df['date'] == "2018-10-21")]
df2 = df[valid_hour & (df['date'] == "2018-10-14")]
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
0,142,2018-10-21,10,0,Sunday
1,243,2018-10-21,10,0,Sunday
2,79,2018-10-21,10,0,Sunday
3,142,2018-10-21,10,0,Sunday
4,186,2018-10-21,10,0,Sunday
...,...,...,...,...,...
453412,127,2018-10-14,10,22,Sunday
453413,161,2018-10-14,10,22,Sunday
453414,137,2018-10-14,10,22,Sunday
453415,232,2018-10-14,10,23,Sunday


In [47]:
# Count the drop-offs per (zone, hour) over the pooled baseline days.
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# One DataFrame per drop-off zone, keyed by DOLocationID. Iterating the groupby
# performs the split in a single vectorised pass; the previous iterrows() loop
# collected rows one by one and rebound the notebook-wide name `df`.
dfs = {category: zone_rows for category, zone_rows in df_agg.groupby('DOLocationID')}

In [48]:
# Make sure every zone in dfs has a row for each hour 0-23: hours without any
# recorded drop-off are added with num_dropoffs = 0 (a zero count, not a null).
all_hours = set(range(24))
for zone_id in list(dfs):
    zone_df = dfs[zone_id]
    absent_hours = all_hours - set(zone_df['time'])
    if not absent_hours:
        # Zone already covers the whole day; leave its frame untouched.
        continue
    filler = pd.DataFrame(
        [{'time': hour, 'DOLocationID': zone_id, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows and restore chronological order.
    dfs[zone_id] = pd.concat([zone_df, filler], ignore_index=True).sort_values('time')

In [49]:
# Stack the per-zone baseline frames with a single concat (instead of growing a
# frame inside a loop) and renumber the rows from 0.
df_bl = pd.concat(list(dfs.values()), axis=0) if dfs else pd.DataFrame()
df_bl = df_bl.reset_index(drop=True)

# The counts pool two baseline days, so halve them to get a per-day figure.
# Floor division keeps integer counts (an odd total is rounded down).
df_bl["num_dropoffs"] = df_bl["num_dropoffs"] // 2
df_bl

Unnamed: 0,DOLocationID,time,num_dropoffs
0,4,0,102
1,4,1,117
2,4,2,109
3,4,3,81
4,4,4,43
...,...,...,...
1579,263,19,352
1580,263,20,286
1581,263,21,271
1582,263,22,225


In [50]:
# Put baseline and event-day counts side by side, matched on (zone, hour).
# A positional concat(axis=1) only lines up when both tables contain exactly the
# same zones in the same order; a zone seen on only one of the days would shift
# every following row. Merging on the keys is safe in both cases, and a zone/hour
# missing on one side is counted as 0 drop-offs.
count_cols = ["num_dropoffs", "num_dropoffs_event"]
df_compare = (
    df_bl
    .merge(
        df_event[["DOLocationID", "time", "num_dropoffs_event"]],
        on=["DOLocationID", "time"],
        how="outer",
    )
    .fillna({col: 0 for col in count_cols})
    .astype({col: "int64" for col in count_cols})
    .sort_values(["DOLocationID", "time"])
    .reset_index(drop=True)
)
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event
0,4,0,102,132
1,4,1,117,134
2,4,2,109,118
3,4,3,81,104
4,4,4,43,66
...,...,...,...,...
1579,263,19,352,324
1580,263,20,286,328
1581,263,21,271,237
1582,263,22,225,180


In [51]:
# Left-closed hour blocks: [0,6) Night, [6,12) Morning, [12,17) Afternoon,
# [17,22) Evening, [22,24) Night. "Night" labels two blocks, hence ordered=False.
bins = [0, 6, 12, 17, 22, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening', 'Night']

# Attach the block name of each row's hour as a categorical column.
df_compare = df_compare.assign(
    TimeOfDay=pd.cut(
        df_compare['time'],
        bins=bins,
        labels=labels,
        right=False,
        include_lowest=True,
        ordered=False,
    )
)
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay
0,4,0,102,132,Night
1,4,1,117,134,Night
2,4,2,109,118,Night
3,4,3,81,104,Night
4,4,4,43,66,Night
...,...,...,...,...,...
1579,263,19,352,324,Evening
1580,263,20,286,328,Evening
1581,263,21,271,237,Evening
1582,263,22,225,180,Night


In [52]:
# Tag every row with this event's identifier (2) and put the identifying
# columns first, followed by the two count columns.
ordered_cols = ["Event_ID", "DOLocationID", "time", "TimeOfDay", "num_dropoffs", "num_dropoffs_event"]
df_compare = (
    df_compare
    .assign(Event_ID=2)
    .loc[:, ordered_cols]
    .reset_index(drop=True)
)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay,num_dropoffs,num_dropoffs_event
0,2,4,0,Night,102,132
1,2,4,1,Night,117,134
2,2,4,2,Night,109,118
3,2,4,3,Night,81,104
4,2,4,4,Night,43,66
...,...,...,...,...,...,...
1579,2,263,19,Evening,352,324
1580,2,263,20,Evening,286,328
1581,2,263,21,Evening,271,237
1582,2,263,22,Night,225,180


In [53]:
# One-hot encode TimeOfDay and replace the original column with the indicators.
# The dtype is pinned to uint8 so the indicators are 0/1 on every pandas version;
# the default became bool in pandas 2.0, which would be written as true/false
# when the table is exported to JSON.
dummies = pd.get_dummies(df_compare['TimeOfDay'], prefix='TimeOfDay', dtype='uint8')
df_compare = pd.concat([df_compare.drop(columns='TimeOfDay'), dummies], axis=1)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night
0,2,4,0,102,132,0,0,0,1
1,2,4,1,117,134,0,0,0,1
2,2,4,2,109,118,0,0,0,1
3,2,4,3,81,104,0,0,0,1
4,2,4,4,43,66,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1579,2,263,19,352,324,0,1,0,0
1580,2,263,20,286,328,0,1,0,0
1581,2,263,21,271,237,0,1,0,0
1582,2,263,22,225,180,0,0,0,1


In [54]:
# Final column layout: identifiers, time-of-day indicators, then the baseline
# and event-day counts.
final_cols = [
    "Event_ID", "DOLocationID", "time",
    "TimeOfDay_Afternoon", "TimeOfDay_Evening", "TimeOfDay_Morning", "TimeOfDay_Night",
    "num_dropoffs", "num_dropoffs_event",
]
df_compare = df_compare.loc[:, final_cols]
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event
0,2,4,0,0,0,0,1,102,132
1,2,4,1,0,0,0,1,117,134
2,2,4,2,0,0,0,1,109,118
3,2,4,3,0,0,0,1,81,104
4,2,4,4,0,0,0,1,43,66
...,...,...,...,...,...,...,...,...,...
1579,2,263,19,0,1,0,0,352,324
1580,2,263,20,0,1,0,0,286,328
1581,2,263,21,0,1,0,0,271,237
1582,2,263,22,0,0,0,1,225,180


In [55]:
# Write the Halloween comparison table to disk as a JSON list of row records.
halloween_json = 'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/Halloween.json'
df_compare.to_json(halloween_json, orient='records')

# Saint Patrick's Day Parade

## event day busyness

In [56]:
# Saint Patrick's Day Parade: load the March 2018 yellow-taxi trips and keep the
# trips dropped off in a Manhattan zone on the event day (2018-03-17).
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-03.parquet")

# All calendar features are derived from the pickup timestamp.
pickup = df['tpep_pickup_datetime']
df['date'] = pickup.dt.normalize()
df['month'] = pickup.dt.month
df['time'] = pickup.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every unused trip column, then incomplete rows.
# PULocationID is among the dropped columns, so only the drop-off zone is filtered on.
raw_timestamps = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
df = df.drop(columns=raw_timestamps + columns_to_drop).dropna()

in_manhattan = df['DOLocationID'].isin(manhattan_zones['LocationID'])
on_event_day = df['time'].between(0, 23) & (df['date'] == "2018-03-17")
df = df[in_manhattan & on_event_day]
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
5013309,140,2018-03-17,3,0,Saturday
5014092,249,2018-03-17,3,0,Saturday
5014093,244,2018-03-17,3,0,Saturday
5014108,151,2018-03-17,3,0,Saturday
5015217,151,2018-03-17,3,0,Saturday
...,...,...,...,...,...
5611337,41,2018-03-17,3,18,Saturday
5622584,68,2018-03-17,3,18,Saturday
5630765,234,2018-03-17,3,23,Saturday
5634406,144,2018-03-17,3,23,Saturday


In [57]:
# Count the drop-offs per (zone, hour) on the event day.
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# One DataFrame per drop-off zone, keyed by DOLocationID. Iterating the groupby
# performs the split in a single vectorised pass; the previous iterrows() loop
# collected rows one by one and rebound the notebook-wide name `df`.
dfs = {category: zone_rows for category, zone_rows in df_agg.groupby('DOLocationID')}

In [58]:
# Make sure every zone in dfs has a row for each hour 0-23: hours without any
# recorded drop-off are added with num_dropoffs = 0 (a zero count, not a null).
all_hours = set(range(24))
for zone_id in list(dfs):
    zone_df = dfs[zone_id]
    absent_hours = all_hours - set(zone_df['time'])
    if not absent_hours:
        # Zone already covers the whole day; leave its frame untouched.
        continue
    filler = pd.DataFrame(
        [{'time': hour, 'DOLocationID': zone_id, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows and restore chronological order.
    dfs[zone_id] = pd.concat([zone_df, filler], ignore_index=True).sort_values('time')

In [59]:
# Stack the per-zone frames into one event-day table with a single concat.
# Growing a frame inside a loop re-copies all previous rows on every iteration and
# starts from an empty frame; an empty dfs still yields an empty DataFrame as before.
df_event = pd.concat(list(dfs.values()), axis=0) if dfs else pd.DataFrame()

In [60]:
# Renumber the rows and label the counts as event-day values. The counts stay
# absolute; no min-max scaling is applied to them.
df_event = (
    df_event
    .reset_index(drop=True)
    .rename(columns={'num_dropoffs': 'num_dropoffs_event'})
)
df_event

Unnamed: 0,DOLocationID,time,num_dropoffs_event
0,4,0,131
1,4,1,131
2,4,2,86
3,4,3,78
4,4,4,73
...,...,...,...
1579,263,19,515
1580,263,20,461
1581,263,21,459
1582,263,22,465


## baseline busyness

In [61]:
# Baseline for the Saint Patrick's Day Parade: the same monthly file, restricted
# to Manhattan drop-offs on 2018-03-10 and 2018-03-03.
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-03.parquet")

# Calendar features are derived from the pickup timestamp.
pickup = df['tpep_pickup_datetime']
df['date'] = pickup.dt.normalize()
df['month'] = pickup.dt.month
df['time'] = pickup.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every unused trip column, then incomplete rows.
raw_timestamps = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
df = df.drop(columns=raw_timestamps + columns_to_drop).dropna()
df = df[df['DOLocationID'].isin(manhattan_zones['LocationID'])]

# Pool the two baseline days into one frame (first day listed comes first).
valid_hour = df['time'].between(0, 23)
df1 = df[valid_hour & (df['date'] == "2018-03-10")]
df2 = df[valid_hour & (df['date'] == "2018-03-03")]
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
0,140,2018-03-10,3,0,Saturday
1,143,2018-03-10,3,23,Saturday
2,263,2018-03-10,3,0,Saturday
3,262,2018-03-10,3,0,Saturday
4,229,2018-03-10,3,0,Saturday
...,...,...,...,...,...
584147,162,2018-03-03,3,16,Saturday
584148,90,2018-03-03,3,20,Saturday
584149,79,2018-03-03,3,21,Saturday
584150,164,2018-03-03,3,22,Saturday


In [62]:
# Count the drop-offs per (zone, hour) over the pooled baseline days.
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# One DataFrame per drop-off zone, keyed by DOLocationID. Iterating the groupby
# performs the split in a single vectorised pass; the previous iterrows() loop
# collected rows one by one and rebound the notebook-wide name `df`.
dfs = {category: zone_rows for category, zone_rows in df_agg.groupby('DOLocationID')}

In [63]:
# Make sure every zone in dfs has a row for each hour 0-23: hours without any
# recorded drop-off are added with num_dropoffs = 0 (a zero count, not a null).
all_hours = set(range(24))
for zone_id in list(dfs):
    zone_df = dfs[zone_id]
    absent_hours = all_hours - set(zone_df['time'])
    if not absent_hours:
        # Zone already covers the whole day; leave its frame untouched.
        continue
    filler = pd.DataFrame(
        [{'time': hour, 'DOLocationID': zone_id, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows and restore chronological order.
    dfs[zone_id] = pd.concat([zone_df, filler], ignore_index=True).sort_values('time')

In [64]:
# Stack the per-zone baseline frames with a single concat (instead of growing a
# frame inside a loop) and renumber the rows from 0.
df_bl = pd.concat(list(dfs.values()), axis=0) if dfs else pd.DataFrame()
df_bl = df_bl.reset_index(drop=True)

# The counts pool two baseline days, so halve them to get a per-day figure.
# Floor division keeps integer counts (an odd total is rounded down).
df_bl["num_dropoffs"] = df_bl["num_dropoffs"] // 2
df_bl

Unnamed: 0,DOLocationID,time,num_dropoffs
0,4,0,136
1,4,1,111
2,4,2,102
3,4,3,78
4,4,4,57
...,...,...,...
1579,263,19,446
1580,263,20,383
1581,263,21,362
1582,263,22,415


In [65]:
# Put baseline and event-day counts side by side, matched on (zone, hour).
# A positional concat(axis=1) only lines up when both tables contain exactly the
# same zones in the same order; a zone seen on only one of the days would shift
# every following row. Merging on the keys is safe in both cases, and a zone/hour
# missing on one side is counted as 0 drop-offs.
count_cols = ["num_dropoffs", "num_dropoffs_event"]
df_compare = (
    df_bl
    .merge(
        df_event[["DOLocationID", "time", "num_dropoffs_event"]],
        on=["DOLocationID", "time"],
        how="outer",
    )
    .fillna({col: 0 for col in count_cols})
    .astype({col: "int64" for col in count_cols})
    .sort_values(["DOLocationID", "time"])
    .reset_index(drop=True)
)
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event
0,4,0,136,131
1,4,1,111,131
2,4,2,102,86
3,4,3,78,78
4,4,4,57,73
...,...,...,...,...
1579,263,19,446,515
1580,263,20,383,461
1581,263,21,362,459
1582,263,22,415,465


In [66]:
# Left-closed hour blocks: [0,6) Night, [6,12) Morning, [12,17) Afternoon,
# [17,22) Evening, [22,24) Night. "Night" labels two blocks, hence ordered=False.
bins = [0, 6, 12, 17, 22, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening', 'Night']

# Attach the block name of each row's hour as a categorical column.
df_compare = df_compare.assign(
    TimeOfDay=pd.cut(
        df_compare['time'],
        bins=bins,
        labels=labels,
        right=False,
        include_lowest=True,
        ordered=False,
    )
)
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay
0,4,0,136,131,Night
1,4,1,111,131,Night
2,4,2,102,86,Night
3,4,3,78,78,Night
4,4,4,57,73,Night
...,...,...,...,...,...
1579,263,19,446,515,Evening
1580,263,20,383,461,Evening
1581,263,21,362,459,Evening
1582,263,22,415,465,Night


In [67]:
# Tag every row with this event's identifier (3) and put the identifying
# columns first, followed by the two count columns.
ordered_cols = ["Event_ID", "DOLocationID", "time", "TimeOfDay", "num_dropoffs", "num_dropoffs_event"]
df_compare = (
    df_compare
    .assign(Event_ID=3)
    .loc[:, ordered_cols]
    .reset_index(drop=True)
)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay,num_dropoffs,num_dropoffs_event
0,3,4,0,Night,136,131
1,3,4,1,Night,111,131
2,3,4,2,Night,102,86
3,3,4,3,Night,78,78
4,3,4,4,Night,57,73
...,...,...,...,...,...,...
1579,3,263,19,Evening,446,515
1580,3,263,20,Evening,383,461
1581,3,263,21,Evening,362,459
1582,3,263,22,Night,415,465


In [68]:
# One-hot encode TimeOfDay and replace the original column with the indicators.
# The dtype is pinned to uint8 so the indicators are 0/1 on every pandas version;
# the default became bool in pandas 2.0, which would be written as true/false
# when the table is exported to JSON.
dummies = pd.get_dummies(df_compare['TimeOfDay'], prefix='TimeOfDay', dtype='uint8')
df_compare = pd.concat([df_compare.drop(columns='TimeOfDay'), dummies], axis=1)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night
0,3,4,0,136,131,0,0,0,1
1,3,4,1,111,131,0,0,0,1
2,3,4,2,102,86,0,0,0,1
3,3,4,3,78,78,0,0,0,1
4,3,4,4,57,73,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1579,3,263,19,446,515,0,1,0,0
1580,3,263,20,383,461,0,1,0,0
1581,3,263,21,362,459,0,1,0,0
1582,3,263,22,415,465,0,0,0,1


In [69]:
# Final column layout: identifiers, time-of-day indicators, then the baseline
# and event-day counts.
final_cols = [
    "Event_ID", "DOLocationID", "time",
    "TimeOfDay_Afternoon", "TimeOfDay_Evening", "TimeOfDay_Morning", "TimeOfDay_Night",
    "num_dropoffs", "num_dropoffs_event",
]
df_compare = df_compare.loc[:, final_cols]
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event
0,3,4,0,0,0,0,1,136,131
1,3,4,1,0,0,0,1,111,131
2,3,4,2,0,0,0,1,102,86
3,3,4,3,0,0,0,1,78,78
4,3,4,4,0,0,0,1,57,73
...,...,...,...,...,...,...,...,...,...
1579,3,263,19,0,1,0,0,446,515
1580,3,263,20,0,1,0,0,383,461
1581,3,263,21,0,1,0,0,362,459
1582,3,263,22,0,0,0,1,415,465


In [70]:
# Write the Saint Patrick's Day comparison table to disk as a JSON list of row records.
saint_patricks_json = 'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/SaintPatricks.json'
df_compare.to_json(saint_patricks_json, orient='records')

# Pride March

## event day busyness

In [71]:
# Pride March: load the June 2018 yellow-taxi trips and keep the trips dropped
# off in a Manhattan zone on the event day (2018-06-24).
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-06.parquet")

# All calendar features are derived from the pickup timestamp.
pickup = df['tpep_pickup_datetime']
df['date'] = pickup.dt.normalize()
df['month'] = pickup.dt.month
df['time'] = pickup.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every unused trip column, then incomplete rows.
# PULocationID is among the dropped columns, so only the drop-off zone is filtered on.
raw_timestamps = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
df = df.drop(columns=raw_timestamps + columns_to_drop).dropna()

in_manhattan = df['DOLocationID'].isin(manhattan_zones['LocationID'])
on_event_day = df['time'].between(0, 23) & (df['date'] == "2018-06-24")
df = df[in_manhattan & on_event_day]
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
6496829,239,2018-06-24,6,0,Sunday
6519484,107,2018-06-24,6,2,Sunday
6519485,140,2018-06-24,6,2,Sunday
6527202,161,2018-06-24,6,3,Sunday
6598864,90,2018-06-24,6,12,Sunday
...,...,...,...,...,...
7015439,68,2018-06-24,6,20,Sunday
7016321,41,2018-06-24,6,22,Sunday
7016322,236,2018-06-24,6,22,Sunday
7017463,244,2018-06-24,6,20,Sunday


In [72]:
# Count the drop-offs per (zone, hour) on the event day.
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# One DataFrame per drop-off zone, keyed by DOLocationID. Iterating the groupby
# performs the split in a single vectorised pass; the previous iterrows() loop
# collected rows one by one and rebound the notebook-wide name `df`.
dfs = {category: zone_rows for category, zone_rows in df_agg.groupby('DOLocationID')}

In [73]:
# Make sure every zone in dfs has a row for each hour 0-23: hours without any
# recorded drop-off are added with num_dropoffs = 0 (a zero count, not a null).
all_hours = set(range(24))
for zone_id in list(dfs):
    zone_df = dfs[zone_id]
    absent_hours = all_hours - set(zone_df['time'])
    if not absent_hours:
        # Zone already covers the whole day; leave its frame untouched.
        continue
    filler = pd.DataFrame(
        [{'time': hour, 'DOLocationID': zone_id, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows and restore chronological order.
    dfs[zone_id] = pd.concat([zone_df, filler], ignore_index=True).sort_values('time')

In [74]:
# Stack the per-zone frames into a single event-day table (zone order preserved).
# One pd.concat over all frames replaces growing df_event inside a loop, which
# re-copied the accumulated rows on every iteration and started from an empty
# DataFrame (recent pandas versions warn about empty entries in concat).
zone_frames = list(dfs.values())
# pd.concat rejects an empty list, so keep the old "empty table" result for that case.
df_event = pd.concat(zone_frames, axis=0) if zone_frames else pd.DataFrame()

In [75]:
# Renumber the rows 0..n-1 and mark the count column as the event-day value.
# The counts stay absolute: the MinMaxScaler normalisation is deliberately not applied here.
df_event = (
    df_event
    .reset_index(drop=True)
    .rename(columns={'num_dropoffs': 'num_dropoffs_event'})
)
df_event

Unnamed: 0,DOLocationID,time,num_dropoffs_event
0,4,0,115
1,4,1,95
2,4,2,107
3,4,3,73
4,4,4,56
...,...,...,...
1579,263,19,330
1580,263,20,289
1581,263,21,271
1582,263,22,235


## baseline busyness

In [76]:
# Baseline for the table exported as PrideMarch.json: reload the June 2018 trips
# and keep the Manhattan drop-offs picked up on the two Sundays before the event
# day (2018-06-17 and 2018-06-10).
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-06.parquet")

# Calendar features derived from the pickup timestamp.
pickup_ts = df['tpep_pickup_datetime']
df['date'] = pickup_ts.dt.normalize()
df['month'] = pickup_ts.dt.month
df['time'] = pickup_ts.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every column listed in columns_to_drop,
# then remove rows that still contain missing values.
df = (
    df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)
      .drop(columns=columns_to_drop)
      .dropna()
)

# Manhattan drop-offs only (a pickup-zone filter on PULocationID existed here but was disabled).
df = df[df['DOLocationID'].isin(manhattan_zones['LocationID'])]

# One slice per baseline day, stacked with a fresh 0..n-1 index.
valid_hour = (df['time'] >= 0) & (df['time'] <= 23)
df1 = df[valid_hour & (df['date'] == "2018-06-17")]
df2 = df[valid_hour & (df['date'] == "2018-06-10")]
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
0,68,2018-06-17,6,4,Sunday
1,141,2018-06-17,6,9,Sunday
2,170,2018-06-17,6,9,Sunday
3,74,2018-06-17,6,14,Sunday
4,161,2018-06-17,6,9,Sunday
...,...,...,...,...,...
424139,68,2018-06-10,6,16,Sunday
424140,229,2018-06-10,6,17,Sunday
424141,164,2018-06-10,6,16,Sunday
424142,158,2018-06-10,6,16,Sunday


In [77]:
# Count the baseline trips (both baseline days together) per (drop-off zone, hour).
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# Split the counts into one DataFrame per zone, keyed by DOLocationID.
# groupby replaces the former row-by-row iterrows() bucketing: zones still come
# out in ascending order and every frame keeps its rows (and index labels) from
# df_agg, but no per-row Python loop is needed and `df` is no longer overwritten.
dfs = {category: zone_counts for category, zone_counts in df_agg.groupby('DOLocationID')}

In [78]:
# Give every zone a row for each of the 24 hours: hours without any recorded
# drop-off are added with num_dropoffs = 0 (a real zero, not a null).
all_hours = set(range(24))
for category in list(dfs):
    zone_counts = dfs[category]
    absent_hours = all_hours - set(zone_counts['time'])
    if not absent_hours:
        continue  # zone already covers the full day; leave it untouched

    zero_rows = pd.DataFrame(
        [{'time': hour, 'DOLocationID': category, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows, then restore chronological order within the zone.
    dfs[category] = pd.concat([zone_counts, zero_rows], ignore_index=True).sort_values('time')

In [79]:
# Stack the per-zone frames into one baseline table with a fresh 0..n-1 index.
# A single pd.concat(ignore_index=True) replaces the concat-inside-a-loop that
# started from an empty DataFrame (quadratic copying, and recent pandas warns
# about empty entries) as well as the reset_index / drop("index") pair.
df_bl = pd.concat(list(dfs.values()), axis=0, ignore_index=True)

# The counts cover two baseline days; floor-divide by 2 for a per-day average
# (the fraction is dropped so the column stays integer).
df_bl["num_dropoffs"] = df_bl["num_dropoffs"] // 2
df_bl

Unnamed: 0,DOLocationID,time,num_dropoffs
0,4,0,108
1,4,1,106
2,4,2,82
3,4,3,63
4,4,4,36
...,...,...,...
1579,263,19,309
1580,263,20,269
1581,263,21,273
1582,263,22,218


In [80]:
# Put the baseline and event-day counts side by side, matched on (zone, hour).
# Merging on the keys replaces a positional concat, which only lines up when both
# tables hold exactly the same zones in the same order; a zone seen on just one
# of the days would shift or NaN-pad every following row without any error.
df_compare = (
    df_bl.merge(
        df_event[["DOLocationID", "time", "num_dropoffs_event"]],
        on=["DOLocationID", "time"],
        how="outer",
        validate="one_to_one",  # each (zone, hour) may occur only once per table
    )
    .sort_values(["DOLocationID", "time"])
    .reset_index(drop=True)
)
# A (zone, hour) pair missing from one table means no drop-offs were recorded there.
count_cols = ["num_dropoffs", "num_dropoffs_event"]
df_compare[count_cols] = df_compare[count_cols].fillna(0).astype("int64")
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event
0,4,0,108,115
1,4,1,106,95
2,4,2,82,107
3,4,3,63,73
4,4,4,36,56
...,...,...,...,...
1579,263,19,309,330
1580,263,20,269,289
1581,263,21,273,271
1582,263,22,218,235


In [81]:
# Label each hour with a coarse time-of-day block. Intervals are left-closed:
#   [0, 6) Night, [6, 12) Morning, [12, 17) Afternoon, [17, 22) Evening, [22, 24) Night
# "Night" is used for two intervals, so the labels are declared unordered.
bins = [0, 6, 12, 17, 22, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening', 'Night']

time_block = pd.cut(
    df_compare['time'],
    bins=bins,
    labels=labels,
    right=False,
    include_lowest=True,
    ordered=False,
)
df_compare = df_compare.assign(TimeOfDay=time_block)
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay
0,4,0,108,115,Night
1,4,1,106,95,Night
2,4,2,82,107,Night
3,4,3,63,73,Night
4,4,4,36,56,Night
...,...,...,...,...,...
1579,263,19,309,330,Evening
1580,263,20,269,289,Evening
1581,263,21,273,271,Evening
1582,263,22,218,235,Night


In [82]:
# Tag every row with this event's numeric ID (4, the table exported as
# PrideMarch.json), move the ID to the front and renumber the rows.
ordered_cols = ["Event_ID", "DOLocationID", "time", "TimeOfDay", "num_dropoffs", "num_dropoffs_event"]
df_compare = df_compare.assign(Event_ID=4)[ordered_cols].reset_index(drop=True)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay,num_dropoffs,num_dropoffs_event
0,4,4,0,Night,108,115
1,4,4,1,Night,106,95
2,4,4,2,Night,82,107
3,4,4,3,Night,63,73
4,4,4,4,Night,36,56
...,...,...,...,...,...,...
1579,4,263,19,Evening,309,330
1580,4,263,20,Evening,269,289
1581,4,263,21,Evening,273,271
1582,4,263,22,Night,218,235


In [83]:
# One-hot encode TimeOfDay into TimeOfDay_<label> indicator columns.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to bool, which
# would be exported as true/false instead of the 0/1 values the dataset uses.
dummies = pd.get_dummies(df_compare['TimeOfDay'], prefix='TimeOfDay', dtype=int)
# Swap the categorical column for its indicator columns (appended at the end).
df_compare = pd.concat([df_compare.drop(columns='TimeOfDay'), dummies], axis=1)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night
0,4,4,0,108,115,0,0,0,1
1,4,4,1,106,95,0,0,0,1
2,4,4,2,82,107,0,0,0,1
3,4,4,3,63,73,0,0,0,1
4,4,4,4,36,56,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1579,4,263,19,309,330,0,1,0,0
1580,4,263,20,269,289,0,1,0,0
1581,4,263,21,273,271,0,1,0,0
1582,4,263,22,218,235,0,0,0,1


In [84]:
# Final column layout: identifiers, hour, time-of-day indicators, and the
# baseline / event-day counts last.
final_cols = [
    "Event_ID", "DOLocationID", "time",
    "TimeOfDay_Afternoon", "TimeOfDay_Evening", "TimeOfDay_Morning", "TimeOfDay_Night",
    "num_dropoffs", "num_dropoffs_event",
]
df_compare = df_compare.loc[:, final_cols]
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event
0,4,4,0,0,0,0,1,108,115
1,4,4,1,0,0,0,1,106,95
2,4,4,2,0,0,0,1,82,107
3,4,4,3,0,0,0,1,63,73
4,4,4,4,0,0,0,1,36,56
...,...,...,...,...,...,...,...,...,...
1579,4,263,19,0,1,0,0,309,330
1580,4,263,20,0,1,0,0,269,289
1581,4,263,21,0,1,0,0,273,271
1582,4,263,22,0,0,0,1,218,235


In [85]:
# Write the comparison table to disk as a JSON array with one object per row.
json_path = 'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/PrideMarch.json'
df_compare.to_json(json_path, orient='records')

# Times Square Ball Drop

## event day busyness

In [86]:
# Times Square Ball, event day: load the December 2018 yellow-taxi trips and
# keep the Manhattan drop-offs picked up on 2018-12-31.
# NOTE(review): date and hour come from the *pickup* timestamp although rows are
# later counted per drop-off zone -- confirm this is intended.
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-12.parquet")

# Calendar features derived from the pickup timestamp.
pickup_ts = df['tpep_pickup_datetime']
df['date'] = pickup_ts.dt.normalize()
df['month'] = pickup_ts.dt.month
df['time'] = pickup_ts.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every column listed in columns_to_drop,
# then remove rows that still contain missing values.
df = (
    df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)
      .drop(columns=columns_to_drop)
      .dropna()
)

# Row filter: drop-off zone in Manhattan, hour within 0-23, pickup on the event date.
# (A pickup-zone filter on PULocationID existed here but was disabled.)
keep_rows = (
    df['DOLocationID'].isin(manhattan_zones['LocationID'])
    & (df['time'] >= 0)
    & (df['time'] <= 23)
    & (df['date'] == "2018-12-31")
)
df = df[keep_rows]
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
4098378,234,2018-12-31,12,16,Monday
7956719,75,2018-12-31,12,0,Monday
7957133,88,2018-12-31,12,0,Monday
7957319,229,2018-12-31,12,0,Monday
7957320,141,2018-12-31,12,0,Monday
...,...,...,...,...,...
8195636,231,2018-12-31,12,22,Monday
8195657,100,2018-12-31,12,22,Monday
8195664,244,2018-12-31,12,23,Monday
8195669,107,2018-12-31,12,23,Monday


In [87]:
# Count the event-day trips per (drop-off zone, hour): one row per pair that occurs.
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# Split the counts into one DataFrame per zone, keyed by DOLocationID.
# groupby replaces the former row-by-row iterrows() bucketing: zones still come
# out in ascending order and every frame keeps its rows (and index labels) from
# df_agg, but no per-row Python loop is needed and `df` is no longer overwritten.
dfs = {category: zone_counts for category, zone_counts in df_agg.groupby('DOLocationID')}

In [88]:
# Give every zone a row for each of the 24 hours: hours without any recorded
# drop-off are added with num_dropoffs = 0 (a real zero, not a null).
all_hours = set(range(24))
for category in list(dfs):
    zone_counts = dfs[category]
    absent_hours = all_hours - set(zone_counts['time'])
    if not absent_hours:
        continue  # zone already covers the full day; leave it untouched

    zero_rows = pd.DataFrame(
        [{'time': hour, 'DOLocationID': category, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows, then restore chronological order within the zone.
    dfs[category] = pd.concat([zone_counts, zero_rows], ignore_index=True).sort_values('time')

In [89]:
# Stack the per-zone frames into a single event-day table (zone order preserved).
# One pd.concat over all frames replaces growing df_event inside a loop, which
# re-copied the accumulated rows on every iteration and started from an empty
# DataFrame (recent pandas versions warn about empty entries in concat).
zone_frames = list(dfs.values())
# pd.concat rejects an empty list, so keep the old "empty table" result for that case.
df_event = pd.concat(zone_frames, axis=0) if zone_frames else pd.DataFrame()

In [90]:
# Renumber the rows 0..n-1 and mark the count column as the event-day value.
# The counts stay absolute: the MinMaxScaler normalisation is deliberately not applied here.
df_event = (
    df_event
    .reset_index(drop=True)
    .rename(columns={'num_dropoffs': 'num_dropoffs_event'})
)
df_event

Unnamed: 0,DOLocationID,time,num_dropoffs_event
0,4,0,47
1,4,1,37
2,4,2,20
3,4,3,15
4,4,4,13
...,...,...,...
1579,263,19,434
1580,263,20,358
1581,263,21,393
1582,263,22,278


## baseline busyness

In [91]:
# Times Square Ball, baseline: reload the December 2018 trips and keep the
# Manhattan drop-offs picked up on the two Mondays before the event day
# (2018-12-24 and 2018-12-17).
# NOTE(review): 2018-12-24 is Christmas Eve and may not be a typical Monday --
# confirm it is a suitable baseline day.
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-12.parquet")

# Calendar features derived from the pickup timestamp.
pickup_ts = df['tpep_pickup_datetime']
df['date'] = pickup_ts.dt.normalize()
df['month'] = pickup_ts.dt.month
df['time'] = pickup_ts.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every column listed in columns_to_drop,
# then remove rows that still contain missing values.
df = (
    df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)
      .drop(columns=columns_to_drop)
      .dropna()
)

# Manhattan drop-offs only (a pickup-zone filter on PULocationID existed here but was disabled).
df = df[df['DOLocationID'].isin(manhattan_zones['LocationID'])]

# One slice per baseline day, stacked with a fresh 0..n-1 index.
valid_hour = (df['time'] >= 0) & (df['time'] <= 23)
df1 = df[valid_hour & (df['date'] == "2018-12-24")]
df2 = df[valid_hour & (df['date'] == "2018-12-17")]
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
0,162,2018-12-24,12,17,Monday
1,79,2018-12-24,12,17,Monday
2,79,2018-12-24,12,17,Monday
3,249,2018-12-24,12,17,Monday
4,230,2018-12-24,12,17,Monday
...,...,...,...,...,...
408869,42,2018-12-17,12,20,Monday
408870,232,2018-12-17,12,20,Monday
408871,90,2018-12-17,12,21,Monday
408872,232,2018-12-17,12,21,Monday


In [92]:
# Count the baseline trips (both baseline days together) per (drop-off zone, hour).
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# Split the counts into one DataFrame per zone, keyed by DOLocationID.
# groupby replaces the former row-by-row iterrows() bucketing: zones still come
# out in ascending order and every frame keeps its rows (and index labels) from
# df_agg, but no per-row Python loop is needed and `df` is no longer overwritten.
dfs = {category: zone_counts for category, zone_counts in df_agg.groupby('DOLocationID')}

In [93]:
# Give every zone a row for each of the 24 hours: hours without any recorded
# drop-off are added with num_dropoffs = 0 (a real zero, not a null).
all_hours = set(range(24))
for category in list(dfs):
    zone_counts = dfs[category]
    absent_hours = all_hours - set(zone_counts['time'])
    if not absent_hours:
        continue  # zone already covers the full day; leave it untouched

    zero_rows = pd.DataFrame(
        [{'time': hour, 'DOLocationID': category, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows, then restore chronological order within the zone.
    dfs[category] = pd.concat([zone_counts, zero_rows], ignore_index=True).sort_values('time')

In [94]:
# Stack the per-zone frames into one baseline table with a fresh 0..n-1 index.
# A single pd.concat(ignore_index=True) replaces the concat-inside-a-loop that
# started from an empty DataFrame (quadratic copying, and recent pandas warns
# about empty entries) as well as the reset_index / drop("index") pair.
df_bl = pd.concat(list(dfs.values()), axis=0, ignore_index=True)

# The counts cover two baseline days; floor-divide by 2 for a per-day average
# (the fraction is dropped so the column stays integer).
df_bl["num_dropoffs"] = df_bl["num_dropoffs"] // 2
df_bl

Unnamed: 0,DOLocationID,time,num_dropoffs
0,4,0,32
1,4,1,27
2,4,2,17
3,4,3,8
4,4,4,8
...,...,...,...
1579,263,19,350
1580,263,20,343
1581,263,21,285
1582,263,22,263


In [95]:
# Put the baseline and event-day counts side by side, matched on (zone, hour).
# Merging on the keys replaces a positional concat, which only lines up when both
# tables hold exactly the same zones in the same order; a zone seen on just one
# of the days would shift or NaN-pad every following row without any error.
df_compare = (
    df_bl.merge(
        df_event[["DOLocationID", "time", "num_dropoffs_event"]],
        on=["DOLocationID", "time"],
        how="outer",
        validate="one_to_one",  # each (zone, hour) may occur only once per table
    )
    .sort_values(["DOLocationID", "time"])
    .reset_index(drop=True)
)
# A (zone, hour) pair missing from one table means no drop-offs were recorded there.
count_cols = ["num_dropoffs", "num_dropoffs_event"]
df_compare[count_cols] = df_compare[count_cols].fillna(0).astype("int64")
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event
0,4,0,32,47
1,4,1,27,37
2,4,2,17,20
3,4,3,8,15
4,4,4,8,13
...,...,...,...,...
1579,263,19,350,434
1580,263,20,343,358
1581,263,21,285,393
1582,263,22,263,278


In [96]:
# Label each hour with a coarse time-of-day block. Intervals are left-closed:
#   [0, 6) Night, [6, 12) Morning, [12, 17) Afternoon, [17, 22) Evening, [22, 24) Night
# "Night" is used for two intervals, so the labels are declared unordered.
bins = [0, 6, 12, 17, 22, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening', 'Night']

time_block = pd.cut(
    df_compare['time'],
    bins=bins,
    labels=labels,
    right=False,
    include_lowest=True,
    ordered=False,
)
df_compare = df_compare.assign(TimeOfDay=time_block)
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay
0,4,0,32,47,Night
1,4,1,27,37,Night
2,4,2,17,20,Night
3,4,3,8,15,Night
4,4,4,8,13,Night
...,...,...,...,...,...
1579,263,19,350,434,Evening
1580,263,20,343,358,Evening
1581,263,21,285,393,Evening
1582,263,22,263,278,Night


In [97]:
# Tag every row with this event's numeric ID (5, the table exported as
# TimesSquare.json), move the ID to the front and renumber the rows.
ordered_cols = ["Event_ID", "DOLocationID", "time", "TimeOfDay", "num_dropoffs", "num_dropoffs_event"]
df_compare = df_compare.assign(Event_ID=5)[ordered_cols].reset_index(drop=True)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay,num_dropoffs,num_dropoffs_event
0,5,4,0,Night,32,47
1,5,4,1,Night,27,37
2,5,4,2,Night,17,20
3,5,4,3,Night,8,15
4,5,4,4,Night,8,13
...,...,...,...,...,...,...
1579,5,263,19,Evening,350,434
1580,5,263,20,Evening,343,358
1581,5,263,21,Evening,285,393
1582,5,263,22,Night,263,278


In [98]:
# One-hot encode TimeOfDay into TimeOfDay_<label> indicator columns.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to bool, which
# would be exported as true/false instead of the 0/1 values the dataset uses.
dummies = pd.get_dummies(df_compare['TimeOfDay'], prefix='TimeOfDay', dtype=int)
# Swap the categorical column for its indicator columns (appended at the end).
df_compare = pd.concat([df_compare.drop(columns='TimeOfDay'), dummies], axis=1)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night
0,5,4,0,32,47,0,0,0,1
1,5,4,1,27,37,0,0,0,1
2,5,4,2,17,20,0,0,0,1
3,5,4,3,8,15,0,0,0,1
4,5,4,4,8,13,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1579,5,263,19,350,434,0,1,0,0
1580,5,263,20,343,358,0,1,0,0
1581,5,263,21,285,393,0,1,0,0
1582,5,263,22,263,278,0,0,0,1


In [99]:
# Final column layout: identifiers, hour, time-of-day indicators, and the
# baseline / event-day counts last.
final_cols = [
    "Event_ID", "DOLocationID", "time",
    "TimeOfDay_Afternoon", "TimeOfDay_Evening", "TimeOfDay_Morning", "TimeOfDay_Night",
    "num_dropoffs", "num_dropoffs_event",
]
df_compare = df_compare.loc[:, final_cols]
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event
0,5,4,0,0,0,0,1,32,47
1,5,4,1,0,0,0,1,27,37
2,5,4,2,0,0,0,1,17,20
3,5,4,3,0,0,0,1,8,15
4,5,4,4,0,0,0,1,8,13
...,...,...,...,...,...,...,...,...,...
1579,5,263,19,0,1,0,0,350,434
1580,5,263,20,0,1,0,0,343,358
1581,5,263,21,0,1,0,0,285,393
1582,5,263,22,0,0,0,1,263,278


In [100]:
# Write the comparison table to disk as a JSON array with one object per row.
json_path = 'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/TimesSquare.json'
df_compare.to_json(json_path, orient='records')

# Lunar New Year Parade

## event day busyness

In [101]:
# Lunar New Year Parade, event day: load the February 2018 yellow-taxi trips and
# keep the Manhattan drop-offs picked up on 2018-02-25.
# NOTE(review): date and hour come from the *pickup* timestamp although rows are
# later counted per drop-off zone -- confirm this is intended.
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-02.parquet")

# Calendar features derived from the pickup timestamp.
pickup_ts = df['tpep_pickup_datetime']
df['date'] = pickup_ts.dt.normalize()
df['month'] = pickup_ts.dt.month
df['time'] = pickup_ts.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every column listed in columns_to_drop,
# then remove rows that still contain missing values.
df = (
    df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)
      .drop(columns=columns_to_drop)
      .dropna()
)

# Row filter: drop-off zone in Manhattan, hour within 0-23, pickup on the event date.
# (A pickup-zone filter on PULocationID existed here but was disabled.)
keep_rows = (
    df['DOLocationID'].isin(manhattan_zones['LocationID'])
    & (df['time'] >= 0)
    & (df['time'] <= 23)
    & (df['date'] == "2018-02-25")
)
df = df[keep_rows]
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
7307338,87,2018-02-25,2,0,Sunday
7310427,237,2018-02-25,2,0,Sunday
7310835,90,2018-02-25,2,0,Sunday
7310923,41,2018-02-25,2,0,Sunday
7310969,79,2018-02-25,2,0,Sunday
...,...,...,...,...,...
7601755,144,2018-02-25,2,23,Sunday
7601761,163,2018-02-25,2,23,Sunday
7602418,158,2018-02-25,2,20,Sunday
7603926,230,2018-02-25,2,23,Sunday


In [102]:
# Count the event-day trips per (drop-off zone, hour): one row per pair that occurs.
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# Split the counts into one DataFrame per zone, keyed by DOLocationID.
# groupby replaces the former row-by-row iterrows() bucketing: zones still come
# out in ascending order and every frame keeps its rows (and index labels) from
# df_agg, but no per-row Python loop is needed and `df` is no longer overwritten.
dfs = {category: zone_counts for category, zone_counts in df_agg.groupby('DOLocationID')}

In [103]:
# Give every zone a row for each of the 24 hours: hours without any recorded
# drop-off are added with num_dropoffs = 0 (a real zero, not a null).
all_hours = set(range(24))
for category in list(dfs):
    zone_counts = dfs[category]
    absent_hours = all_hours - set(zone_counts['time'])
    if not absent_hours:
        continue  # zone already covers the full day; leave it untouched

    zero_rows = pd.DataFrame(
        [{'time': hour, 'DOLocationID': category, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows, then restore chronological order within the zone.
    dfs[category] = pd.concat([zone_counts, zero_rows], ignore_index=True).sort_values('time')

In [104]:
# Stack the per-zone frames into a single event-day table (zone order preserved).
# One pd.concat over all frames replaces growing df_event inside a loop, which
# re-copied the accumulated rows on every iteration and started from an empty
# DataFrame (recent pandas versions warn about empty entries in concat).
zone_frames = list(dfs.values())
# pd.concat rejects an empty list, so keep the old "empty table" result for that case.
df_event = pd.concat(zone_frames, axis=0) if zone_frames else pd.DataFrame()

In [105]:
# Renumber the rows 0..n-1 and mark the count column as the event-day value.
# The counts stay absolute: the MinMaxScaler normalisation is deliberately not applied here.
df_event = (
    df_event
    .reset_index(drop=True)
    .rename(columns={'num_dropoffs': 'num_dropoffs_event'})
)
df_event

Unnamed: 0,DOLocationID,time,num_dropoffs_event
0,4,0,143
1,4,1,137
2,4,2,116
3,4,3,110
4,4,4,86
...,...,...,...
1579,263,19,331
1580,263,20,266
1581,263,21,173
1582,263,22,189


## baseline busyness

In [106]:
# Lunar New Year Parade, baseline: reload the February 2018 trips and keep the
# Manhattan drop-offs picked up on the two Sundays before the event day
# (2018-02-18 and 2018-02-11).
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-02.parquet")

# Calendar features derived from the pickup timestamp.
pickup_ts = df['tpep_pickup_datetime']
df['date'] = pickup_ts.dt.normalize()
df['month'] = pickup_ts.dt.month
df['time'] = pickup_ts.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every column listed in columns_to_drop,
# then remove rows that still contain missing values.
df = (
    df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)
      .drop(columns=columns_to_drop)
      .dropna()
)

# Manhattan drop-offs only (a pickup-zone filter on PULocationID existed here but was disabled).
df = df[df['DOLocationID'].isin(manhattan_zones['LocationID'])]

# One slice per baseline day, stacked with a fresh 0..n-1 index.
valid_hour = (df['time'] >= 0) & (df['time'] <= 23)
df1 = df[valid_hour & (df['date'] == "2018-02-18")]
df2 = df[valid_hour & (df['date'] == "2018-02-11")]
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
0,142,2018-02-18,2,0,Sunday
1,13,2018-02-18,2,23,Sunday
2,151,2018-02-18,2,0,Sunday
3,158,2018-02-18,2,0,Sunday
4,158,2018-02-18,2,0,Sunday
...,...,...,...,...,...
451509,163,2018-02-11,2,23,Sunday
451510,239,2018-02-11,2,23,Sunday
451511,161,2018-02-11,2,23,Sunday
451512,243,2018-02-11,2,23,Sunday


In [107]:
# Count the baseline trips (both baseline days together) per (drop-off zone, hour).
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# Split the counts into one DataFrame per zone, keyed by DOLocationID.
# groupby replaces the former row-by-row iterrows() bucketing: zones still come
# out in ascending order and every frame keeps its rows (and index labels) from
# df_agg, but no per-row Python loop is needed and `df` is no longer overwritten.
dfs = {category: zone_counts for category, zone_counts in df_agg.groupby('DOLocationID')}

In [108]:
# Give every zone a row for each of the 24 hours: hours without any recorded
# drop-off are added with num_dropoffs = 0 (a real zero, not a null).
all_hours = set(range(24))
for category in list(dfs):
    zone_counts = dfs[category]
    absent_hours = all_hours - set(zone_counts['time'])
    if not absent_hours:
        continue  # zone already covers the full day; leave it untouched

    zero_rows = pd.DataFrame(
        [{'time': hour, 'DOLocationID': category, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows, then restore chronological order within the zone.
    dfs[category] = pd.concat([zone_counts, zero_rows], ignore_index=True).sort_values('time')

In [109]:
# Stack the per-zone frames into one baseline table with a fresh 0..n-1 index.
# A single pd.concat(ignore_index=True) replaces the concat-inside-a-loop that
# started from an empty DataFrame (quadratic copying, and recent pandas warns
# about empty entries) as well as the reset_index / drop("index") pair.
df_bl = pd.concat(list(dfs.values()), axis=0, ignore_index=True)

# The counts cover two baseline days; floor-divide by 2 for a per-day average
# (the fraction is dropped so the column stays integer).
df_bl["num_dropoffs"] = df_bl["num_dropoffs"] // 2
df_bl

Unnamed: 0,DOLocationID,time,num_dropoffs
0,4,0,133
1,4,1,125
2,4,2,110
3,4,3,85
4,4,4,50
...,...,...,...
1579,263,19,271
1580,263,20,235
1581,263,21,236
1582,263,22,182


In [110]:
# Put the baseline and event-day counts side by side, matched on (zone, hour).
# Merging on the keys replaces a positional concat, which only lines up when both
# tables hold exactly the same zones in the same order; a zone seen on just one
# of the days would shift or NaN-pad every following row without any error.
df_compare = (
    df_bl.merge(
        df_event[["DOLocationID", "time", "num_dropoffs_event"]],
        on=["DOLocationID", "time"],
        how="outer",
        validate="one_to_one",  # each (zone, hour) may occur only once per table
    )
    .sort_values(["DOLocationID", "time"])
    .reset_index(drop=True)
)
# A (zone, hour) pair missing from one table means no drop-offs were recorded there.
count_cols = ["num_dropoffs", "num_dropoffs_event"]
df_compare[count_cols] = df_compare[count_cols].fillna(0).astype("int64")
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event
0,4,0,133,143
1,4,1,125,137
2,4,2,110,116
3,4,3,85,110
4,4,4,50,86
...,...,...,...,...
1579,263,19,271,331
1580,263,20,235,266
1581,263,21,236,173
1582,263,22,182,189


In [111]:
# Label each hour with a coarse time-of-day block. Intervals are left-closed:
#   [0, 6) Night, [6, 12) Morning, [12, 17) Afternoon, [17, 22) Evening, [22, 24) Night
# "Night" is used for two intervals, so the labels are declared unordered.
bins = [0, 6, 12, 17, 22, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening', 'Night']

time_block = pd.cut(
    df_compare['time'],
    bins=bins,
    labels=labels,
    right=False,
    include_lowest=True,
    ordered=False,
)
df_compare = df_compare.assign(TimeOfDay=time_block)
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay
0,4,0,133,143,Night
1,4,1,125,137,Night
2,4,2,110,116,Night
3,4,3,85,110,Night
4,4,4,50,86,Night
...,...,...,...,...,...
1579,263,19,271,331,Evening
1580,263,20,235,266,Evening
1581,263,21,236,173,Evening
1582,263,22,182,189,Night


In [112]:
# Tag every row with this event's numeric ID (6, the table exported as
# LunarNewYear.json), move the ID to the front and renumber the rows.
ordered_cols = ["Event_ID", "DOLocationID", "time", "TimeOfDay", "num_dropoffs", "num_dropoffs_event"]
df_compare = df_compare.assign(Event_ID=6)[ordered_cols].reset_index(drop=True)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay,num_dropoffs,num_dropoffs_event
0,6,4,0,Night,133,143
1,6,4,1,Night,125,137
2,6,4,2,Night,110,116
3,6,4,3,Night,85,110
4,6,4,4,Night,50,86
...,...,...,...,...,...,...
1579,6,263,19,Evening,271,331
1580,6,263,20,Evening,235,266
1581,6,263,21,Evening,236,173
1582,6,263,22,Night,182,189


In [113]:
# One-hot encode TimeOfDay into TimeOfDay_<label> indicator columns.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to bool, which
# would be exported as true/false instead of the 0/1 values the dataset uses.
dummies = pd.get_dummies(df_compare['TimeOfDay'], prefix='TimeOfDay', dtype=int)
# Swap the categorical column for its indicator columns (appended at the end).
df_compare = pd.concat([df_compare.drop(columns='TimeOfDay'), dummies], axis=1)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night
0,6,4,0,133,143,0,0,0,1
1,6,4,1,125,137,0,0,0,1
2,6,4,2,110,116,0,0,0,1
3,6,4,3,85,110,0,0,0,1
4,6,4,4,50,86,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1579,6,263,19,271,331,0,1,0,0
1580,6,263,20,235,266,0,1,0,0
1581,6,263,21,236,173,0,1,0,0
1582,6,263,22,182,189,0,0,0,1


In [114]:
# Final column layout: identifiers, hour, time-of-day indicators, and the
# baseline / event-day counts last.
final_cols = [
    "Event_ID", "DOLocationID", "time",
    "TimeOfDay_Afternoon", "TimeOfDay_Evening", "TimeOfDay_Morning", "TimeOfDay_Night",
    "num_dropoffs", "num_dropoffs_event",
]
df_compare = df_compare.loc[:, final_cols]
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event
0,6,4,0,0,0,0,1,133,143
1,6,4,1,0,0,0,1,125,137
2,6,4,2,0,0,0,1,110,116
3,6,4,3,0,0,0,1,85,110
4,6,4,4,0,0,0,1,50,86
...,...,...,...,...,...,...,...,...,...
1579,6,263,19,0,1,0,0,271,331
1580,6,263,20,0,1,0,0,235,266
1581,6,263,21,0,1,0,0,236,173
1582,6,263,22,0,0,0,1,182,189


In [115]:
# Write the comparison table to disk as a JSON array with one object per row.
json_path = 'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/LunarNewYear.json'
df_compare.to_json(json_path, orient='records')

# Three Kings Day Parade

## event day busyness

In [116]:
# Three Kings Day Parade, event day: load the January 2018 yellow-taxi trips and
# keep the Manhattan drop-offs picked up on 2018-01-05.
# NOTE(review): date and hour come from the *pickup* timestamp although rows are
# later counted per drop-off zone -- confirm this is intended.
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-01.parquet")

# Calendar features derived from the pickup timestamp.
pickup_ts = df['tpep_pickup_datetime']
df['date'] = pickup_ts.dt.normalize()
df['month'] = pickup_ts.dt.month
df['time'] = pickup_ts.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every column listed in columns_to_drop,
# then remove rows that still contain missing values.
df = (
    df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)
      .drop(columns=columns_to_drop)
      .dropna()
)

# Row filter: drop-off zone in Manhattan, hour within 0-23, pickup on the event date.
# (A pickup-zone filter on PULocationID existed here but was disabled.)
keep_rows = (
    df['DOLocationID'].isin(manhattan_zones['LocationID'])
    & (df['time'] >= 0)
    & (df['time'] <= 23)
    & (df['date'] == "2018-01-05")
)
df = df[keep_rows]
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
859831,224,2018-01-05,1,0,Friday
859885,50,2018-01-05,1,0,Friday
860029,161,2018-01-05,1,0,Friday
860303,4,2018-01-05,1,0,Friday
860778,158,2018-01-05,1,0,Friday
...,...,...,...,...,...
1156426,107,2018-01-05,1,22,Friday
1159228,13,2018-01-05,1,23,Friday
1160958,161,2018-01-05,1,7,Friday
1160959,90,2018-01-05,1,7,Friday


In [117]:
# Count the event-day trips per (drop-off zone, hour): one row per pair that occurs.
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# Split the counts into one DataFrame per zone, keyed by DOLocationID.
# groupby replaces the former row-by-row iterrows() bucketing: zones still come
# out in ascending order and every frame keeps its rows (and index labels) from
# df_agg, but no per-row Python loop is needed and `df` is no longer overwritten.
dfs = {category: zone_counts for category, zone_counts in df_agg.groupby('DOLocationID')}

In [118]:
# Give every zone a row for each of the 24 hours: hours without any recorded
# drop-off are added with num_dropoffs = 0 (a real zero, not a null).
all_hours = set(range(24))
for category in list(dfs):
    zone_counts = dfs[category]
    absent_hours = all_hours - set(zone_counts['time'])
    if not absent_hours:
        continue  # zone already covers the full day; leave it untouched

    zero_rows = pd.DataFrame(
        [{'time': hour, 'DOLocationID': category, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows, then restore chronological order within the zone.
    dfs[category] = pd.concat([zone_counts, zero_rows], ignore_index=True).sort_values('time')

In [119]:
# Stack the per-zone frames into a single event-day table (zone order preserved).
# One pd.concat over all frames replaces growing df_event inside a loop, which
# re-copied the accumulated rows on every iteration and started from an empty
# DataFrame (recent pandas versions warn about empty entries in concat).
zone_frames = list(dfs.values())
# pd.concat rejects an empty list, so keep the old "empty table" result for that case.
df_event = pd.concat(zone_frames, axis=0) if zone_frames else pd.DataFrame()

In [120]:
# Renumber the rows 0..n-1 and mark the count column as the event-day value.
# The counts stay absolute: the MinMaxScaler normalisation is deliberately not applied here.
df_event = (
    df_event
    .reset_index(drop=True)
    .rename(columns={'num_dropoffs': 'num_dropoffs_event'})
)
df_event

Unnamed: 0,DOLocationID,time,num_dropoffs_event
0,4,0,28
1,4,1,18
2,4,2,11
3,4,3,11
4,4,4,10
...,...,...,...
1579,263,19,505
1580,263,20,430
1581,263,21,328
1582,263,22,388


## baseline busyness

In [121]:
# Three Kings Day Parade, baseline: reload the January 2018 trips and keep the
# Manhattan drop-offs picked up on the two Fridays *after* the event day
# (2018-01-12 and 2018-01-19).
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-01.parquet")

# Calendar features derived from the pickup timestamp.
pickup_ts = df['tpep_pickup_datetime']
df['date'] = pickup_ts.dt.normalize()
df['month'] = pickup_ts.dt.month
df['time'] = pickup_ts.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and every column listed in columns_to_drop,
# then remove rows that still contain missing values.
df = (
    df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)
      .drop(columns=columns_to_drop)
      .dropna()
)

# Manhattan drop-offs only (a pickup-zone filter on PULocationID existed here but was disabled).
df = df[df['DOLocationID'].isin(manhattan_zones['LocationID'])]

# One slice per baseline day, stacked with a fresh 0..n-1 index.
valid_hour = (df['time'] >= 0) & (df['time'] <= 23)
df1 = df[valid_hour & (df['date'] == "2018-01-12")]
df2 = df[valid_hour & (df['date'] == "2018-01-19")]
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
0,161,2018-01-12,1,6,Friday
1,162,2018-01-12,1,6,Friday
2,162,2018-01-12,1,6,Friday
3,229,2018-01-12,1,6,Friday
4,142,2018-01-12,1,6,Friday
...,...,...,...,...,...
572367,162,2018-01-19,1,22,Friday
572368,42,2018-01-19,1,17,Friday
572369,116,2018-01-19,1,18,Friday
572370,41,2018-01-19,1,19,Friday


In [122]:
# Count the baseline trips (both baseline days together) per (drop-off zone, hour).
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# Split the counts into one DataFrame per zone, keyed by DOLocationID.
# groupby replaces the former row-by-row iterrows() bucketing: zones still come
# out in ascending order and every frame keeps its rows (and index labels) from
# df_agg, but no per-row Python loop is needed and `df` is no longer overwritten.
dfs = {category: zone_counts for category, zone_counts in df_agg.groupby('DOLocationID')}

In [123]:
# Give every zone a row for each of the 24 hours: hours without any recorded
# drop-off are added with num_dropoffs = 0 (a real zero, not a null).
all_hours = set(range(24))
for category in list(dfs):
    zone_counts = dfs[category]
    absent_hours = all_hours - set(zone_counts['time'])
    if not absent_hours:
        continue  # zone already covers the full day; leave it untouched

    zero_rows = pd.DataFrame(
        [{'time': hour, 'DOLocationID': category, "num_dropoffs": 0} for hour in absent_hours]
    )
    # Append the zero rows, then restore chronological order within the zone.
    dfs[category] = pd.concat([zone_counts, zero_rows], ignore_index=True).sort_values('time')

In [124]:
# Stack the per-zone frames into one baseline table with a fresh 0..n-1 index.
# A single pd.concat(ignore_index=True) replaces the concat-inside-a-loop that
# started from an empty DataFrame (quadratic copying, and recent pandas warns
# about empty entries) as well as the reset_index / drop("index") pair.
df_bl = pd.concat(list(dfs.values()), axis=0, ignore_index=True)

# The counts cover two baseline days; floor-divide by 2 for a per-day average
# (the fraction is dropped so the column stays integer).
df_bl["num_dropoffs"] = df_bl["num_dropoffs"] // 2
df_bl

Unnamed: 0,DOLocationID,time,num_dropoffs
0,4,0,70
1,4,1,48
2,4,2,30
3,4,3,24
4,4,4,16
...,...,...,...
1579,263,19,545
1580,263,20,472
1581,263,21,400
1582,263,22,383


In [125]:
# Put the baseline and event-day counts side by side, matched on (zone, hour).
# Merging on the keys replaces a positional concat, which only lines up when both
# tables hold exactly the same zones in the same order; a zone seen on just one
# of the days would shift or NaN-pad every following row without any error.
df_compare = (
    df_bl.merge(
        df_event[["DOLocationID", "time", "num_dropoffs_event"]],
        on=["DOLocationID", "time"],
        how="outer",
        validate="one_to_one",  # each (zone, hour) may occur only once per table
    )
    .sort_values(["DOLocationID", "time"])
    .reset_index(drop=True)
)
# A (zone, hour) pair missing from one table means no drop-offs were recorded there.
count_cols = ["num_dropoffs", "num_dropoffs_event"]
df_compare[count_cols] = df_compare[count_cols].fillna(0).astype("int64")
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event
0,4,0,70,28
1,4,1,48,18
2,4,2,30,11
3,4,3,24,11
4,4,4,16,10
...,...,...,...,...
1579,263,19,545,505
1580,263,20,472,430
1581,263,21,400,328
1582,263,22,383,388


In [126]:
# Label each hour with a coarse time-of-day block. Intervals are left-closed:
#   [0, 6) Night, [6, 12) Morning, [12, 17) Afternoon, [17, 22) Evening, [22, 24) Night
# "Night" is used for two intervals, so the labels are declared unordered.
bins = [0, 6, 12, 17, 22, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening', 'Night']

time_block = pd.cut(
    df_compare['time'],
    bins=bins,
    labels=labels,
    right=False,
    include_lowest=True,
    ordered=False,
)
df_compare = df_compare.assign(TimeOfDay=time_block)
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay
0,4,0,70,28,Night
1,4,1,48,18,Night
2,4,2,30,11,Night
3,4,3,24,11,Night
4,4,4,16,10,Night
...,...,...,...,...,...
1579,263,19,545,505,Evening
1580,263,20,472,430,Evening
1581,263,21,400,328,Evening
1582,263,22,383,388,Night


In [127]:
# Tag every row with this event's numeric ID (7, the table exported as
# ThreeKingsDay.json), move the ID to the front and renumber the rows.
ordered_cols = ["Event_ID", "DOLocationID", "time", "TimeOfDay", "num_dropoffs", "num_dropoffs_event"]
df_compare = df_compare.assign(Event_ID=7)[ordered_cols].reset_index(drop=True)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay,num_dropoffs,num_dropoffs_event
0,7,4,0,Night,70,28
1,7,4,1,Night,48,18
2,7,4,2,Night,30,11
3,7,4,3,Night,24,11
4,7,4,4,Night,16,10
...,...,...,...,...,...,...
1579,7,263,19,Evening,545,505
1580,7,263,20,Evening,472,430
1581,7,263,21,Evening,400,328
1582,7,263,22,Night,383,388


In [128]:
# One-hot encode the time-of-day label.
# dtype is pinned so the indicator columns are 0/1 integers on every pandas
# version (pandas 2.x would otherwise emit booleans, changing the saved JSON).
dummies = pd.get_dummies(df_compare['TimeOfDay'], prefix='TimeOfDay', dtype="uint8")
df_compare = pd.concat([df_compare, dummies], axis=1)
# The label column is now redundant with the indicator columns
df_compare = df_compare.drop(columns='TimeOfDay')
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night
0,7,4,0,70,28,0,0,0,1
1,7,4,1,48,18,0,0,0,1
2,7,4,2,30,11,0,0,0,1
3,7,4,3,24,11,0,0,0,1
4,7,4,4,16,10,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1579,7,263,19,545,505,0,1,0,0
1580,7,263,20,472,430,0,1,0,0
1581,7,263,21,400,328,0,1,0,0
1582,7,263,22,383,388,0,0,0,1


In [129]:
# Put the identifiers first, then the time-of-day indicators, then both counts
ordered_columns = [
    "Event_ID",
    "DOLocationID",
    "time",
    "TimeOfDay_Afternoon",
    "TimeOfDay_Evening",
    "TimeOfDay_Morning",
    "TimeOfDay_Night",
    "num_dropoffs",
    "num_dropoffs_event",
]
df_compare = df_compare.loc[:, ordered_columns]
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event
0,7,4,0,0,0,0,1,70,28
1,7,4,1,0,0,0,1,48,18
2,7,4,2,0,0,0,1,30,11
3,7,4,3,0,0,0,1,24,11
4,7,4,4,0,0,0,1,16,10
...,...,...,...,...,...,...,...,...,...
1579,7,263,19,0,1,0,0,545,505
1580,7,263,20,0,1,0,0,472,430
1581,7,263,21,0,1,0,0,400,328
1582,7,263,22,0,0,0,1,383,388


In [130]:
# Save the Three Kings Day comparison table as a JSON list with one object per row
df_compare.to_json(
    'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/ThreeKingsDay.json',
    orient='records',
)

# Macy's 4th of July Fireworks

## event day busyness

In [131]:
# Yellow-taxi trips for July 2018
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-07.parquet")

# All calendar fields are derived from the pickup timestamp
pickup_ts = df['tpep_pickup_datetime']
df['date'] = pickup_ts.dt.normalize()
df['month'] = pickup_ts.dt.month
df['time'] = pickup_ts.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and everything listed in columns_to_drop, then incomplete rows
df = (
    df.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])
    .drop(columns=columns_to_drop)
    .dropna()
)

# Keep trips that end in a Manhattan zone and whose pickup falls on 4 July 2018
ends_in_manhattan = df['DOLocationID'].isin(manhattan_zones['LocationID'])
on_event_day = df['time'].between(0, 23) & (df['date'] == "2018-07-04")
df = df[ends_in_manhattan & on_event_day]
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
714412,163,2018-07-04,7,0,Wednesday
714461,113,2018-07-04,7,0,Wednesday
714789,163,2018-07-04,7,0,Wednesday
715120,13,2018-07-04,7,0,Wednesday
715311,141,2018-07-04,7,0,Wednesday
...,...,...,...,...,...
7850613,79,2018-07-04,7,0,Wednesday
7850615,262,2018-07-04,7,12,Wednesday
7850616,74,2018-07-04,7,13,Wednesday
7850618,233,2018-07-04,7,18,Wednesday


In [132]:
# Count drop-offs for every (zone, hour) pair
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# Split the counts into one DataFrame per zone, keyed by DOLocationID.
# groupby partitions the table directly, replacing the row-by-row iterrows loop
# that collected rows into lists and rebuilt a frame for each zone.
dfs = {category: zone_rows for category, zone_rows in df_agg.groupby('DOLocationID')}

In [133]:
# Make sure every zone has a row for each of the 24 hours of the day
all_hours = set(range(24))
for category, zone_df in dfs.items():
    absent_hours = all_hours - set(zone_df['time'])
    if not absent_hours:
        continue  # this zone already covers the full day

    # Hours with no recorded trips get an explicit count of 0
    filler = pd.DataFrame(
        [{'time': hour, 'DOLocationID': category, "num_dropoffs": 0} for hour in absent_hours]
    )
    dfs[category] = pd.concat([zone_df, filler], ignore_index=True).sort_values('time')

In [134]:
# Stack the per-zone event-day frames into one table.
# A single pd.concat avoids re-copying the accumulated rows on every loop
# iteration; an empty dict still yields an empty frame, as before.
df_event = pd.concat(list(dfs.values()), axis=0) if dfs else pd.DataFrame()

In [135]:
# Renumber the rows and label the count column as the event-day figure.
# The count is kept as an absolute number; no min-max scaling is applied.
df_event = (
    df_event
    .reset_index(drop=True)
    .rename(columns={'num_dropoffs': 'num_dropoffs_event'})
)
df_event

Unnamed: 0,DOLocationID,time,num_dropoffs_event
0,4,0,76
1,4,1,53
2,4,2,41
3,4,3,35
4,4,4,25
...,...,...,...
1579,263,19,229
1580,263,20,187
1581,263,21,165
1582,263,22,306


## baseline busyness

In [136]:
# Reload the July 2018 yellow-taxi trips to build the non-event baseline
df = pd.read_parquet("C:/Users/user/Desktop/Summer Project/taxi/yellow_tripdata_2018-07.parquet")

# All calendar fields are derived from the pickup timestamp
pickup_ts = df['tpep_pickup_datetime']
df['date'] = pickup_ts.dt.normalize()
df['month'] = pickup_ts.dt.month
df['time'] = pickup_ts.dt.hour
df['day_of_the_week'] = df['date'].dt.day_name()

# Discard the raw timestamps and everything listed in columns_to_drop, then incomplete rows
df = (
    df.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])
    .drop(columns=columns_to_drop)
    .dropna()
)

# Restrict to trips that end in a Manhattan zone
df = df[df['DOLocationID'].isin(manhattan_zones['LocationID'])]

# Baseline days are 11 and 18 July 2018; rows for the first date come before the second
valid_hour = df['time'].between(0, 23)
baseline_days = [df[valid_hour & (df['date'] == day)] for day in ("2018-07-11", "2018-07-18")]
df = pd.concat(baseline_days, ignore_index=True)
df

Unnamed: 0,DOLocationID,date,month,time,day_of_the_week
0,79,2018-07-11,7,3,Wednesday
1,88,2018-07-11,7,5,Wednesday
2,13,2018-07-11,7,5,Wednesday
3,163,2018-07-11,7,6,Wednesday
4,163,2018-07-11,7,6,Wednesday
...,...,...,...,...,...
513349,234,2018-07-18,7,22,Wednesday
513350,161,2018-07-18,7,21,Wednesday
513351,186,2018-07-18,7,22,Wednesday
513352,141,2018-07-18,7,23,Wednesday


In [137]:
# Count drop-offs for every (zone, hour) pair across the baseline days
df_agg = df.groupby(['DOLocationID', 'time']).size().reset_index(name='num_dropoffs')

# Split the counts into one DataFrame per zone, keyed by DOLocationID.
# groupby partitions the table directly, replacing the row-by-row iterrows loop
# that collected rows into lists and rebuilt a frame for each zone.
dfs = {category: zone_rows for category, zone_rows in df_agg.groupby('DOLocationID')}

In [138]:
# Make sure every zone has a row for each of the 24 hours of the day
all_hours = set(range(24))
for category, zone_df in dfs.items():
    absent_hours = all_hours - set(zone_df['time'])
    if not absent_hours:
        continue  # this zone already covers the full day

    # Hours with no recorded trips get an explicit count of 0
    filler = pd.DataFrame(
        [{'time': hour, 'DOLocationID': category, "num_dropoffs": 0} for hour in absent_hours]
    )
    dfs[category] = pd.concat([zone_df, filler], ignore_index=True).sort_values('time')

In [139]:
# Stack the per-zone baseline frames into a single table with a fresh 0..n-1 index.
# One pd.concat over all frames avoids re-copying the accumulated rows on every
# loop iteration, which is what growing a DataFrame inside a for-loop does.
df_bl = pd.concat(list(dfs.values()), axis=0, ignore_index=True)
# The counts were summed over the two baseline days (2018-07-11 and 2018-07-18),
# so floor-divide by 2 to get a per-day baseline.
df_bl["num_dropoffs"] = df_bl["num_dropoffs"] // 2
df_bl

Unnamed: 0,DOLocationID,time,num_dropoffs
0,4,0,46
1,4,1,26
2,4,2,22
3,4,3,15
4,4,4,3
...,...,...,...
1579,263,19,458
1580,263,20,423
1581,263,21,455
1582,263,22,407


In [140]:
# Pair each baseline row with the event-day count for the same zone and hour.
# Joining on the keys (instead of pasting the columns side by side by row position)
# keeps the pairing correct even when one table lacks a zone the other has.
df_compare = (
    df_bl.merge(
        df_event[["DOLocationID", "time", "num_dropoffs_event"]],
        on=["DOLocationID", "time"],
        how="outer",
        validate="one_to_one",  # each (zone, hour) must appear at most once per side
    )
    # A zone/hour present on only one side had no recorded drop-offs on the other
    .fillna({"num_dropoffs": 0, "num_dropoffs_event": 0})
    .astype({"num_dropoffs": "int64", "num_dropoffs_event": "int64"})
    .sort_values(["DOLocationID", "time"])
    .reset_index(drop=True)
)
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event
0,4,0,46,76
1,4,1,26,53
2,4,2,22,41
3,4,3,15,35
4,4,4,3,25
...,...,...,...,...
1579,263,19,458,229
1580,263,20,423,187
1581,263,21,455,165
1582,263,22,407,306


In [141]:
# Hour-of-day boundaries. Intervals are closed on the left, so the mapping is:
# 0-5 Night, 6-11 Morning, 12-16 Afternoon, 17-21 Evening, 22-23 Night
bins = [0, 6, 12, 17, 22, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening', 'Night']

# 'Night' is used for two intervals, which pd.cut only allows for unordered categories
time_of_day = pd.cut(
    df_compare['time'],
    bins=bins,
    labels=labels,
    right=False,
    include_lowest=True,
    ordered=False,
)
df_compare['TimeOfDay'] = time_of_day
df_compare

Unnamed: 0,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay
0,4,0,46,76,Night
1,4,1,26,53,Night
2,4,2,22,41,Night
3,4,3,15,35,Night
4,4,4,3,25,Night
...,...,...,...,...,...
1579,263,19,458,229,Evening
1580,263,20,423,187,Evening
1581,263,21,455,165,Evening
1582,263,22,407,306,Night


In [142]:
# Tag every row with this event's identifier (8) and move the id to the front
event_columns = ["Event_ID", "DOLocationID", "time", "TimeOfDay", "num_dropoffs", "num_dropoffs_event"]
df_compare = df_compare.assign(Event_ID=8)[event_columns].reset_index(drop=True)
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay,num_dropoffs,num_dropoffs_event
0,8,4,0,Night,46,76
1,8,4,1,Night,26,53
2,8,4,2,Night,22,41
3,8,4,3,Night,15,35
4,8,4,4,Night,3,25
...,...,...,...,...,...,...
1579,8,263,19,Evening,458,229
1580,8,263,20,Evening,423,187
1581,8,263,21,Evening,455,165
1582,8,263,22,Night,407,306


In [143]:
# One-hot encode the time-of-day label.
# dtype is pinned so the indicator columns are 0/1 integers on every pandas
# version (pandas 2.x would otherwise emit booleans, changing the saved JSON).
dummies = pd.get_dummies(df_compare['TimeOfDay'], prefix='TimeOfDay', dtype="uint8")
df_compare = pd.concat([df_compare, dummies], axis=1)
# The label column is now redundant with the indicator columns
df_compare = df_compare.drop(columns='TimeOfDay')
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,num_dropoffs,num_dropoffs_event,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night
0,8,4,0,46,76,0,0,0,1
1,8,4,1,26,53,0,0,0,1
2,8,4,2,22,41,0,0,0,1
3,8,4,3,15,35,0,0,0,1
4,8,4,4,3,25,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1579,8,263,19,458,229,0,1,0,0
1580,8,263,20,423,187,0,1,0,0
1581,8,263,21,455,165,0,1,0,0
1582,8,263,22,407,306,0,0,0,1


In [144]:
# Put the identifiers first, then the time-of-day indicators, then both counts
ordered_columns = [
    "Event_ID",
    "DOLocationID",
    "time",
    "TimeOfDay_Afternoon",
    "TimeOfDay_Evening",
    "TimeOfDay_Morning",
    "TimeOfDay_Night",
    "num_dropoffs",
    "num_dropoffs_event",
]
df_compare = df_compare.loc[:, ordered_columns]
df_compare

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event
0,8,4,0,0,0,0,1,46,76
1,8,4,1,0,0,0,1,26,53
2,8,4,2,0,0,0,1,22,41
3,8,4,3,0,0,0,1,15,35
4,8,4,4,0,0,0,1,3,25
...,...,...,...,...,...,...,...,...,...
1579,8,263,19,0,1,0,0,458,229
1580,8,263,20,0,1,0,0,423,187
1581,8,263,21,0,1,0,0,455,165
1582,8,263,22,0,0,0,1,407,306


In [145]:
# Save the 4th of July comparison table as a JSON list with one object per row
df_compare.to_json(
    'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/4thOfJulyFireworks.json',
    orient='records',
)

# Models Training and Testing

The data from all events is concatenated, and one-hot encoding is applied to the feature <b>Event_ID</b>, so that a single master model is enough to predict the busyness of all 8 events.

In [147]:
# Load the saved per-event comparison tables, in Event_ID order (1..8)
df1, df2, df3, df4, df5, df6, df7, df8 = (
    pd.read_json(event_path, orient='records')
    for event_path in (
        'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/Thanksgiving.json',
        'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/Halloween.json',
        'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/SaintPatricks.json',
        'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/PrideMarch.json',
        'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/TimesSquare.json',
        'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/LunarNewYear.json',
        'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/ThreeKingsDay.json',
        'C:/Users/user/Desktop/Summer Project/EventPrediction/Data/4thOfJulyFireworks.json',
    )
)

In [148]:
# Stack the eight event tables; each keeps its own row labels until the index is reset later
event_tables = [df1, df2, df3, df4, df5, df6, df7, df8]
df = pd.concat(event_tables)
df

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event
0,1,4,0,0,0,0,1,47,55
1,1,4,1,0,0,0,1,29,60
2,1,4,2,0,0,0,1,21,23
3,1,4,3,0,0,0,1,16,36
4,1,4,4,0,0,0,1,8,22
...,...,...,...,...,...,...,...,...,...
1579,8,263,19,0,1,0,0,458,229
1580,8,263,20,0,1,0,0,423,187
1581,8,263,21,0,1,0,0,455,165
1582,8,263,22,0,0,0,1,407,306


In [150]:
# One-hot encode the event identifier so one model can cover every event.
# dtype is pinned so the indicator columns are 0/1 integers on every pandas
# version (pandas 2.x would otherwise emit booleans).
dummies = pd.get_dummies(df['Event_ID'], prefix='Event_ID', dtype="uint8")
df = pd.concat([df, dummies], axis=1)
df

Unnamed: 0,Event_ID,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event,Event_ID_1,Event_ID_2,Event_ID_3,Event_ID_4,Event_ID_5,Event_ID_6,Event_ID_7,Event_ID_8
0,1,4,0,0,0,0,1,47,55,1,0,0,0,0,0,0,0
1,1,4,1,0,0,0,1,29,60,1,0,0,0,0,0,0,0
2,1,4,2,0,0,0,1,21,23,1,0,0,0,0,0,0,0
3,1,4,3,0,0,0,1,16,36,1,0,0,0,0,0,0,0
4,1,4,4,0,0,0,1,8,22,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1579,8,263,19,0,1,0,0,458,229,0,0,0,0,0,0,0,1
1580,8,263,20,0,1,0,0,423,187,0,0,0,0,0,0,0,1
1581,8,263,21,0,1,0,0,455,165,0,0,0,0,0,0,0,1
1582,8,263,22,0,0,0,1,407,306,0,0,0,0,0,0,0,1


In [151]:
# The raw id is replaced by its indicator columns
df = df.drop(columns='Event_ID')

# Final layout: event indicators, zone, hour, time-of-day indicators, baseline count, target
model_columns = [
    "Event_ID_1", "Event_ID_2", "Event_ID_3", "Event_ID_4",
    "Event_ID_5", "Event_ID_6", "Event_ID_7", "Event_ID_8",
    "DOLocationID", "time",
    "TimeOfDay_Afternoon", "TimeOfDay_Evening", "TimeOfDay_Morning", "TimeOfDay_Night",
    "num_dropoffs", "num_dropoffs_event",
]
df = df.loc[:, model_columns]
df

Unnamed: 0,Event_ID_1,Event_ID_2,Event_ID_3,Event_ID_4,Event_ID_5,Event_ID_6,Event_ID_7,Event_ID_8,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event
0,1,0,0,0,0,0,0,0,4,0,0,0,0,1,47,55
1,1,0,0,0,0,0,0,0,4,1,0,0,0,1,29,60
2,1,0,0,0,0,0,0,0,4,2,0,0,0,1,21,23
3,1,0,0,0,0,0,0,0,4,3,0,0,0,1,16,36
4,1,0,0,0,0,0,0,0,4,4,0,0,0,1,8,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1579,0,0,0,0,0,0,0,1,263,19,0,1,0,0,458,229
1580,0,0,0,0,0,0,0,1,263,20,0,1,0,0,423,187
1581,0,0,0,0,0,0,0,1,263,21,0,1,0,0,455,165
1582,0,0,0,0,0,0,0,1,263,22,0,0,0,1,407,306


In [152]:
# Replace the repeated per-event row labels with one continuous 0..n-1 index
df = df.reset_index(drop=True)
df

Unnamed: 0,Event_ID_1,Event_ID_2,Event_ID_3,Event_ID_4,Event_ID_5,Event_ID_6,Event_ID_7,Event_ID_8,DOLocationID,time,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,num_dropoffs,num_dropoffs_event
0,1,0,0,0,0,0,0,0,4,0,0,0,0,1,47,55
1,1,0,0,0,0,0,0,0,4,1,0,0,0,1,29,60
2,1,0,0,0,0,0,0,0,4,2,0,0,0,1,21,23
3,1,0,0,0,0,0,0,0,4,3,0,0,0,1,16,36
4,1,0,0,0,0,0,0,0,4,4,0,0,0,1,8,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12667,0,0,0,0,0,0,0,1,263,19,0,1,0,0,458,229
12668,0,0,0,0,0,0,0,1,263,20,0,1,0,0,423,187
12669,0,0,0,0,0,0,0,1,263,21,0,1,0,0,455,165
12670,0,0,0,0,0,0,0,1,263,22,0,0,0,1,407,306


In [153]:
# The event-day count is the target; every other column is a model input
target_name = 'num_dropoffs_event'
y = df[target_name]
X = df.drop(columns=target_name)

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

In [154]:
# Gradient-boosted regressor: squared-error objective, RMSE as the evaluation metric
xg_reg = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
)
xg_reg.fit(X_train, y_train)

# Predictions for the held-out rows
preds = xg_reg.predict(X_test)

In [155]:
# Score the held-out predictions once, then report them
mae = metrics.mean_absolute_error(y_test, preds)
mse = metrics.mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_test, preds)

print("\n==================== Test Data =======================")
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('R^2:', r2)
print("\n=======================================================")


Mean Absolute Error: 18.563544977156017
Mean Squared Error: 941.847532864926
Root Mean Squared Error: 30.68953458208394
R^2: 0.9517755381483953



In [156]:
# Line up each held-out actual count with the model's prediction for the same row
preds = xg_reg.predict(X_test)
true_vs_predicted_test = y_test.to_frame('Actual Value').assign(**{'Predicted Value': preds})
true_vs_predicted_test

Unnamed: 0,Actual Value,Predicted Value
9913,25,19.880135
11600,7,9.036773
7287,261,305.819427
11057,315,253.111008
7412,1,1.690426
...,...,...
1667,60,53.902935
9128,162,158.262054
10152,50,47.606220
12514,6,13.941690


In [157]:
# Serialize the fitted regressor to disk
model_path = 'C:/Users/user/Desktop/Summer Project/EventPrediction/EventPrediction.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(xg_reg, model_file)