## Final Data Creation

### Import Modules

In [1]:
#Import modules
import geographic as geo
import numpy as np
import pandas as pd

### Stations Data

In [2]:
#Load the station table from its CSV file
stations = pd.read_csv(filepath_or_buffer="../../Data/all_stations.csv")

In [3]:
#Remove the "Unnamed: 0" column from the station table
stations = stations.drop(columns=["Unnamed: 0"])

In [4]:
#Pass the stations' latitude/longitude columns to geo.addZip to add zip code info
lat, long = stations["latitude"], stations["longitude"]
stations = geo.addZip(stations, lat, long)

In [6]:
#Impute the station zip codes that are still missing
#NOTE(review): the meaning of the second argument (5) is defined by
#geo.imputeStationsZip and is not visible here - confirm against that helper
stations = geo.imputeStationsZip(
    stations,
    5,
)

### Landmarks Data

In [8]:
#Load the landmark table from its CSV file
landmarks = pd.read_csv(filepath_or_buffer="../../Data/landmarks.csv")

In [9]:
#Pass the landmarks' LATITUDE/LONGITUDE columns to geo.addZip to add zip code info
lat, long = landmarks["LATITUDE"], landmarks["LONGITUDE"]
landmarks = geo.addZip(landmarks, lat, long)

In [10]:
#Impute the landmark zip code that is still missing
landmarks = geo.imputeLandmarksZip(
    landmarks,
)

In [20]:
#Collapse the landmark table to one row per zip code for merging:
#the "landmarks" column holds the number of rows seen for that zip code
zip_groups = landmarks.groupby("zip_code")
landmark_counts = zip_groups.agg(landmarks=("zip_code", "count"))
landmarks = landmark_counts.reset_index()

### Weather Data

In [47]:
#Read weather data, parsing the "time" column as datetimes while loading.
#parse_dates = True only asks pandas to try parsing the index; with no
#index_col given that does nothing, so the column is named explicitly.
weather = pd.read_csv("../../Data/weather.csv", parse_dates = ["time"])

In [48]:
#Make sure the weather "time" column holds datetimes
time_as_datetime = pd.to_datetime(weather["time"])
weather["time"] = time_as_datetime

In [49]:
#Map the weather column headers (which carry units) to short names
weather_column_names = {
    "temperature_2m (°C)": "temp",
    "relativehumidity_2m (%)": "rel_humidity",
    "dewpoint_2m (°C)": "dewpoint",
    "apparent_temperature (°C)": "apparent_temp",
    "precipitation (mm)": "precip",
    "rain (mm)": "rain",
    "snowfall (cm)": "snow",
    "cloudcover (%)": "cloudcover",
    "windspeed_10m (km/h)": "windspeed",
}
weather = weather.rename(columns=weather_column_names)

In [51]:
#Restrict the weather table to the timestamp plus the renamed measurements
weather_columns = [
    "time",
    "temp",
    "rel_humidity",
    "dewpoint",
    "apparent_temp",
    "precip",
    "rain",
    "snow",
    "cloudcover",
    "windspeed",
]
weather = weather.loc[:, weather_columns]

### Trips Data

In [121]:
#Read trips data.
#Only start_time and from_station_id are used by the rest of this notebook,
#so load just those two columns instead of holding the full file in memory.
trips = pd.read_csv("../../Data/trips_full.csv",
                    usecols = ["start_time", "from_station_id"],
                    low_memory = False)

In [122]:
#Parse the trips' start_time values into datetimes
start_as_datetime = pd.to_datetime(trips["start_time"])
trips["start_time"] = start_as_datetime

In [123]:
#Restrict trips to the start timestamp and the origin station id
trip_columns = ["start_time", "from_station_id"]
trips = trips.loc[:, trip_columns]

In [124]:
#Round down start_time to the nearest hour.
#Uses the lowercase "h" frequency alias: the uppercase "H" alias is
#deprecated since pandas 2.2 and emits a FutureWarning there.
trips["start_time"] = trips["start_time"].dt.floor("h")

### Merge Data

In [89]:
#Left-join the per-zip landmark counts onto the stations (every station is kept)
sl = pd.merge(stations, landmarks, how="left", on="zip_code")

In [91]:
#Replace missing landmark counts with 0, then store the column as integers
sl["landmarks"] = sl["landmarks"].fillna(0).astype(int)

In [126]:
#Inner-join every trip to its origin station's row in sl
tsl = pd.merge(
    trips,
    sl,
    how="inner",
    left_on="from_station_id",
    right_on="id",
)

In [131]:
#Inner-join the weather rows onto tsl by matching start_time against time
tslw = pd.merge(
    tsl,
    weather,
    how="inner",
    left_on="start_time",
    right_on="time",
)

### Final Data

In [180]:
#Build the final table: one row per (start_time, zip_code) pair,
#with "trips" counting the non-null id values in that group
hour_zip_groups = tslw.groupby(["start_time", "zip_code"])
trip_counts = hour_zip_groups.agg(trips=("id", "count"))
df = trip_counts.reset_index()

In [181]:
#Reduce sl to the first row seen for each zip code
sl_u = sl.drop_duplicates(subset="zip_code")

#Attach the landmark count of each zip code to df
zip_landmarks = sl_u.loc[:, ["zip_code", "landmarks"]]
df = df.merge(zip_landmarks, how="inner", on="zip_code")

In [188]:
#Inner-join the weather rows onto df by matching start_time against time
df = pd.merge(
    df,
    weather,
    how="inner",
    left_on="start_time",
    right_on="time",
)

In [192]:
#Remove the "time" column from df
df = df.drop(columns=["time"])

In [200]:
#Build one indicator column per distinct zip code
ohe_zip = pd.get_dummies(data=df["zip_code"])

#Append the indicator columns to df side by side
df = pd.concat(objs=[df, ohe_zip], axis="columns")

In [204]:
#Remove the original "zip_code" column from df
df = df.drop(columns=["zip_code"])

In [211]:
#Save df as a CSV file, leaving out the row index
df.to_csv(path_or_buf="../../Data/final_data.csv", index=False)