## Import necessary libraries.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from datetime import datetime
import requests

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


## Functions to call the API, then parse and store the JSON data returned from it.

In [None]:
def get_taxi_avail():
  """Fetch the current taxi-availability snapshot from the data.gov.sg API.

  Returns:
    A DataFrame with one row per coordinate pair found in the first feature
    of the response, with columns LONGITUDE, LATITUDE, DATE and TIME. DATE
    and TIME are the date and time parts of that feature's timestamp.
    Returns None when the API does not answer with HTTP 200.

  Raises:
    requests.exceptions.RequestException: on connection errors or when the
      request exceeds the timeout.
  """
  url = "https://api.data.gov.sg/v1/transport/taxi-availability"

  # Without a timeout a stalled connection would block the caller forever.
  response = requests.get(url, timeout=30)

  if response.status_code != 200:
    return None

  # Only the first feature of the payload is used.
  feature = response.json()['features'][0]

  # Passing the column names to the constructor (instead of assigning
  # .columns afterwards) also works when the coordinate list is empty.
  df_coordinates = pd.DataFrame(
      feature['geometry']['coordinates'],
      columns=["LONGITUDE", "LATITUDE"],
  )

  # Split the snapshot timestamp into separate date and time columns.
  snapshot_dt = datetime.fromisoformat(feature['properties']['timestamp'])
  df_coordinates["DATE"] = snapshot_dt.date()
  df_coordinates["TIME"] = snapshot_dt.time()

  return df_coordinates

# taxi_avail = get_taxi_avail()

In [None]:
def get_area_meta():
  """Fetch the forecast-area metadata from the 2-hour weather forecast API.

  Returns:
    A DataFrame built from the response's 'area_metadata' list, where the
    nested 'label_location' dict is flattened into LATITUDE and LONGITUDE
    columns and then dropped. Returns None when the API does not answer
    with HTTP 200.

  Raises:
    requests.exceptions.RequestException: on connection errors or when the
      request exceeds the timeout.
  """
  url = 'https://api.data.gov.sg/v1/environment/2-hour-weather-forecast'

  # Without a timeout a stalled connection would block the caller forever.
  response = requests.get(url, timeout=30)

  if response.status_code != 200:
    return None

  df = pd.DataFrame(response.json()['area_metadata'])

  # Flatten {'latitude': ..., 'longitude': ...} into two plain columns.
  label_locations = df['label_location']
  df['LATITUDE'] = label_locations.apply(lambda loc: loc['latitude'])
  df['LONGITUDE'] = label_locations.apply(lambda loc: loc['longitude'])

  return df.drop(columns=['label_location'])

# get_area_meta()

In [None]:
def get_weather():
  """Fetch the latest 2-hour weather forecast for every area.

  Returns:
    A DataFrame with one row per entry of the first item's 'forecasts'
    list. Its two columns are renamed positionally to AREA and FORECAST
    (assumes each forecast entry carries exactly those two fields in that
    order - TODO confirm against the API). The item's timestamp,
    update_timestamp and valid period are repeated on every row as
    'timestamp', 'update_timestamp', 'valid_start' and 'valid_end'.
    Returns None when the API does not answer with HTTP 200.

  Raises:
    requests.exceptions.RequestException: on connection errors or when the
      request exceeds the timeout.
  """
  url = 'https://api.data.gov.sg/v1/environment/2-hour-weather-forecast'

  # Without a timeout a stalled connection would block the caller forever.
  response = requests.get(url, timeout=30)

  if response.status_code != 200:
    return None

  # Only the first item of the payload is used.
  item = response.json()['items'][0]

  df = pd.DataFrame(item['forecasts'])
  df.columns = ["AREA", "FORECAST"]

  # Item-level metadata is broadcast onto every forecast row.
  df['timestamp'] = item['timestamp']
  df['update_timestamp'] = item['update_timestamp']
  df['valid_start'] = item['valid_period']['start']
  df['valid_end'] = item['valid_period']['end']

  return df

# get_weather()

## Helper function for appending and reading pickle files.

In [None]:
import os

def append_pkl(pkl_fp, to_add_df):
  """Append the rows of `to_add_df` to the DataFrame pickled at `pkl_fp`.

  The file is created when it does not exist yet. The stored frame always
  has a fresh 0..n-1 index.

  Args:
    pkl_fp: path of the pickle file to extend (or create).
    to_add_df: DataFrame whose rows are appended.
  """
  try:
    # NOTE: unpickling can execute arbitrary code; only use with files this
    # notebook wrote itself.
    existing = pd.read_pickle(pkl_fp)
  except FileNotFoundError:
    # First write: store the new rows directly. Concatenating onto an empty
    # DataFrame relies on pandas behaviour that is deprecated.
    combined = to_add_df.reset_index(drop=True)
  else:
    combined = pd.concat([existing, to_add_df], ignore_index=True)

  combined.to_pickle(pkl_fp)

def read_pkl(pkl_fp):
  """Load and return the object pickled at `pkl_fp` via pandas."""
  return pd.read_pickle(pkl_fp)


In [None]:
!pip install haversine

Collecting haversine
  Downloading haversine-2.8.1-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.8.1


## Function to allocate taxis to the nearest zone based on their coordinates.

In [None]:
import haversine as hs;
# Reference point (name, latitude, longitude) of every zone.
# Kept at module level so the table is built once rather than on every
# get_zone() call, which runs once per taxi row.
# NOTE(review): the values look like the 'area_metadata' label locations of
# the 2-hour weather forecast API - confirm they are still current.
_ZONE_CENTROIDS = (
    ("Ang Mo Kio", 1.375, 103.839),
    ("Bedok", 1.321, 103.924),
    ("Bishan", 1.350772, 103.839),
    ("Boon Lay", 1.304, 103.701),
    ("Bukit Batok", 1.353, 103.754),
    ("Bukit Merah", 1.277, 103.819),
    ("Bukit Panjang", 1.362, 103.77195),
    ("Bukit Timah", 1.325, 103.791),
    ("Central Water Catchment", 1.38, 103.805),
    ("Changi", 1.357, 103.987),
    ("Choa Chu Kang", 1.377, 103.745),
    ("Clementi", 1.315, 103.76),
    ("City", 1.292, 103.844),
    ("Geylang", 1.318, 103.884),
    ("Hougang", 1.361218, 103.886),
    ("Jalan Bahar", 1.347, 103.67),
    ("Jurong East", 1.326, 103.737),
    ("Jurong Island", 1.266, 103.699),
    ("Jurong West", 1.34039, 103.705),
    ("Kallang", 1.312, 103.862),
    ("Lim Chu Kang", 1.423, 103.717332),
    ("Mandai", 1.419, 103.812),
    ("Marine Parade", 1.297, 103.891),
    ("Novena", 1.327, 103.826),
    ("Pasir Ris", 1.37, 103.948),
    ("Paya Lebar", 1.358, 103.914),
    ("Pioneer", 1.315, 103.675),
    ("Pulau Tekong", 1.403, 104.053),
    ("Pulau Ubin", 1.404, 103.96),
    ("Punggol", 1.401, 103.904),
    ("Queenstown", 1.291, 103.78576),
    ("Seletar", 1.404, 103.869),
    ("Sembawang", 1.445, 103.818495),
    ("Sengkang", 1.384, 103.891443),
    ("Sentosa", 1.243, 103.832),
    ("Serangoon", 1.357, 103.865),
    ("Southern Islands", 1.208, 103.842),
    ("Sungei Kadut", 1.413, 103.756),
    ("Tampines", 1.345, 103.944),
    ("Tanglin", 1.308, 103.813),
    ("Tengah", 1.374, 103.715),
    ("Toa Payoh", 1.334304, 103.856327),
    ("Tuas", 1.294947, 103.635),
    ("Western Islands", 1.205926, 103.746),
    ("Western Water Catchment", 1.405, 103.689),
    ("Woodlands", 1.432, 103.786528),
    ("Yishun", 1.418, 103.839),
)


def get_zone(longitude, latitude):
  """Return the name of the zone whose reference point is nearest.

  Distance is the haversine (great-circle) distance between the given
  position and each zone's reference point.

  Args:
    longitude: longitude of the position, in degrees.
    latitude: latitude of the position, in degrees.

  Returns:
    The zone name, or the string "NULL" when no distance compares smaller
    than the starting sentinel (for example when a coordinate is NaN).
    When two zones are equally near, the one listed first wins.
  """
  position = (latitude, longitude)

  nearest_name = "NULL"
  nearest_dist = float("inf")

  for name, zone_lat, zone_lon in _ZONE_CENTROIDS:
    dist = hs.haversine((zone_lat, zone_lon), position)
    # Strict '<' keeps the first zone on ties and skips NaN distances.
    if dist < nearest_dist:
      nearest_name = name
      nearest_dist = dist

  return nearest_name

# Start of Data Cleaning.
## Removing data points that are out of our date range.
Range of our dates used is from 15/03/2024 to 04/04/2024.

In [None]:
# Read the pickled taxi and weather logs
taxi_df = read_pkl("taxi_log.pkl")
weather_df = read_pkl("weather_log_final.pkl")

# Days just outside the study window (15/03/2024 to 04/04/2024).
out_of_range_dates = ['2024-03-14', '2024-04-05']

# Compare on the ISO text form of each value. DATE is stored as
# datetime.date objects by get_taxi_avail, and a date object never equals a
# plain string, so comparing the raw column to '2024-04-05' removed nothing.
# astype(str) works whether the column holds date objects or strings.

# Remove rows from taxi_df
taxi_df.drop(taxi_df[taxi_df['DATE'].astype(str).isin(out_of_range_dates)].index, inplace=True)

# Remove rows from weather_df
weather_df.drop(weather_df[weather_df['weather_date'].astype(str).isin(out_of_range_dates)].index, inplace=True)


## Adding Zones to Taxi Dataframe.

In [None]:
def load_taxi_zones():
  """Add a 'Zone' column to the global taxi_df and return the frame.

  Each row's zone is get_zone() applied to that row's LONGITUDE and
  LATITUDE.

  NOTE(review): several later cells read an 'AREA' column from taxi_df,
  while this function writes 'Zone' - confirm where 'AREA' comes from.
  """
  def _zone_of(row):
    return get_zone(row['LONGITUDE'], row['LATITUDE'])

  taxi_df['Zone'] = taxi_df.apply(_zone_of, axis=1)
  return taxi_df

# Add location zones to the dataframe
taxi_df = load_taxi_zones()

FileNotFoundError: [Errno 2] No such file or directory: 'taxi_data.pkl'

## Adding days of the week to taxi Dataframe.

In [None]:
import calendar

def add_day_of_week(df):
  """Add a 'Day_of_Week' column holding the weekday name of each DATE.

  The frame is modified in place and also returned. Every DATE value must
  provide a weekday() method.
  """
  weekday_names = [calendar.day_name[day.weekday()] for day in df['DATE']]
  df['Day_of_Week'] = weekday_names
  return df

# Add days of the week to the dataframe, then show the row count per weekday
taxi_df = add_day_of_week(taxi_df)
taxi_df.groupby('Day_of_Week').size()

## Cleaning of Weather Data.

In [None]:
from datetime import time

def clean_weather(df):
    """Return a cleaned copy of a raw weather-forecast log.

    Steps:
      1. Parse 'timestamp', 'valid_start' and 'valid_end' as datetimes.
      2. Derive 'weather_date' and 'weather_start_time' from 'valid_start'
         and 'weather_end_time' from 'valid_end'.
      3. Drop 'valid_start', 'valid_end' and 'update_timestamp', then drop
         duplicate rows.
      4. Keep only rows whose start time is on an even hour, minute 0.
      5. For rows whose 'timestamp' hour is 12, force the window to
         12:00:00-14:00:00.

    Args:
        df: raw weather log with at least the columns 'timestamp',
            'valid_start', 'valid_end' and 'update_timestamp'.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Parse the columns of the frame that was passed in. The previous code
    # read the global weather_df here and ignored its own argument.
    for column in ('timestamp', 'valid_start', 'valid_end'):
        df[column] = pd.to_datetime(df[column])

    # Split the validity period into a date and start/end times.
    df['weather_date'] = df['valid_start'].dt.date
    df['weather_start_time'] = df['valid_start'].dt.time
    df['weather_end_time'] = df['valid_end'].dt.time

    # The raw period columns and the update timestamp are no longer needed.
    df = df.drop(columns=['valid_start', 'valid_end', 'update_timestamp'])
    df = df.drop_duplicates()

    # Keep rows where the hour is divisible by 2 and the minute is 0.
    on_even_hour = df['weather_start_time'].map(
        lambda t: t.hour % 2 == 0 and t.minute == 0
    ).astype(bool)
    # .copy() so the assignments below write to an independent frame.
    df = df[on_even_hour].copy()

    # Fix weather_start_time and weather_end_time for the 12:00:00 records.
    issued_at_noon = df['timestamp'].dt.hour == 12
    df.loc[issued_at_noon, 'weather_start_time'] = time(12, 0, 0)
    df.loc[issued_at_noon, 'weather_end_time'] = time(14, 0, 0)
    return df

# Clean the weather log and save the result as weather_data.csv
weather_df = clean_weather(weather_df)
weather_df.to_csv('weather_data.csv', index=False)


Helper functions to further clean and prep the dataset.

In [None]:
# Check whether a time lies inside a window (both ends inclusive)
def is_time_between(start_time, end_time, check_time):
    """Return True when check_time lies in [start_time, end_time].

    A window whose start is later than its end is treated as crossing
    midnight: it matches times at or after the start, or at or before the
    end.
    """
    if start_time <= end_time:
        # Ordinary window inside a single day.
        return start_time <= check_time <= end_time
    # Window wraps around midnight.
    return check_time >= start_time or check_time <= end_time

In [None]:
# Function to assign forecast to taxi_df
def assign_forecast(taxi_df, weather_df):
    """Fill taxi_df['FORECAST'] from the matching weather window.

    A taxi row matches a weather row when DATE equals 'weather_date', AREA
    is equal, and TIME lies inside [weather_start_time, weather_end_time]
    as judged by is_time_between(). When several weather rows match, the
    first one in weather_df order is used. Rows that already have a
    FORECAST are left alone.

    taxi_df is modified in place and also returned. As a safety measure the
    frame is written to 'taxi_data.csv' whenever a row's index label is a
    multiple of 100,000.

    Args:
        taxi_df: frame with DATE, TIME and AREA columns.
        weather_df: frame with weather_date, AREA, weather_start_time,
            weather_end_time and FORECAST columns.

    Returns:
        The same taxi_df object, with FORECAST filled where a match exists.
    """
    if 'FORECAST' not in taxi_df.columns:
        # Object dtype: the column will hold strings. Starting from a float
        # NaN column makes every string assignment an incompatible-dtype
        # write, which newer pandas warns about.
        taxi_df['FORECAST'] = pd.Series(np.nan, index=taxi_df.index, dtype=object)
    elif pd.api.types.is_numeric_dtype(taxi_df['FORECAST']):
        # An all-NaN float column left over from an earlier run.
        taxi_df['FORECAST'] = taxi_df['FORECAST'].astype(object)

    # Index the forecast windows once by (date, area) instead of filtering
    # weather_df again for every taxi row. Each list keeps weather_df row
    # order, so the first matching window still wins.
    windows_by_key = {}
    window_rows = zip(
        weather_df['weather_date'],
        weather_df['AREA'],
        weather_df['weather_start_time'],
        weather_df['weather_end_time'],
        weather_df['FORECAST'],
    )
    for day, area, start, end, forecast in window_rows:
        # Missing keys can never equal a taxi row's date or area.
        if pd.isna(day) or pd.isna(area):
            continue
        windows_by_key.setdefault((day, area), []).append((start, end, forecast))

    # Iterate through each row in the taxi_df DataFrame
    for index, row in taxi_df.iterrows():

        # Skip rows whose 'FORECAST' is already filled
        if pd.notna(row['FORECAST']):
            continue

        candidates = windows_by_key.get((row['DATE'], row['AREA']), ())
        for start, end, forecast in candidates:
            if is_time_between(start, end, row['TIME']):
                taxi_df.at[index, 'FORECAST'] = forecast
                break

        if index % 100000 == 0:
            # Save the DataFrame to a CSV file every 100,000 rows as a safety measure
            taxi_df.to_csv('taxi_data.csv', index=False)
    return taxi_df

# Attach the forecast to every taxi row and preview the result
taxi_df = assign_forecast(taxi_df, weather_df)
taxi_df.head()


In [None]:
#Function to fix missing values in the 'FORECAST' column

# Manual gap fills as (date, window start, window end, forecast).
# Windows are inclusive at both ends. A window whose start is later than
# its end wraps around midnight. Earlier entries win when windows overlap.
_FORECAST_GAP_FILLS = (
    ('2024-03-19', '04:00:00', '06:00:00', 'Partly Cloudy (Night)'),
    ('2024-03-19', '06:00:00', '08:00:00', 'Partly Cloudy (Day)'),
    ('2024-03-24', '04:00:00', '06:00:00', 'Partly Cloudy (Night)'),
    ('2024-03-24', '06:00:00', '12:00:00', 'Partly Cloudy (Day)'),
    ('2024-03-29', '22:00:00', '00:00:00', 'Partly Cloudy (Night)'),
)


def fix_na(taxi_df):
    """Fill missing FORECAST values for a fixed list of date/time windows.

    Only rows whose FORECAST is missing are touched. taxi_df is modified in
    place and nothing is returned.

    DATE and TIME are compared through their ISO text form ('YYYY-MM-DD',
    'HH:MM:SS'), so the function works whether those columns hold
    date/time objects or strings. Comparing the string bounds directly
    against datetime.time values raises TypeError, and a date object never
    equals a date string.

    Args:
        taxi_df: frame with DATE, TIME and FORECAST columns.
    """
    unfilled = taxi_df['FORECAST'].isna()
    if not unfilled.any():
        return

    # An all-NaN float column cannot hold strings without a dtype change.
    if pd.api.types.is_numeric_dtype(taxi_df['FORECAST']):
        taxi_df['FORECAST'] = taxi_df['FORECAST'].astype(object)

    dates = taxi_df['DATE'].astype(str)
    times = taxi_df['TIME'].astype(str)

    for day, start, end, forecast in _FORECAST_GAP_FILLS:
        if start > end:
            # Window crosses midnight.
            in_window = (times >= start) | (times <= end)
        else:
            in_window = (times >= start) & (times <= end)

        hits = unfilled & (dates == day) & in_window
        taxi_df.loc[hits, 'FORECAST'] = forecast
        # Rows filled by an earlier rule are not overwritten by later ones.
        unfilled &= ~hits

# Patch the known forecast gaps, then save the taxi data as taxi_data.csv
fix_na(taxi_df)
taxi_df.to_csv('taxi_data.csv', index=False)

In [None]:
# Function to consolidate taxi data
def consolidate_taxi_data(df):
    """Count rows per (AREA, FORECAST, DATE, TIME, Day_of_Week) group.

    Returns a new DataFrame with the grouping columns plus 'Count', ordered
    by DATE and then TIME.
    """
    group_keys = ['AREA', 'FORECAST', 'DATE', 'TIME', 'Day_of_Week']
    counts = df.groupby(group_keys).size()
    return counts.reset_index(name='Count').sort_values(['DATE', 'TIME'])

# Consolidate the taxi rows into per-group counts and save them as consolidated_taxi_data.csv
consolidated_taxi_df = consolidate_taxi_data(taxi_df)
consolidated_taxi_df.to_csv('consolidated_taxi_data.csv', index=False)

# Function to consolidate taxi data by the hour
def consolidate_taxi_1hour(df):
    """Aggregate consolidated taxi counts into one-hour buckets.

    TIME is floored to the start of its hour. The remaining columns are
    then summed per (AREA, FORECAST, DATE, TIME, Day_of_Week) group and the
    result is ordered by DATE and TIME.

    Args:
        df: frame with AREA, FORECAST, DATE, TIME, Day_of_Week and the
            columns to sum (the caller passes a 'Count' column).

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # Go through the text form first: pd.to_datetime cannot convert
    # datetime.time objects directly, but it parses 'HH:MM:SS' strings.
    hour_start = pd.to_datetime(df['TIME'].astype(str)).dt.floor('h').dt.time

    # assign() returns a new frame, so the caller's TIME column is untouched.
    hourly = df.assign(TIME=hour_start)
    hourly = hourly.groupby(['AREA', 'FORECAST', 'DATE', 'TIME', 'Day_of_Week']).sum().reset_index()
    return hourly.sort_values(['DATE', 'TIME'])

# Roll the consolidated counts up to hourly buckets and save them as consolidated_taxi_1hour.csv
consolidate_taxi_1hour_df = consolidate_taxi_1hour(consolidated_taxi_df)
consolidate_taxi_1hour_df.to_csv('consolidated_taxi_1hour.csv', index=False)


## Display taxi locations with Matplotlib, filtered by day of the week.

In [None]:
from ipywidgets import interact

import matplotlib.patches as mpatches

#Plot all taxis in a scatter plot of their coordinates, coloured by zone.
def plot_data_week(day_of_week):
    """Scatter-plot taxi positions for one weekday, coloured by 'Zone'.

    Reads the global taxi_df. The figure title shows the selected day and
    the number of plotted rows.

    Args:
        day_of_week: a weekday name as stored in taxi_df['Day_of_Week'], or
            'All' to plot every row.
    """
    if day_of_week == 'All':
        df_day = taxi_df
    else:
        # Filter data for the selected day of the week
        df_day = taxi_df[taxi_df['Day_of_Week'] == day_of_week]

    # Use one category ordering for both the points and the legend, so a
    # legend colour always refers to the zone drawn in that colour.
    zones = df_day["Zone"].astype('category').cat.remove_unused_categories()
    zone_names = list(zones.cat.categories)
    n_zones = max(len(zone_names), 1)
    color_map = plt.get_cmap('viridis', n_zones)

    # Plot the data. vmin/vmax centre category code i on colour i.
    plt.figure(figsize=(150, 75))
    plt.scatter(
        df_day["LONGITUDE"],
        df_day["LATITUDE"],
        c=zones.cat.codes,
        cmap=color_map,
        vmin=-0.5,
        vmax=n_zones - 0.5,
    )

    # Create a legend
    legend_elements = [mpatches.Patch(color=color_map(i), label=zone) for i, zone in enumerate(zone_names)]
    plt.legend(handles=legend_elements, loc='upper right',fontsize=50, title='Zone', title_fontsize='50',markerscale=10)

    # One combined title: a second plt.title() call would replace the first.
    plt.title(f'Taxi Data for {day_of_week}\nNumber of Taxis: {len(df_day)}', fontsize=50)
    plt.xlabel('Longitude',fontsize=50)
    plt.ylabel('Latitude',fontsize=50)
    plt.show()

# Build the interactive day-of-week selector (the list supplies the available options)
days_of_week = ['All','Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
interact(plot_data_week, day_of_week=days_of_week)

## Display taxi locations with Matplotlib, filtered by the weather forecast.

In [None]:
#Plot all taxis for a given forecast in a scatter plot of their coordinates, coloured by area.
def plot_data_weather(weather):
    """Scatter-plot taxi positions for one forecast, coloured by 'AREA'.

    Reads the global taxi_df. The figure title shows the selected forecast
    and the number of plotted rows.

    Args:
        weather: a forecast label as stored in taxi_df['FORECAST'], or
            'All' to plot every row.
    """
    if weather == 'All':
        df_day = taxi_df
    else:
        # Filter data for the selected weather forecast
        df_day = taxi_df[taxi_df['FORECAST'] == weather]

    # Use one category ordering for both the points and the legend, so a
    # legend colour always refers to the area drawn in that colour.
    areas = df_day["AREA"].astype('category').cat.remove_unused_categories()
    zone_names = list(areas.cat.categories)
    n_areas = max(len(zone_names), 1)
    color_map = plt.get_cmap('viridis', n_areas)

    # Plot the data. vmin/vmax centre category code i on colour i.
    plt.figure(figsize=(150, 75))
    plt.scatter(
        df_day["LONGITUDE"],
        df_day["LATITUDE"],
        c=areas.cat.codes,
        cmap=color_map,
        vmin=-0.5,
        vmax=n_areas - 0.5,
    )

    # Create a legend
    legend_elements = [mpatches.Patch(color=color_map(i), label=zone) for i, zone in enumerate(zone_names)]
    plt.legend(handles=legend_elements, loc='upper right',fontsize=50, title='Area', title_fontsize='50',markerscale=10)

    # One combined title: a second plt.title() call would replace the first.
    plt.title(f'Taxi Data for {weather}\nNumber of Taxis: {len(df_day)}', fontsize=50)
    plt.xlabel('Longitude',fontsize=50)
    plt.ylabel('Latitude',fontsize=50)
    plt.show()

# Build the interactive forecast selector (the list supplies the available options)
weathers = ['All','Fair (Night)', 'Fair (Day)', 'Fair & Warm',
       'Partly Cloudy (Night)', 'Thundery Showers', 'Partly Cloudy (Day)',
       'Showers', 'Heavy Thundery Showers', 'Cloudy', 'Light Showers']
# Pass only `weather`: interact forwards every keyword it is given to the
# plotted function, and the call passes plot_data_weather no `s` value.
interact(plot_data_weather, weather=weathers)