In [33]:
# importing necessary libraries
import pandas as pd
import numpy as np

# Load the two raw CSV exports (cab rides first, then weather) into
# DataFrames; paths are relative to the notebook's working directory.
rides_df, weather_df = map(
    pd.read_csv,
    ('dataOriginal/cab_rides.csv', 'dataOriginal/weather.csv'),
)

In [34]:
# Convert the Unix-epoch `time_stamp` columns to datetimes and truncate them
# to the start of their hour (`floor('h')` rounds down, not to the nearest hour).
rides_df['time_stamp'] = pd.to_datetime(rides_df['time_stamp'], unit='ms').dt.floor('h')

# The weather frame must be converted from its OWN column. The previous code
# read `rides_df['time_stamp']` here, which overwrote the weather timestamps
# with ride timestamps.
# NOTE(review): unit='ms' is carried over from the original line; confirm that
# weather.csv stores epoch milliseconds (not seconds) and adjust `unit` if not.
weather_df['time_stamp'] = pd.to_datetime(weather_df['time_stamp'], unit='ms').dt.floor('h')

In [35]:
# Clean the rides frame in a single chained pass:
#   1. discard every row that contains a missing value,
#   2. drop the `product_id` column,
#   3. overwrite `id` with sequential integers 0, 1, 2, ... (one per
#      remaining row).
rides_df = (
    rides_df
    .dropna()
    .drop(columns='product_id')
    .assign(id=lambda frame: range(len(frame)))
)

In [36]:
# splitting dataframes into Uber and Lyft
# Partition the rides by provider: one frame per `cab_type` value.
uber_df = rides_df.loc[rides_df['cab_type'].eq('Uber')]
lyft_df = rides_df.loc[rides_df['cab_type'].eq('Lyft')]

In [37]:
from google.cloud import bigquery
from google.api_core.exceptions import NotFound

# Build a BigQuery client authenticated with the service-account key file.
client = bigquery.Client.from_service_account_json("secrets/serviceKey.json")

# Dataset reference in "<project>.<dataset>" form.
project_id = "idmpproject-441123"
dataset_id = "uberFareEstimation"
dataset_ref = "{}.{}".format(project_id, dataset_id)

# Record (and echo) the id of every table currently in the dataset.
# `tablesList` is consulted later to decide which tables still need creating.
tablesList = []
try:
    tables = client.list_tables(dataset_ref)
    print(f"Tables in dataset {dataset_id}:")
    for table in tables:
        existing_id = table.table_id
        tablesList.append(existing_id)
        print(existing_id)
except NotFound:
    # A missing dataset is reported rather than raised.
    print(f"Dataset {dataset_id} does not exist.")


Tables in dataset uberFareEstimation:
lyft_data
uber_data
weather_data


In [38]:
def _nullable_fields(columns):
    """Turn (column name, BigQuery type) pairs into NULLABLE SchemaFields.

    Args:
        columns: iterable of ``(name, field_type)`` string pairs, in the
            order the fields should appear in the schema.

    Returns:
        list of ``bigquery.SchemaField`` objects, all with mode "NULLABLE".
    """
    return [
        bigquery.SchemaField(column, field_type, mode="NULLABLE")
        for column, field_type in columns
    ]


# Schema shared by the ride tables.
rides_schema = _nullable_fields([
    ("distance", "FLOAT"),
    ("cab_type", "STRING"),
    ("time_stamp", "TIMESTAMP"),
    ("destination", "STRING"),
    ("source", "STRING"),
    ("price", "FLOAT"),
    ("surge_multiplier", "FLOAT"),
    ("id", "INTEGER"),
    ("name", "STRING"),
])

# Schema for the weather observations table.
weather_schema = _nullable_fields([
    ("temp", "FLOAT"),
    ("location", "STRING"),
    ("clouds", "FLOAT"),
    ("pressure", "FLOAT"),
    ("rain", "FLOAT"),
    ("time_stamp", "TIMESTAMP"),
    ("humidity", "FLOAT"),
    ("wind", "FLOAT"),
])

In [39]:
# Create any table the dataset is still missing, each with its own schema.
# The ride tables share `rides_schema`; `weather_data` must use
# `weather_schema` (it was previously created with `rides_schema`, which would
# have given the weather table the ride columns).
required_tables = {
    "uber_data": rides_schema,
    "lyft_data": rides_schema,
    "weather_data": weather_schema,
}

for table_id, table_schema in required_tables.items():
    # Skip tables already found when the dataset was listed.
    if table_id in tablesList:
        continue

    table_ref = f"{project_id}.{dataset_id}.{table_id}"
    table_object = bigquery.Table(table_ref, schema=table_schema)
    try:
        table = client.create_table(table_object)
        print(f"Created empty table {table_id} in dataset {dataset_id}.")
    except Exception as e:
        # Best-effort: report the failure and continue with the next table.
        print(f"Error creating table: {e}")

