In [29]:
from io import StringIO
from dotenv import load_dotenv, dotenv_values 
import os

import boto3
import pandas as pd
# Load variables from a local .env file into the process environment.
load_dotenv()

# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50


In [2]:
# AWS credentials are taken from the environment; either value is None when
# the corresponding variable is not set.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY")
aws_secret_key = os.environ.get("AWS_SECRET_KEY")

In [3]:
def read_csv_from_s3(bucket: str, path: str, filename: str, s3_client=None) -> pd.DataFrame:
    """Download a csv file from an S3 bucket and parse it into a DataFrame.

    Args:
        bucket (str):
            - Name of the bucket that holds the file.
        path (str):
            - Folder part of the object key. A trailing "/" is added when it
              is missing; pass "" for a file stored at the bucket root.
        filename (str):
            - Name of the file inside ``path``.
        s3_client (optional):
            - An existing boto3 S3 client to reuse. When omitted, a client is
              created from the notebook-level ``aws_access_key_id`` and
              ``aws_secret_key`` values.

    Returns:
        pd.DataFrame:
            - The parsed contents of the file.

    Errors raised by the S3 request, the UTF-8 decoding or the csv parser are
    not caught here and propagate to the caller.
    """
    # Only build a client when the caller did not hand one in, so loops that
    # read many files can share a single client.
    if s3_client is None:
        s3_client = boto3.client(
            "s3",
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_key,
        )

    # Guarantee exactly one separator between the folder and the file name;
    # an empty path is left alone so root-level keys stay valid.
    folder = path if (not path or path.endswith("/")) else f"{path}/"
    object_key = f"{folder}{filename}"

    response = s3_client.get_object(Bucket=bucket, Key=object_key)
    csv_text = response["Body"].read().decode("utf-8")

    return pd.read_csv(StringIO(csv_text))

In [4]:
# Bucket that holds the transformed Chicago taxi data.
bucket = "cubix-chicago-taxi-sm"

# Client used by the listing loops further down.
s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_key,
)

# Key prefixes of the individual tables; each one ends with "/".
taxi_trips_path = "transformed_data/taxi_trips/"
weather_path = "transformed_data/weather/"
community_areas_path = "transformed_data/community_areas/"
company_path = "transformed_data/company/"
date_path = "transformed_data/date/"
payment_type_path = "transformed_data/payment_type/"


In [5]:
# Each dimension table is stored as a single csv file under its own prefix.
community_areas = read_csv_from_s3(
    bucket=bucket, path=community_areas_path, filename="community_areas_master.csv"
)
company = read_csv_from_s3(
    bucket=bucket, path=company_path, filename="company_master.csv"
)
date = read_csv_from_s3(
    bucket=bucket, path=date_path, filename="date_dimension.csv"
)
payment_type = read_csv_from_s3(
    bucket=bucket, path=payment_type_path, filename="payment_type_master.csv"
)

In [6]:
# Two separate, empty containers for the per-file DataFrames loaded below.
trips_list, weather_list = [], []

In [7]:
# Start from an empty list so re-running this cell never appends the same
# files a second time.
trips_list = []

# A single listing response holds at most 1,000 keys; the paginator keeps
# requesting pages until every object under the prefix has been returned.
trip_pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=taxi_trips_path)

for page in trip_pages:
    # A page carries no "Contents" entry when nothing matches the prefix.
    for file in page.get("Contents", []):
        taxi_trip_key = file["Key"]

        # Skips folder placeholder keys (they end with "/") and any object
        # whose name does not end in ".csv".
        if not taxi_trip_key.endswith(".csv"):
            continue

        # Split at the last "/" so the file is read from the folder it really
        # lives in, even when that is a sub-folder of the prefix.
        folder, separator, filename = taxi_trip_key.rpartition("/")
        trip = read_csv_from_s3(bucket=bucket, path=f"{folder}{separator}", filename=filename)

        trips_list.append(trip)
        print(f"{filename} has been added.")
            
            
            

taxi_2024-10-10.csv has been added.
taxi_2024-10-11.csv has been added.
taxi_2024-10-12.csv has been added.
taxi_2024-10-13.csv has been added.
taxi_2024-10-14.csv has been added.
taxi_2024-10-15.csv has been added.
taxi_2024-10-16.csv has been added.
taxi_2024-10-17.csv has been added.
taxi_2024-10-18.csv has been added.
taxi_2024-10-19.csv has been added.
taxi_2024-10-20.csv has been added.
taxi_2024-10-21.csv has been added.
taxi_2024-10-22.csv has been added.


In [8]:
# Stack the per-file frames row-wise and renumber the rows from 0.
trips = pd.concat(objs=trips_list, axis="index", ignore_index=True)

In [11]:
# Sanity check: (rows, columns) of the combined trips DataFrame.
trips.shape

(235071, 20)

In [13]:
# Start from an empty list so re-running this cell never appends the same
# files a second time.
weather_list = []

# A single listing response holds at most 1,000 keys; the paginator keeps
# requesting pages until every object under the prefix has been returned.
weather_pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=weather_path)

for page in weather_pages:
    # A page carries no "Contents" entry when nothing matches the prefix.
    for file in page.get("Contents", []):
        weather_key = file["Key"]

        # Skips folder placeholder keys (they end with "/") and any object
        # whose name does not end in ".csv".
        if not weather_key.endswith(".csv"):
            continue

        # Split at the last "/" so the file is read from the folder it really
        # lives in, even when that is a sub-folder of the prefix.
        folder, separator, filename = weather_key.rpartition("/")
        weather_daily = read_csv_from_s3(bucket=bucket, path=f"{folder}{separator}", filename=filename)

        weather_list.append(weather_daily)
        print(f"{filename} has been added.")
            
           

weather_2024-10-10.csv has been added.
weather_2024-10-11.csv has been added.
weather_2024-10-12.csv has been added.
weather_2024-10-13.csv has been added.
weather_2024-10-14.csv has been added.
weather_2024-10-15.csv has been added.
weather_2024-10-16.csv has been added.
weather_2024-10-17.csv has been added.
weather_2024-10-18.csv has been added.
weather_2024-10-19.csv has been added.
weather_2024-10-20.csv has been added.
weather_2024-10-21.csv has been added.
weather_2024-10-22.csv has been added.


In [14]:
# Stack the per-file frames row-wise and renumber the rows from 0.
weather = pd.concat(objs=weather_list, axis="index", ignore_index=True)

In [17]:
# Sanity check: (rows, columns) of the combined weather DataFrame.
weather.shape

(312, 5)

#### Join trips with the weather and dimension tables

In [90]:
# Attach the weather row whose `datetime` equals the trip's
# `datetime_for_weather`. The join is inner, so trips without a matching
# weather row are not kept. The duplicate key column from the weather side is
# dropped afterwards.
trips_full = (
    trips
    .merge(weather, how="inner", left_on="datetime_for_weather", right_on="datetime")
    .drop(columns=["datetime"])
)

In [91]:
# Bring in the company columns via the shared `company_id` key (inner join),
# then discard the key itself.
trips_full = (
    trips_full
    .merge(company, how="inner", on="company_id")
    .drop(columns=["company_id"])
)

In [92]:
# Bring in the payment type columns via the shared `payment_type_id` key
# (inner join), then discard the key itself.
trips_full = (
    trips_full
    .merge(payment_type, how="inner", on="payment_type_id")
    .drop(columns=["payment_type_id"])
)

In [93]:
# Look up the pickup community area (inner join on the area code), drop both
# code columns and keep the name under a pickup-specific label.
trips_full = (
    trips_full
    .merge(community_areas, how="inner", left_on="pickup_community_area_id", right_on="area_code")
    .drop(columns=["pickup_community_area_id", "area_code"])
    .rename(columns={"community_name": "pickup_community_area_name"})
)

In [94]:
# Look up the dropoff community area (inner join on the area code), drop both
# code columns and keep the name under a dropoff-specific label.
trips_full = (
    trips_full
    .merge(community_areas, how="inner", left_on="dropoff_community_area_id", right_on="area_code")
    .drop(columns=["dropoff_community_area_id", "area_code"])
    .rename(columns={"community_name": "dropoff_community_area_name"})
)

In [95]:
# Parse both columns as datetimes so the date merge in the next cell compares
# like with like.
date['date'] = pd.to_datetime(date['date'])
trips_full["trip_start_timestamp"] = pd.to_datetime(trips_full["trip_start_timestamp"])

# Calendar day of the trip start as a midnight timestamp. `.dt.normalize()`
# zeroes the time part in one vectorised step, avoiding a round trip through
# per-row Python `date` objects.
trips_full["trip_start_date"] = trips_full["trip_start_timestamp"].dt.normalize()

In [96]:
# Add the date dimension columns for the trip's start day (inner join), then
# drop the dimension's own `date` key, which duplicates `trip_start_date`.
trips_full = (
    trips_full
    .merge(date, how="inner", left_on="trip_start_date", right_on="date")
    .drop(columns=["date"])
)

In [97]:
# Preview the first rows of the fully joined table.
trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,tempretaure,wind_speed,rain,precipitation,company,payment_type,pickup_community_area_name,dropoff_community_area_name,trip_start_date,year,month,day,day_of_week,is_weekend
0,cf1abe18f63e9163e8c7bb1555e15dea46a6a47c,a04ded5c1365641c170cd53f6befbcf7d3e6234337d0f4...,2024-10-10 23:45:00,2024-10-11T00:00:00.000,1310,6.54,20.0,4.3,0.0,1.0,25.8,41.878866,-87.625192,41.944227,-87.655998,2024-10-10 23:00:00,18.3,8.0,0.0,0.0,Sun Taxi,Credit Card,Loop,Lake View,2024-10-10,2024,10,10,4,False
1,ce2b7180ac20b666e60a481fdefd150969445b74,396892a80daeabb209e6a8a53a4beb62d71771dcf69611...,2024-10-10 23:45:00,2024-10-11T00:00:00.000,513,3.17,35.0,7.1,0.0,0.0,42.6,41.85935,-87.617358,41.893216,-87.637844,2024-10-10 23:00:00,18.3,8.0,0.0,0.0,Sun Taxi,Credit Card,Near South Side,Near North Side,2024-10-10,2024,10,10,4,False
2,cc60ec2d55e9f141669db6dc62a9780219974fa7,ac1f0defd00d6b7417e39256ebe1cad16b64955f9a64f2...,2024-10-10 23:45:00,2024-10-10T23:45:00.000,4,0.0,40.0,0.0,0.0,0.0,40.0,41.85935,-87.617358,41.85935,-87.617358,2024-10-10 23:00:00,18.3,8.0,0.0,0.0,City Service,Cash,Near South Side,Near South Side,2024-10-10,2024,10,10,4,False
3,cc25290458e501d87c5d1fb8155c88ff4007de6e,bb90b2306d04496af0bd28b855103080054c6c38bbbe8e...,2024-10-10 23:45:00,2024-10-11T00:15:00.000,1167,12.17,31.25,7.35,0.0,5.0,44.1,41.785999,-87.750934,41.892508,-87.626215,2024-10-10 23:00:00,18.3,8.0,0.0,0.0,Taxicab Insurance Agency Llc,Credit Card,Garfield Ridge,Near North Side,2024-10-10,2024,10,10,4,False
4,caa4f1fa506bb556ebd68b41b2ec7d6fa25d406e,2981edd199f55bdc8b5ed188de69e9f5d8116ae8d7a434...,2024-10-10 23:45:00,2024-10-11T00:00:00.000,840,11.7,29.5,0.0,0.0,0.0,29.5,41.690633,-87.570058,41.842076,-87.633973,2024-10-10 23:00:00,18.3,8.0,0.0,0.0,Taxi Affiliation Services,Unknown,South Deering,Armour Square,2024-10-10,2024,10,10,4,False
