In [1]:
from io import StringIO
import os
import pandas as pd
import boto3
# Raise pandas' display limit to 50 columns so wide frames are not truncated when shown.
pd.set_option("display.max_columns", 50)

ModuleNotFoundError: No module named 'boto3'

In [None]:
# AWS credentials are read from the environment so they never appear in the notebook.
# Either value is None when the corresponding variable is not set.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY")
aws_secret_key = os.environ.get("AWS_SECRET_KEY")

In [None]:
def read_csv_froms3(bucket: str, path: str, filename: str) -> pd.DataFrame:
    """Download a CSV file from an S3 bucket and load it into a DataFrame.

    Parameters
    ----------
    bucket : str
        Name of the bucket that holds the file.
    path : str
        Key prefix ("folders") leading to the file. It is concatenated with
        ``filename`` as-is, so it must already end with "/".
    filename : str
        Name of the file.

    Returns
    -------
    pd.DataFrame
        The parsed contents of the downloaded file.
    """
    s3_client = boto3.client(
        "s3",
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_key,
    )
    key = f"{path}{filename}"

    # Separate names for the response and the decoded text; the builtin
    # `object` is no longer shadowed.
    response = s3_client.get_object(Bucket=bucket, Key=key)
    csv_text = response["Body"].read().decode("utf-8")

    return pd.read_csv(StringIO(csv_text))


# Correctly spelled alias: some cells in this notebook call the loader as
# `read_csv_from_s3`, which would otherwise raise NameError.
read_csv_from_s3 = read_csv_froms3

In [None]:
# S3 client used for the object listings made directly in this notebook.
s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_key,
)

bucket = "cubix-chicago-taxi-szr"

# Key prefixes, one per dataset. Each ends with "/" because file names are
# appended to them directly when building the object key.
weather_path = "transformed_data/weather/"
taxi_trips_path = "transformed_data/taxi_trips/"
payment_type_path = "transformed_data/payment_type/"
date_path = "transformed_data/date/"
company_path = "transformed_data/company/"
community_areas_path = "transformed_data/community_areas/"

In [None]:

# Load the single-file lookup tables.
# The loader is defined in this notebook as `read_csv_froms3`; calling it under
# any other spelling raises NameError on a fresh kernel.
community_areas = read_csv_froms3(bucket, community_areas_path, "community_areas_master.csv")
company = read_csv_froms3(bucket, company_path, "company_master.csv")
date = read_csv_froms3(bucket, date_path, "date_dimension.csv")
payment_type = read_csv_froms3(bucket, payment_type_path, "payment_type_master.csv")


In [None]:
# Accumulators for the per-file DataFrames read from S3 (two independent lists).
trips_list, weather_list = [], []

In [None]:


# Start from an empty list so re-running this cell does not load every file twice.
trips_list = []

# A single list call returns at most 1000 keys; the paginator follows the
# continuation tokens so no file is silently skipped.
trips_paginator = s3.get_paginator("list_objects_v2")

for page in trips_paginator.paginate(Bucket=bucket, Prefix=taxi_trips_path):
    # "Contents" is absent from the response when nothing matches the prefix.
    for file in page.get("Contents", []):
        taxi_trip_key = file["Key"]
        filename = taxi_trip_key.split("/")[-1]

        # Skip the folder placeholder key (empty file name) and non-CSV objects.
        if not filename.endswith(".csv"):
            continue

        # Positional arguments only: mixing `bucket=bucket` with trailing
        # positional arguments is a SyntaxError.
        trip = read_csv_froms3(bucket, taxi_trips_path, filename)

        trips_list.append(trip)
        print(f"{filename} has been added")
                

In [None]:
# Stack the per-file trip frames into one table, discarding the per-file row
# indexes in favour of a single fresh index.
trips = pd.concat(
    trips_list,
    ignore_index=True,
)

In [None]:
# Start from an empty list so re-running this cell does not load every file twice.
weather_list = []

# A single list call returns at most 1000 keys; the paginator follows the
# continuation tokens so no file is silently skipped.
weather_paginator = s3.get_paginator("list_objects_v2")

for page in weather_paginator.paginate(Bucket=bucket, Prefix=weather_path):
    # "Contents" is absent from the response when nothing matches the prefix.
    for file in page.get("Contents", []):
        weather_key = file["Key"]
        filename = weather_key.split("/")[-1]

        # Skip the folder placeholder key (empty file name) and non-CSV objects.
        if not filename.endswith(".csv"):
            continue

        # Positional arguments only: mixing `bucket=bucket` with trailing
        # positional arguments is a SyntaxError.
        weather_daily = read_csv_froms3(bucket, weather_path, filename)

        weather_list.append(weather_daily)
        print(f"{filename} has been added")
                
           

In [None]:
# Stack the per-file weather frames into one table, discarding the per-file row
# indexes in favour of a single fresh index.
weather = pd.concat(
    weather_list,
    ignore_index=True,
)

In [None]:
# Build the denormalised table starting from `trips`. `trips_full` does not
# exist before this cell, so merging it with itself raised NameError on a fresh
# kernel; deriving it from `trips` also makes this cell safe to re-run.
# The weather join key `datetime` is dropped afterwards because the trips side
# keeps the same value in `datetime_for_weather`.
trips_full = (
    trips
    .merge(weather, left_on="datetime_for_weather", right_on="datetime", how="inner")
    .drop(columns=["datetime"])
)

In [None]:
# Attach the company attributes, then discard the surrogate key used for the join.
# NOTE(review): this cell consumes `company_id`, so running it a second time
# without re-running the cells above will fail on the missing column.
trips_full = (
    trips_full
    .merge(company, how="inner", on="company_id")
    .drop(columns=["company_id"])
)

In [None]:
# Attach the payment type attributes, then discard the surrogate key used for the join.
# NOTE(review): this cell consumes `payment_type_id`, so running it a second time
# without re-running the cells above will fail on the missing column.
trips_full = (
    trips_full
    .merge(payment_type, how="inner", on="payment_type_id")
    .drop(columns=["payment_type_id"])
)

In [None]:
# Resolve the pickup area code to its community name, drop both join keys and
# give the name column a pickup-specific label so the dropoff join can reuse
# the `community_name` column without a clash.
# NOTE(review): the inner join removes trips whose pickup area has no match in
# `community_areas` — confirm that is intended.
trips_full = (
    trips_full
    .merge(
        community_areas,
        how="inner",
        left_on="pickup_community_area_id",
        right_on="area_code",
    )
    .drop(columns=["pickup_community_area_id", "area_code"])
    .rename(columns={"community_name": "pickup_community_area_name"})
)

In [None]:
# Resolve the dropoff area code to its community name, drop both join keys and
# give the name column a dropoff-specific label.
# NOTE(review): the inner join removes trips whose dropoff area has no match in
# `community_areas` — confirm that is intended.
trips_full = (
    trips_full
    .merge(
        community_areas,
        how="inner",
        left_on="dropoff_community_area_id",
        right_on="area_code",
    )
    .drop(columns=["dropoff_community_area_id", "area_code"])
    .rename(columns={"community_name": "dropoff_community_area_name"})
)

In [None]:
# Both sides of the upcoming date join must be real datetimes, not strings.
date["date"] = pd.to_datetime(date["date"])

trip_start = pd.to_datetime(trips_full["trip_start_timestamp"])
trips_full["trip_start_timestamp"] = trip_start

# Day-level join key: strip the time of day, then convert the resulting
# calendar dates back to datetimes.
# NOTE(review): if the timestamps are timezone-naive, `trip_start.dt.normalize()`
# should give the same values much faster — confirm before switching.
trips_full["trip_start_date"] = pd.to_datetime(trip_start.dt.date)



In [None]:
# Attach the date dimension on the day-level key; the dimension's own `date`
# column is dropped because `trip_start_date` holds the same value.
trips_full = (
    trips_full
    .merge(date, how="inner", left_on="trip_start_date", right_on="date")
    .drop(columns=["date"])
)