In [1]:
import pandas as pd
# Base URI of the folder holding the monthly trip-data parquet files; each file
# is read as f"{filepath}/{filename}" by the loading loop further down.
# NOTE(review): the gs:// scheme presumably needs gcsfs installed for pandas to
# read it, plus credentials with access to this bucket — confirm for your environment.
filepath = "gs://my-bigdata-project-ra/landing"

In [2]:
# Monthly yellow-taxi trip files for 2022, January through December.
# Names follow the pattern yellow_tripdata_2022-MM.parquet (zero-padded month).
filename_list = [
    f'yellow_tripdata_2022-{month:02d}.parquet'
    for month in range(1, 13)
]

In [3]:
def perform_EDA(df : pd.DataFrame, filename : str):
    """
    Print a basic exploratory summary of one data file to the console.

    Reports, in order: row count, duplicate-row count, DataFrame.info(),
    DataFrame.describe(), columns and rows containing nulls, the int64 and
    float64 column names, and the min/max of 'tpep_pickup_datetime'.

    Args:
        df: DataFrame to summarise. Must contain a 'tpep_pickup_datetime' column.
        filename: Label prefixed to each printed line so output from
            several files can be told apart.

    Returns:
        None. All results are written to stdout.

    Raises:
        KeyError: if df has no 'tpep_pickup_datetime' column.
    """
    # len(df) is the row count. df.count() would return a per-column Series of
    # non-null counts and print that whole Series here instead of one number.
    print(f"{filename} Number of records: {len(df)}" )
    # duplicated() flags every repeat of an earlier row, so its sum equals
    # len(df) - len(df.drop_duplicates()) without building a second full frame.
    print(f"{filename} Number of duplicate records: {int(df.duplicated().sum())}" )
    print(f"{filename} Info")
    # info() writes its report to stdout itself and returns None; wrapping it
    # in print() would emit a stray "None" line after the report.
    df.info()
    print(f"{filename} Describe")
    print(df.describe())
    print("\n" + "-"*50 + "\n")
    print(f"{filename} Columns with null values")
    print(df.columns[df.isnull().any()].tolist())
    # any(axis=1) marks rows with at least one null; summing the booleans counts them
    rows_with_null_values = df.isnull().any(axis=1).sum()
    print(f"{filename} Number of Rows with null values: {rows_with_null_values}" )
    integer_column_list = df.select_dtypes(include='int64').columns
    print(f"{filename} Integer data type columns: {integer_column_list}")
    float_column_list = df.select_dtypes(include='float64').columns
    print(f"{filename} Float data type columns: {float_column_list}")
    print("\n" + "-"*50 + "\n")
    # Pickup-time range: a quick check that the file covers the expected period
    print(f"{filename} Min pickup date: {df['tpep_pickup_datetime'].min()}")
    print(f"{filename} Max pickup date: {df['tpep_pickup_datetime'].max()}")
    print("\n" + "-"*100 + "\n")

In [None]:
# Run the EDA summary once for each monthly file, in list order.
for filename in filename_list:
    print(f"Working on file: {filename}")
    # Load this month's trip data from the landing folder
    df = pd.read_parquet(filepath + "/" + filename)
    perform_EDA(df, filename)
    # df is not deleted between iterations; it is simply rebound to the next
    # file. After the loop it still holds the last file in filename_list, and
    # the plotting cells below read that frame.

In [None]:
#Graph 1: number of pickups in each hour of the day
import matplotlib.pyplot as plt
# df is the frame left bound by the file-loading loop (the last file in filename_list).
# Adds a 'pickup_hour' column (0-23) to df; re-running the cell just overwrites it.
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
# One unit-wide bin per hour: edges 0, 1, ..., 24, with align='left' so each bar
# is centred on its integer hour tick. A plain bins=24 would split [min, max]
# into 24 equal slices (width 23/24 for hours 0-23), so bars drift off the ticks.
plt.hist(df['pickup_hour'], bins=range(25), align='left', edgecolor='black')
plt.title('Pickup Hours Histogram')
plt.xlabel('Hour of Day')
plt.ylabel('Number of Pickups')
plt.xticks(range(0, 24))
plt.show()


In [None]:
#Graph 2: fare amount against tip amount for every row of df (no filtering)
import seaborn as sns
import matplotlib.pyplot as plt
# Explicit figure/axes pair; seaborn draws onto ax and the labels are set on it directly.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='fare_amount', y='tip_amount', ax=ax)
ax.set_title('Fare Amount vs Tip Amount')
ax.set_xlabel('Fare Amount')
ax.set_ylabel('Tip Amount')
plt.show()

In [None]:
#Graph 3: fare vs tip, restricted to fares in [0, 120] and tips in [0, 50]
import matplotlib.pyplot as plt
# Two-step filter: keep rows whose fare is in range, then those whose tip is in range.
# between() includes both end points by default.
fares = df.loc[df['fare_amount'].between(0, 120)]
data_set = fares.loc[fares['tip_amount'].between(0, 50)]
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data_set['fare_amount'], data_set['tip_amount'])
ax.set_title('Scatter Plot of Fare Amount vs Tip Amount')
ax.set_xlabel('Fare Amount')
ax.set_ylabel('Tip Amount')
plt.show()