In [None]:
# Display the active Spark session object as this cell's output.
# NOTE(review): `spark` is not created in any cell visible here — presumably the
# notebook kernel injects it (e.g. a PySpark kernel); confirm before running elsewhere.
spark

In [None]:
import io
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


In [None]:
# Read File: load the trusted results dataset and pull a small sample into pandas.
results_path = "gs://my-bigdata-project-ra/trusted/results/"
results_df = spark.read.parquet(results_path)

# Fraction of rows to keep. Spark treats this as a per-row probability, so the
# sampled row count is approximate rather than exactly 1% of the input.
sample_set = 0.01
# Fixed seed so that a Restart & Run All draws the same sample and therefore
# reproduces the same figures. Without it Spark picks a new random seed per run.
# NOTE(review): reproducibility also presumes the input files/partitioning are unchanged.
sample_seed = 42
sample_results_df = results_df.sample(
    withReplacement=False,
    fraction=sample_set,
    seed=sample_seed,
)
sample_results_df.show(5)

# Collect the sample to the driver as a pandas DataFrame; the plotting cells below
# all read from `pdf_results`.
pdf_results = sample_results_df.toPandas()


In [None]:
# Visual 1: mean tip per pickup weekday, drawn as a bar chart with value labels.
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

average_tip = (
    pdf_results
    .groupby('pickup_dayofweek')['tip_amount']
    .mean()
    .reset_index()
)

# groupby returns the keys in sorted order; re-type the column as an ordered
# categorical so the rows (and bars) run Mon..Sun instead.
# NOTE(review): any value not listed in `days` becomes NaN in the Categorical —
# this assumes the column holds exactly these three-letter labels; confirm upstream.
average_tip['pickup_dayofweek'] = pd.Categorical(
    average_tip['pickup_dayofweek'],
    categories=days,
    ordered=True,
)
average_tip = average_tip.sort_values('pickup_dayofweek')

fig, ax = plt.subplots(figsize=(12, 6))
barplot = sns.barplot(
    data=average_tip,
    x='pickup_dayofweek',
    y='tip_amount',
    color='blue',
    ax=ax,
)

# Leave 0.5 of headroom above the tallest bar so its value label is not clipped.
y_max = average_tip['tip_amount'].max() + 0.5
ax.set_ylim(0, y_max)

ax.set_title('Average Tip Amount by Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Average Tip Amount')

# Write each bar's height, to two decimals, centred just above the bar.
for bar in barplot.patches:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2
    ax.text(x_center, height, f'{height:.2f}',
            ha='center', va='bottom', color='black')
plt.show()


In [None]:
# Visual 2: pairwise correlation of selected trip columns, shown as an annotated heatmap.
# NOTE(review): PULocationID / DOLocationID look like identifier codes rather than
# measured quantities; their correlation coefficients may not be meaningful — confirm
# they belong in this list.
columns = [
    'total_amount',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'airport_fee',
]
correlation_df = pdf_results.loc[:, columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
# annot/fmt print each coefficient inside its cell, rounded to two decimals.
sns.heatmap(correlation_df, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Selected Features')
plt.show()


In [None]:
# Visual 3: mean tip by pickup hour of day, drawn as a line chart.
# Adds a `pickup_hour` column (0-23) to pdf_results, derived from the pickup timestamp.
pdf_results['pickup_hour'] = pd.to_datetime(pdf_results['tpep_pickup_datetime']).dt.hour

hourly_tip = pdf_results.groupby('pickup_hour')['tip_amount'].mean().reset_index()

# 12-hour clock labels indexed by hour of day:
# 0 -> '12 AM', 1 -> '1 AM', ..., 11 -> '11 AM', 12 -> '12 PM', ..., 23 -> '11 PM'.
# (`i % 12` is 0 at midnight and noon, which a 12-hour clock writes as 12.)
hours = [f"{(i % 12) or 12} {'AM' if i < 12 else 'PM'}" for i in range(24)]

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(hourly_tip['pickup_hour'], hourly_tip['tip_amount'], marker='o')

# Tick every hour of the day rather than only the hours present in the sample:
# the tick locations and the 24 labels then always have the same length, even if
# the sample happens to contain no trips for some hour.
ax.set_xticks(range(24))
ax.set_xticklabels(hours, rotation=45)

ax.set_title('Average Tip Amount by Pickup Hour')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Average Tip Amount')
ax.grid(False)
plt.show()


In [None]:
# Visual 4: mean tip per passenger count. Unlike a pre-aggregated chart, seaborn
# receives the raw sampled rows and applies the estimator (np.mean) per category itself.
fig, ax = plt.subplots(figsize=(12, 6))

# ci=None turns off the error bars.
# NOTE(review): newer seaborn releases deprecate `ci` in favour of `errorbar=None`;
# check the installed seaborn version before switching the keyword.
sns.barplot(
    x='passenger_count',
    y='tip_amount',
    data=pdf_results,
    estimator=np.mean,
    ci=None,
    color='blue',
    ax=ax,
)

ax.set_title('Average Tip Amount by Passenger Count')
ax.set_xlabel('Passenger Count')
ax.set_ylabel('Average Tip Amount')
# Rotate the x tick labels to vertical (acts on the current axes, i.e. `ax`).
plt.xticks(rotation=90)
plt.show()
