In [3]:
from .utils.spark_session import SparkSessionManager
import os
import yaml
import logging
from datetime import datetime
from utils.spark_session import SparkSessionManager
from src.ingest import RawDataLoader
from src.clean import DataCleaner
from src.aggregate import DataAggregator
from pyspark.sql.functions import col, count, avg, sum as spark_sum, date_trunc,\
    weekofyear, year, round as spark_round, when
from pyspark.sql import SparkSession
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

ImportError: attempted relative import with no known parent package

In [None]:
def load_config(config_path: str = "config/config.yaml"):
    """Read the notebook configuration from a YAML file.

    Args:
        config_path: Path to the YAML file. A relative path is resolved
            against the kernel's current working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the contents are not valid YAML.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so non-ASCII values load the same way on every machine.
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)

In [None]:
# Build the Spark session from the "spark" section of the loaded config.
config = load_config()
spark_settings = config['spark']
spark = SparkSessionManager.get_spark_session(
    app_name=spark_settings['app_name'],
    memory=spark_settings['memory'],
)

In [None]:

# Shell escape: list the contents of data/gold/weekly_stars before reading it.
!ls data/gold/weekly_stars

In [None]:
# Gold layer: read the two Parquet datasets that the cells below analyse.
weekly_stars, checkins_vs_stars = (
    spark.read.parquet("data/gold/weekly_stars"),
    spark.read.parquet("data/gold/checkins_vs_stars"),
)


In [None]:
# Row-count summary of the two gold tables; each .count() runs a Spark job.
print("=== DATA QUALITY SUMMARY ===")

weekly_record_total = weekly_stars.count()
print(f"Weekly Records: {weekly_record_total:,}")

businesses_with_checkins = checkins_vs_stars.filter('total_checkins > 0').count()
print(f"Businesses with Check-ins: {businesses_with_checkins:,}")


In [None]:
# Mean of avg_stars_weekly per review_week for 2021.
# The aggregation runs in Spark over every 2021 row and only the small
# per-week result is collected. Collecting a limit() sample first would not be
# equivalent: an unordered limit returns an arbitrary subset of rows, so the
# curve could change between runs.
weekly_pd = (
    weekly_stars
    .filter(col('review_year') == 2021)
    .groupBy('review_week')
    .agg(avg('avg_stars_weekly').alias('avg_stars_weekly'))
    .orderBy('review_week')  # Spark groups are unordered; sort for a left-to-right line
    .toPandas()
)

plt.figure(figsize=(14, 6))
plt.plot(weekly_pd['review_week'], weekly_pd['avg_stars_weekly'])
plt.title('Average Stars by Week')
plt.xlabel('Week')
plt.ylabel('Average Stars')
plt.show()

In [None]:
# Scatter of star rating against check-ins per review.
# Rows with 20,000 or more total check-ins are excluded, then the 1,000
# businesses with the most check-ins among the rest are collected to pandas.
checkins_vs_stars_filtered = checkins_vs_stars.filter(col('total_checkins') < 20000)
checkins_pd = (
    checkins_vs_stars_filtered
    .orderBy('total_checkins', ascending=False)
    .limit(1000)
    .toPandas()
)

plt.figure(figsize=(10, 6))
plt.scatter(checkins_pd['stars'], checkins_pd['checkins_per_review'], s=5)

# The y label names the column that is actually plotted (checkins_per_review,
# not total_checkins).
plt.xlabel("stars")
plt.ylabel("checkins_per_review")
plt.title("Check-ins vs Star Rating")
plt.show()

In [None]:
# Preview the first ten rows of the check-ins vs stars table.
checkins_vs_stars.show(n=10)

In [None]:
# The twenty rows with the highest total_checkins, printed without truncating
# long column values.
top_businesses = (
    checkins_vs_stars
    .orderBy(col('total_checkins').desc())
    .limit(20)
)
top_businesses.show(truncate=False)

In [None]:
# Weekly rows for a single business, ordered by weekly review count (highest first).
# The name ends in a typographic apostrophe (U+2019). It is written as an escape
# so the literal cannot be corrupted if the notebook file is re-encoded.
# NOTE(review): assumes the stored business_name uses U+2019 — confirm against the data.
weekly_stars.filter(col('business_name') == "Pat O'Brien\u2019s").orderBy("review_count_weekly", ascending=False).show()

# TODO: year-over-year comparison for a single week, e.g.
# weekly_stars.filter("review_week_number = 47 AND review_year IN (2023, 2024)")

In [None]:
# Silver layer: read the check-in and business Parquet datasets, then preview
# the first ten check-in rows.
checkin_df, business_df = (
    spark.read.parquet("data/silver/checkin"),
    spark.read.parquet("data/silver/business"),
)
checkin_df.show(n=10)

In [None]:
# Count non-null check-in timestamps per business and preview ten rows.
checkins_by_business = checkin_df.groupBy("business_id")
checkin_counts = checkins_by_business.agg(
    count("checkin_timestamp").alias("total_checkins")
)
checkin_counts.show(n=10)

In [None]:
# Per-business check-in totals (count of non-null check-in timestamps).
checkin_counts = checkin_df.groupBy("business_id").agg(
    count("checkin_timestamp").alias("total_checkins")
)

# Column expressions, named up front so the pipeline below reads top to bottom.
# Businesses with no matching check-in rows come out of the left join with a
# null total; that null is replaced by 0.
total_checkins_expr = (
    when(col("total_checkins").isNull(), 0)
    .otherwise(col("total_checkins"))
    .alias("total_checkins")
)
checkins_per_review_expr = spark_round(
    col("total_checkins") / col("review_count"), 2
)
# Star buckets: >= 4.0 is "High", >= 2.5 is "Medium", everything else "Low".
star_category_expr = (
    when(col("stars") >= 4.0, "High")
    .when(col("stars") >= 2.5, "Medium")
    .otherwise("Low")
)

# Left join keeps every business, including those without check-ins.
business_with_checkins = business_df.join(checkin_counts, "business_id", "left")
result = (
    business_with_checkins
    .select(
        "business_id",
        "business_name",
        "city",
        "state",
        "stars",
        "review_count",
        total_checkins_expr,
    )
    .withColumn("checkins_per_review", checkins_per_review_expr)
    .withColumn("star_category", star_category_expr)
)

result.orderBy(col("total_checkins").desc()).show(10)

In [None]:
# Five rows of the gold table with the highest total_checkins.
checkins_vs_stars.orderBy(col('total_checkins').desc()).show(n=5)

In [None]:
# Stop the Spark session; cells that use `spark` will fail if re-run after this.
spark.stop()