In [1]:
!pip install timezonefinder

  from pkg_resources import load_entry_point
Collecting timezonefinder
  Downloading timezonefinder-6.5.8-cp38-cp38-manylinux_2_17_x86_64.manylinux_2_5_x86_64.manylinux1_x86_64.manylinux2014_x86_64.whl (51.3 MB)
[K     |████████████████████████████████| 51.3 MB 57 kB/s  eta 0:00:0101
Collecting h3>4
  Downloading h3-4.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 44.5 MB/s eta 0:00:01
Installing collected packages: h3, timezonefinder
Successfully installed h3-4.2.1 timezonefinder-6.5.8


In [11]:
import os
import sys

from logger import LoggerConfig
from pyspark.sql import SparkSession, DataFrame, functions as F
from utils import add_closest_city, agg_events_by_geo_n_period, read_events, read_geo

In [2]:
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
os.environ["YARN_CONF_DIR"] = "/etc/hadoop/conf"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
os.environ["HADOOP_CONF_DIR"] = "/etc/hadoop/conf"

In [3]:
# DEV Constants
GEO_DIR = "/user/solovyovyu/geo.csv"
EVENTS_DIR = "/user/solovyovyu/data/geo/events"
OUT_PATH = "/user/solovyovyu/analytics"

In [4]:
logger = LoggerConfig.get_logger("Mart Zones")

In [5]:
# Create Spark session
try:
    spark = SparkSession.builder.appName("Mart Zones").getOrCreate()
    logger.info("SparkSession successfully created.")
except Exception as e:
    logger.error(f"Error creating SparkSession: {e}", exc_info=True)
    sys.exit(1)

2025-02-17 19:13:05,342 - Mart Zones - INFO - SparkSession successfully created.


In [12]:
geo_df = read_geo(GEO_DIR, spark, logger)

2025-02-17 19:14:33,112 - Mart Zones - ERROR - Error while reading geo data: [PATH_NOT_FOUND] Path does not exist: file:/user/solovyovyu/geo.csv.


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/user/solovyovyu/geo.csv.

In [13]:
messages_df = read_events(EVENTS_DIR, spark, logger, "message")

2025-02-17 19:14:47,175 - Mart Zones - ERROR - Error while reading events: [PATH_NOT_FOUND] Path does not exist: file:/user/solovyovyu/data/geo/events/date=message/*/date=message/event_type=None.


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/user/solovyovyu/data/geo/events/date=message/*/date=message/event_type=None.

In [8]:
reaction_df = read_events("reaction", EVENTS_DIR, spark, logger)

AttributeError: 'SparkSession' object has no attribute 'error'

In [9]:
subscription_df = read_events("subscription", EVENTS_DIR, spark, logger)

2025-02-17 11:34:21,930 - Mart Zones - INFO - Events subscription are read from /user/solovyovyu/data/geo/events.


In [10]:
# Registrations
from pyspark.sql.window import Window

window = Window.partitionBy("user_id").orderBy("datetime")

user_df = messages_df \
    .withColumn("rank", F.row_number().over(window)) \
    .filter(F.col("rank") == 1)

In [11]:
messages_with_city = add_closest_city(messages_df, geo_df, logger)

2025-02-17 11:34:22,125 - Mart Zones - INFO - Distance data added to dataframe.
2025-02-17 11:34:22,211 - Mart Zones - INFO - City closest to event added to dataframe.


In [12]:
mart_geo_messages = agg_events_by_geo_n_period(messages_with_city, "message", logger)

In [13]:
reactions_with_city = add_closest_city(reaction_df, geo_df, logger)

2025-02-17 11:34:22,548 - Mart Zones - INFO - Distance data added to dataframe.
2025-02-17 11:34:22,617 - Mart Zones - INFO - City closest to event added to dataframe.


In [14]:
mart_geo_reactions = agg_events_by_geo_n_period(reactions_with_city, "reaction", logger)

In [15]:
subscription_with_city = add_closest_city(subscription_df, geo_df, logger)

2025-02-17 11:34:22,860 - Mart Zones - INFO - Distance data added to dataframe.
2025-02-17 11:34:22,923 - Mart Zones - INFO - City closest to event added to dataframe.


In [16]:
mart_geo_subs = agg_events_by_geo_n_period(subscription_with_city, "subscription", logger)

In [17]:
users_with_city = add_closest_city(user_df, geo_df, logger)

2025-02-17 11:34:23,131 - Mart Zones - INFO - Distance data added to dataframe.
2025-02-17 11:34:23,183 - Mart Zones - INFO - City closest to event added to dataframe.


In [18]:
mart_geo_user = agg_events_by_geo_n_period(users_with_city, "user", logger)

In [19]:
mart_geo_user.show()

                                                                                

+----+-----+----------+----+---------+----------+
|year|month|      city|week|week_user|month_user|
+----+-----+----------+----+---------+----------+
|2022|    5|   Bunbury|  18|       14|        14|
|2021|    5|  Canberra|  17|       52|        86|
|2021|    5| Newcastle|  18|       55|       114|
|2021|    5|   Ipswich|  18|        9|        22|
|2021|    5|    Darwin|  18|       35|        80|
|2021|    5|  Ballarat|  17|       22|        28|
|2022|    5| Melbourne|  18|       68|        68|
|2021|    5|Wollongong|  18|       12|        31|
|2021|    5|   Bendigo|  17|      106|       181|
|2022|    5|    Darwin|  18|       31|        31|
|2021|    5|   Bendigo|  18|       75|       181|
|2021|    5| Newcastle|  17|       59|       114|
|2021|    5| Toowoomba|  18|       11|        32|
|2021|    5|   Bunbury|  17|       51|        69|
|2021|    5|Cranbourne|  17|       45|        78|
|2021|    5|  Maitland|  17|       79|       125|
|2021|    5| Melbourne|  17|      120|       182|


In [20]:
join_on = ["year", "month", "week", "city"]

mart_geo = mart_geo_user.join(mart_geo_messages, on=join_on, how="full") \
    .join(mart_geo_reactions, on=join_on, how="full") \
    .join(mart_geo_subs, on=join_on, how="full") \
    .select(
        F.col("month"),
        F.col("week"),
        F.col("city").alias("zone_id"),
        F.col("week_message"),
        F.col("week_reaction"),
        F.col("week_subscription"),
        F.col("week_user"),
        F.col("month_message"),
        F.col("month_reaction"),
        F.col("month_subscription"),
        F.col("month_user")
    )

In [23]:
mart_geo.write.mode("overwrite").parquet(f"{OUT_PATH}/mart_geo")

                                                                                