In [15]:
import polars as pl
import os
from datetime import datetime

In [16]:
# File locations used by this notebook. The paths are relative, so they
# resolve against the directory the kernel is running in.
NOTIFICATIONS_PATH = '../data/transformed/hub_notifications_transformed.csv'  # input: notifications CSV
TILES_PATH = '../data/raw/tiles.json'  # input: tiles JSON
JOINED_DATA_PATH = '../data/transformed/notifications_with_tiles.csv'  # output: joined CSV

# Load transformed notifications data
# The join cell further down reads the 'tile' column of this frame as the
# key that links each notification to a tile.
df_notifications = pl.read_csv(NOTIFICATIONS_PATH)

# Optional sanity checks (uncomment while exploring):
# print(f"Loaded notifications data: {df_notifications.shape[0]} records with {df_notifications.shape[1]} columns")
# df_notifications.head()

In [17]:
# Load tiles data
# The transformation cell below selects the 'id', 'name', 'description',
# 'roles' and 'source' columns from this frame.
df_tiles = pl.read_json(TILES_PATH)
# Optional sanity checks (uncomment while exploring):
# print(f"Loaded tiles data: {df_tiles.shape[0]} records with {df_tiles.shape[1]} columns")
# df_tiles.head()

## Transformation: Select, rename, and explode columns from tiles data

In [18]:
# Source column -> output column. Only these tile attributes are kept, each
# renamed with a 'tile_' prefix. Dict order determines the column order.
tile_column_aliases = {
    'id': 'tile_id',
    'name': 'tile_name',
    'description': 'tile_description',
    'roles': 'tile_roles',
    'source': 'tile_source',
}
df_tiles_selected = df_tiles.select(
    [
        pl.col(source_name).alias(target_name)
        for source_name, target_name in tile_column_aliases.items()
    ]
)

# Explode tile_roles column: one output row per element of tile_roles, with
# the other tile columns repeated (assumes tile_roles is a list column --
# TODO confirm against tiles.json).
df_tiles_exploded = df_tiles_selected.explode('tile_roles')
# Optional sanity checks (uncomment while exploring):
# print(f"After exploding tile_roles: {df_tiles_exploded.shape[0]} records")
# df_tiles_exploded.head()

In [19]:
# Join the datasets
# Build the join key on a separately named frame instead of rebinding
# df_notifications, so the frame loaded above stays exactly as it was read
# and this cell can be re-run without depending on its own previous effect.
# 'tile' is copied to 'tile_id' and then dropped up front, so the duplicate
# key column is not carried through the join.
df_notifications_keyed = (
    df_notifications
    .with_columns(pl.col('tile').alias('tile_id'))
    .drop('tile')
)

# Inner join: only notifications whose tile_id matches a tile are kept.
# df_tiles_exploded has one row per tile role, so a notification is repeated
# once for every role of its tile.
df_joined = df_notifications_keyed.join(
    df_tiles_exploded,
    on='tile_id',
    how='inner'
)

# Optional sanity checks (uncomment while exploring):
# print(f"Joined data: {df_joined.shape[0]} records with {df_joined.shape[1]} columns")
# df_joined.head()

In [20]:
# Write transformed data
# os.path.dirname returns '' for a bare filename and os.makedirs('') raises
# FileNotFoundError, so the parent directory is only created when the output
# path actually has one.
output_dir = os.path.dirname(JOINED_DATA_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_joined.write_csv(JOINED_DATA_PATH)
print(f'Joined data saved to {JOINED_DATA_PATH}')

Joined data saved to ../data/transformed/notifications_with_tiles.csv
