In [1]:
import polars as pl
import os
from datetime import datetime

In [2]:
# Input and output locations, both given relative to the notebook's
# working directory.
RAW_DATA_PATH = "../data/raw/hub_notifications.json"
TRANSFORMED_DATA_PATH = "../data/transformed/hub_notifications_transformed.csv"

# Read the raw JSON export into a polars DataFrame; every later cell
# rebinds `df` to the next stage of the transformation.
df = pl.read_json(source=RAW_DATA_PATH)

## Transformation 1: Drop columns

In [3]:
# Columns that are not carried into the transformed output.
# NOTE(review): the underscore-prefixed names look like storage-system
# metadata rather than notification content -- confirm with the data owner.
columns_to_drop = [
    'type',
    'classification',
    'role',
    'created_date',
    'updated_date',
    'created_by',
    '_rid',
    '_self',
    '_etag',
    '_attachments',
    '_ts',
    'updated_by',
]

# Rebind `df` to a frame without the columns listed above.
df = df.drop(columns_to_drop)

## Transformation 2: Explode on tile column

In [4]:
# Expand the `tile` column so each of its elements gets its own row; the
# other columns are repeated for every element.
# (assumes `tile` holds a list per record -- TODO confirm against the raw data)
df = df.explode(['tile'])

## Transformation 3: Add notification_type column (major/minor classification) and format start/end timestamps

In [5]:
# Derive the notification duration and its major/minor classification, and
# render the epoch-millisecond `start` / `end` columns as UTC strings.
#
# Everything is done in ONE `with_columns` call: all expressions in a single
# call are evaluated against the incoming frame, so the duration is computed
# from the numeric timestamps even though `start` / `end` are overwritten with
# their formatted string form in the same step.
MS_PER_DAY = 1000 * 60 * 60 * 24          # milliseconds in one day
MAJOR_THRESHOLD_DAYS = 2                  # strictly longer than this => 'major'
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

# Duration of each notification in (fractional) days.
duration_days_expr = (pl.col('end') - pl.col('start')) / MS_PER_DAY


def _epoch_ms_to_utc_string(column_name):
    """Build an expression that formats an epoch-millisecond column as a UTC string.

    Uses polars' native epoch conversion instead of a per-row Python lambda
    around the deprecated `datetime.utcfromtimestamp`. Missing values stay
    missing. The result keeps the original column name, so it replaces the
    column in place.
    """
    return (
        # Cast first so float-typed millisecond values are accepted as well.
        pl.from_epoch(pl.col(column_name).cast(pl.Int64), time_unit='ms')
        .dt.strftime(TIMESTAMP_FORMAT)
        .alias(column_name)
    )


df = df.with_columns(
    duration_days_expr.alias('time_diff_days'),
    # NOTE(review): rows where `start` or `end` is missing have no duration
    # and presumably fall through to 'minor' -- confirm that is intended.
    pl.when(duration_days_expr > MAJOR_THRESHOLD_DAYS)
    .then(pl.lit('major'))
    .otherwise(pl.lit('minor'))
    .alias('notification_type'),
    _epoch_ms_to_utc_string('start'),
    _epoch_ms_to_utc_string('end'),
)



In [None]:
# Persist the transformed frame as CSV, creating the destination folder
# first if it does not exist yet.
output_dir = os.path.dirname(TRANSFORMED_DATA_PATH)
os.makedirs(output_dir, exist_ok=True)

df.write_csv(TRANSFORMED_DATA_PATH)
print(f'Transformed data saved to {TRANSFORMED_DATA_PATH}')