This notebook practices basic Spark-style DataFrame operations (select, filter, groupBy,
and aggregations) using pandas as a stand-in, to build intuition for how Spark processes structured data.

In [2]:
# Import libraries
import pandas as pd

# Toy event log standing in for a real events table.
# Each record is (event_type, product_name, brand, price).
event_columns = ["event_type", "product_name", "brand", "price"]
data = [
    ("purchase", "iPhone", "Apple", 999),
    ("purchase", "MacBook", "Apple", 1299),
    ("view", "Galaxy", "Samsung", 799),
    ("purchase", "Galaxy", "Samsung", 799),
    ("view", "Pixel", "Google", 899),
    ("purchase", "Pixel", "Google", 899),
]
events = pd.DataFrame(data, columns=event_columns)

# Projection: keep three of the four columns and preview the first rows.
projected = events.loc[:, ["event_type", "product_name", "price"]]
print(projected.head())

# Row filter: keep only the events whose price exceeds 100.
is_over_100 = events["price"] > 100
print(events[is_over_100])

# Aggregation per event_type: count() reports the non-null values in each
# remaining column, so every column shows the same number here.
per_event_type = events.groupby("event_type")
print(per_event_type.count())

# Brand ranking: number of rows per brand, largest first, capped at five brands.
brand_sizes = events.groupby("brand").size()
ranked_brands = brand_sizes.sort_values(ascending=False)
print(ranked_brands.head(5))

  event_type product_name  price
0   purchase       iPhone    999
1   purchase      MacBook   1299
2       view       Galaxy    799
3   purchase       Galaxy    799
4       view        Pixel    899
  event_type product_name    brand  price
0   purchase       iPhone    Apple    999
1   purchase      MacBook    Apple   1299
2       view       Galaxy  Samsung    799
3   purchase       Galaxy  Samsung    799
4       view        Pixel   Google    899
5   purchase        Pixel   Google    899
            product_name  brand  price
event_type                            
purchase               4      4      4
view                   2      2      2
brand
Apple      2
Google     2
Samsung    2
dtype: int64
