# Spark Notebook

@roman, pablo, javier

19 May, 2024

---
# Settings

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.window import Window

In [None]:
# Create the notebook's Spark session (getOrCreate returns an existing session if one is active)
spark = (
    SparkSession.builder
    .appName('telecom')
    .getOrCreate()
)

In [None]:
# --- input location on S3 ---
NAME = 'javier'    # interpolated into the bucket name below
FOLDER = 'telecom'  # folder inside the bucket that holds the parquet data
BUCKET = f"s3://itam-analytics-{NAME}"

# --- output location ---
# NOTE(review): not referenced by any other cell visible in this notebook
SAVE_BUCKET = 'telecom-outputs'

---
# Data

## S1: Read Data

In [None]:
# Load the telecom dataset (parquet files) from the S3 input folder
df_telecom = (
    spark.read
    .format("parquet")
    .load(f"{BUCKET}/{FOLDER}")
)

# Inspect the column names and types
df_telecom.printSchema()

In [None]:
# Total number of rows in the telecom dataset
n_rows_telecom = df_telecom.count()
n_rows_telecom

In [None]:
# Number of rows whose "locality" value is null
df_telecom.where(col("locality").isNull()).count()

---
# Users Demographics

## S1: Where does each user live

In [None]:
# For each device_id, keep ONE row for the postal_code with the most observations,
# carrying the operator name and coordinates of an observation in that postal_code.
# NOTE: the coordinates come from a single observation (chosen by the sort order below),
# not from an average over the postal_code.
df_user_location = (
    df_telecom
    .select("device_id", "postal_code", "raw_sim_operator_name", "client_longitude", "client_latitude")
    # number of rows this device has in this postal_code
    # (F.count skips nulls, so rows with a null postal_code get a count of 0)
    .withColumn("count_postal_code", F.count("postal_code").over(Window.partitionBy("device_id", "postal_code")))
    # row_number (not rank) so exactly one row survives per device: rank assigns 1 to every
    # raw observation of the winning postal_code and to every tied postal_code.
    # The trailing sort keys only break ties, so the selected row is reproducible across runs.
    .withColumn(
        "rank",
        F.row_number().over(
            Window.partitionBy("device_id").orderBy(
                col("count_postal_code").desc(),
                col("postal_code").asc(),
                col("client_longitude").asc(),
                col("client_latitude").asc(),
                col("raw_sim_operator_name").asc(),
            )
        ),
    )
    .filter(col("rank") == 1)
)

# preview the result
df_user_location.show()

In [None]:
# Count observations per (device_id, operator, postal_code)
df_user_locality = df_telecom.groupBy("device_id", "raw_sim_operator_name", "postal_code").count()

# Keep the most visited (operator, postal_code) combination per device: one row per device_id.
# NOTE: this rebinds df_user_location, replacing the frame assigned in the previous cell.
df_user_location = (
    df_user_locality
    .withColumn(
        "rank",
        F.row_number().over(
            Window.partitionBy("device_id").orderBy(
                col("count").desc(),
                # tie-breakers: without them row_number picks an arbitrary row when counts
                # are equal, so the result could change from run to run
                col("postal_code").asc(),
                col("raw_sim_operator_name").asc(),
            )
        ),
    )
    .filter(col("rank") == 1)
    # helper columns are no longer needed once the top row is selected
    .drop("rank", "count")
)

# preview the first 10 rows
df_user_location.show(10)

---
# Write

In [None]:
# Save the location-counts table to S3 as parquet.
# NOTE(review): df_location_counts is not defined in any cell visible in this notebook;
#   confirm the cell that builds it exists before running.
# NOTE(review): the output is written under {BUCKET}/{FOLDER}, the same prefix the input
#   parquet is read from, while SAVE_BUCKET from the settings cell is unused here;
#   confirm the intended destination.
# mode("overwrite") keeps the cell re-runnable: the default save mode raises an error
# when the target path already exists.
(
    df_location_counts
    .write
    .mode("overwrite")
    .parquet(f"{BUCKET}/{FOLDER}/location_counts")
)

---
# Sandbox

In [None]:
# Number of distinct values in the "locality" column of the location-counts table.
# NOTE(review): df_location_counts is not defined in any cell visible in this notebook;
#   confirm where it is built.
(
    df_location_counts
    .select("locality")
    .distinct()
    .count()
)