# Spark Notebook

@roman, pablo, javier

19 May, 2024

---
# Settings

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.window import Window

In [None]:
# start (or reuse) the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName("telecom")
    .getOrCreate()
)

In [None]:
# S3 locations used for reading and writing
NAME = "javier"  # suffix that selects the analytics bucket
FOLDER = "telecom"  # prefix inside BUCKET that holds the input parquet data
SAVE_BUCKET = "telecom-outputs"  # prefix inside BUCKET where results are written

BUCKET = f"s3://itam-analytics-{NAME}"

---
# Data

## S1: Read Data

In [None]:
# load the telecom parquet dataset from S3
telecom_path = f"{BUCKET}/{FOLDER}"
df_telecom = spark.read.parquet(telecom_path)

# inspect the column names and types
df_telecom.printSchema()

In [None]:
# total number of rows in df_telecom (triggers a full Spark job)
df_telecom.count()

In [None]:
# number of rows whose "locality" column is null
df_telecom.filter(col("locality").isNull()).count()

---
# Users Demographics

## S1: Where does each user live

In [None]:
# get the most visited postal_code per user as dataframe
#
# Visits are counted per (device_id, raw_sim_operator_name, postal_code) and
# exactly one row per device_id is kept: the combination with the highest count.
# Ordering by count alone leaves ties to be broken arbitrarily, so the selected
# row could change between runs; postal_code and raw_sim_operator_name are added
# as tie-breakers to make row_number() deterministic.
top_location_window = (
    Window
    .partitionBy("device_id")
    .orderBy(
        col("count").desc(),
        col("postal_code").asc(),
        col("raw_sim_operator_name").asc(),
    )
)

df_user_location = (
    df_telecom
    .groupBy("device_id", "raw_sim_operator_name", "postal_code").count()
    .withColumn("rank", F.row_number().over(top_location_window))
    .filter(col("rank") == 1)
    # helper columns are not part of the result
    .drop("rank", "count")
    )

In [None]:
# keep a single coordinate pair for every distinct
# (device_id, postal_code, raw_sim_operator_name) combination;
# no ordering is applied before de-duplicating, so which pair is kept is not controlled here
location_keys = ["device_id", "postal_code", "raw_sim_operator_name"]

df_lon_lat = (
    df_telecom
    .select(*location_keys, "client_longitude", "client_latitude")
    .dropDuplicates(location_keys)
)

In [None]:
# attach the coordinates to each user's top location
# NOTE: this rebinds df_user_location, so run this cell once after the cell that builds it
join_keys = ["device_id", "postal_code", "raw_sim_operator_name"]
df_user_location = df_user_location.join(df_lon_lat, on=join_keys, how="inner")

# preview the joined result
df_user_location.show()

---
# Write

In [None]:
# persist the user-location table as parquet, replacing any previous output at that path
output_path = f"{BUCKET}/{SAVE_BUCKET}/user_location"
df_user_location.write.mode("overwrite").parquet(output_path)

---
# Sandbox

In [None]:
# count number of unique localities
# (df_location_counts is never defined in this notebook, so the cell failed on a
# fresh kernel; df_telecom is the frame whose "locality" column is used above)
df_telecom.select("locality").distinct().count()