In [1]:
import re
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

In [4]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("Teen_Addictions")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
# Tip to reader: use WARN for development, ERROR in prod

In [5]:
df = spark.read.csv(
    path="dataset/teen_phone_addiction_dataset.csv",
    header=True,
    mode="PERMISSIVE",
    inferSchema=True
)

In [6]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- School_Grade: string (nullable = true)
 |-- Daily_Usage_Hours: double (nullable = true)
 |-- Sleep_Hours: double (nullable = true)
 |-- Academic_Performance: integer (nullable = true)
 |-- Social_Interactions: integer (nullable = true)
 |-- Exercise_Hours: double (nullable = true)
 |-- Anxiety_Level: integer (nullable = true)
 |-- Depression_Level: integer (nullable = true)
 |-- Self_Esteem: integer (nullable = true)
 |-- Parental_Control: integer (nullable = true)
 |-- Screen_Time_Before_Bed: double (nullable = true)
 |-- Phone_Checks_Per_Day: integer (nullable = true)
 |-- Apps_Used_Daily: integer (nullable = true)
 |-- Time_on_Social_Media: double (nullable = true)
 |-- Time_on_Gaming: double (nullable = true)
 |-- Time_on_Education: double (nullable = true)
 |-- Phone_Usage_Purpose: string 

## Data Cleaning and Transformations

### Changing Column Names to `snake_case`

In [7]:
def to_snake_case(column_name):
    """
    Convert column name to snake_case (PEP8 style).
    Example: "Addiction_level" -> "addiction_level"
    """
    
    name = column_name.lower()
    return name

In [8]:
current_columns = df.columns
column_mappings = {col: to_snake_case(col) for col in current_columns}

for old_name, new_name in column_mappings.items():
    df = df.withColumnRenamed(old_name, new_name)

In [10]:
df.show(1)

+---+---------------+---+------+----------+------------+-----------------+-----------+--------------------+-------------------+--------------+-------------+----------------+-----------+----------------+----------------------+--------------------+---------------+--------------------+--------------+-----------------+-------------------+--------------------+-------------------+---------------+
| id|           name|age|gender|  location|school_grade|daily_usage_hours|sleep_hours|academic_performance|social_interactions|exercise_hours|anxiety_level|depression_level|self_esteem|parental_control|screen_time_before_bed|phone_checks_per_day|apps_used_daily|time_on_social_media|time_on_gaming|time_on_education|phone_usage_purpose|family_communication|weekend_usage_hours|addiction_level|
+---+---------------+---+------+----------+------------+-----------------+-----------+--------------------+-------------------+--------------+-------------+----------------+-----------+----------------+----------

## Exploration

In [11]:
df.createOrReplaceTempView('teen_dataset')

### I. Top 5 Addicted Teens

In [15]:
spark.sql(
    """
    select id, name, age, gender, school_grade, daily_usage_hours
    from 
        (select *, 
            dense_rank() over(partition by gender order by addiction_level desc) as rnk
        from teen_dataset) t
    where rnk <= 5
    """
).show()

+---+----------------+---+------+------------+-----------------+
| id|            name|age|gender|school_grade|daily_usage_hours|
+---+----------------+---+------+------------+-----------------+
|  1| Shannon Francis| 13|Female|         9th|              4.0|
|  2| Scott Rodriguez| 17|Female|         7th|              5.5|
| 11|    Bobby Sparks| 18|Female|        12th|              4.9|
| 13|    Sarah Nguyen| 19|Female|        10th|              7.4|
| 14|Melanie Phillips| 17|Female|         9th|              8.7|
| 17|    Matthew Webb| 15|Female|         8th|              5.4|
| 19|      Amy Greene| 19|Female|         9th|              7.9|
| 29|  Tammy Gallegos| 17|Female|        10th|              6.6|
| 60|  Kelly Carrillo| 17|Female|         7th|              6.3|
| 67|   Zachary Parks| 16|Female|        10th|              7.4|
| 76|      Brenda Lee| 13|Female|         8th|              6.5|
| 77| Jennifer Levine| 17|Female|         9th|              5.8|
| 82|      Andrew Kim| 19

### II. Average Sleep vs. Addiction

In [19]:
spark.sql(
    """
    select addiction_stage, round(avg(sleep_hours), 2) as avg_sleep
    from 
        (select id, name, age, gender, school_grade, daily_usage_hours, sleep_hours,
            (case 
                when addiction_level >= 0 and addiction_level < 3 then "light"
                when addiction_level >= 3 and addiction_level < 7 then "moderate"
                when addiction_level >= 7 then "severe"
            end) as addiction_stage     
        from teen_dataset) t
    group by addiction_stage
    """
).show()

+---------------+---------+
|addiction_stage|avg_sleep|
+---------------+---------+
|          light|     7.96|
|       moderate|     7.13|
|         severe|     6.38|
+---------------+---------+

