In [1]:
import re
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

In [4]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("Teen_Addictions")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
# Tip to reader: use WARN for development, ERROR in prod

In [5]:
df = spark.read.csv(
    path="dataset/teen_phone_addiction_dataset.csv",
    header=True,
    mode="PERMISSIVE",
    inferSchema=True
)

In [6]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- School_Grade: string (nullable = true)
 |-- Daily_Usage_Hours: double (nullable = true)
 |-- Sleep_Hours: double (nullable = true)
 |-- Academic_Performance: integer (nullable = true)
 |-- Social_Interactions: integer (nullable = true)
 |-- Exercise_Hours: double (nullable = true)
 |-- Anxiety_Level: integer (nullable = true)
 |-- Depression_Level: integer (nullable = true)
 |-- Self_Esteem: integer (nullable = true)
 |-- Parental_Control: integer (nullable = true)
 |-- Screen_Time_Before_Bed: double (nullable = true)
 |-- Phone_Checks_Per_Day: integer (nullable = true)
 |-- Apps_Used_Daily: integer (nullable = true)
 |-- Time_on_Social_Media: double (nullable = true)
 |-- Time_on_Gaming: double (nullable = true)
 |-- Time_on_Education: double (nullable = true)
 |-- Phone_Usage_Purpose: string 

## Data Cleaning and Transformations

### Changing Column Names to `snake_case`

In [7]:
def to_snake_case(column_name):
    """
    Convert column name to snake_case (PEP8 style).
    Example: "Addiction_level" -> "addiction_level"
    """
    
    name = column_name.lower()
    return name

In [8]:
current_columns = df.columns
column_mappings = {col: to_snake_case(col) for col in current_columns}

for old_name, new_name in column_mappings.items():
    df = df.withColumnRenamed(old_name, new_name)

In [59]:
df.show(3)

+---+---------------+---+------+------------+------------+-----------------+-----------+--------------------+-------------------+--------------+-------------+----------------+-----------+----------------+----------------------+--------------------+---------------+--------------------+--------------+-----------------+-------------------+--------------------+-------------------+---------------+
| id|           name|age|gender|    location|school_grade|daily_usage_hours|sleep_hours|academic_performance|social_interactions|exercise_hours|anxiety_level|depression_level|self_esteem|parental_control|screen_time_before_bed|phone_checks_per_day|apps_used_daily|time_on_social_media|time_on_gaming|time_on_education|phone_usage_purpose|family_communication|weekend_usage_hours|addiction_level|
+---+---------------+---+------+------------+------------+-----------------+-----------+--------------------+-------------------+--------------+-------------+----------------+-----------+----------------+----

### Working With Genders

In [70]:
df.select('gender').distinct().show()

+------+
|gender|
+------+
|Female|
| Other|
|  Male|
+------+



Did not know that we also have new gender, which is called `other`. New gender!

In [74]:
df = df.filter(F.col('gender') != 'Other')

## Exploration

In [11]:
df.createOrReplaceTempView('teen_dataset')

### I. Top 5 Addicted Teens

In [15]:
spark.sql(
    """
    select id, name, age, gender, school_grade, daily_usage_hours
    from 
        (select *, 
            dense_rank() over(partition by gender order by addiction_level desc) as rnk
        from teen_dataset) t
    where rnk <= 5
    """
).show()

+---+----------------+---+------+------------+-----------------+
| id|            name|age|gender|school_grade|daily_usage_hours|
+---+----------------+---+------+------------+-----------------+
|  1| Shannon Francis| 13|Female|         9th|              4.0|
|  2| Scott Rodriguez| 17|Female|         7th|              5.5|
| 11|    Bobby Sparks| 18|Female|        12th|              4.9|
| 13|    Sarah Nguyen| 19|Female|        10th|              7.4|
| 14|Melanie Phillips| 17|Female|         9th|              8.7|
| 17|    Matthew Webb| 15|Female|         8th|              5.4|
| 19|      Amy Greene| 19|Female|         9th|              7.9|
| 29|  Tammy Gallegos| 17|Female|        10th|              6.6|
| 60|  Kelly Carrillo| 17|Female|         7th|              6.3|
| 67|   Zachary Parks| 16|Female|        10th|              7.4|
| 76|      Brenda Lee| 13|Female|         8th|              6.5|
| 77| Jennifer Levine| 17|Female|         9th|              5.8|
| 82|      Andrew Kim| 19

### II. Average Sleep vs. Addiction

In [19]:
spark.sql(
    """
    select addiction_stage, round(avg(sleep_hours), 2) as avg_sleep
    from 
        (select id, name, age, gender, school_grade, daily_usage_hours, sleep_hours,
            (case 
                when addiction_level >= 0 and addiction_level < 3 then "light"
                when addiction_level >= 3 and addiction_level < 7 then "moderate"
                when addiction_level >= 7 then "severe"
            end) as addiction_stage     
        from teen_dataset) t
    group by addiction_stage
    """
).show()

+---------------+---------+
|addiction_stage|avg_sleep|
+---------------+---------+
|          light|     7.96|
|       moderate|     7.13|
|         severe|     6.38|
+---------------+---------+



### III. Screen Time Before Bed Impact
The idea is that I am going to compare `academic_performance` of teens who have `screen_time_before_bed` > 2 hours vs. those with less.

In [30]:
spark.sql(
    """
    select 
        watch_status, 
        round(avg(academic_performance), 2) as avg_performance
    from 
        (select 
            (case 
                when screen_time_before_bed > 2 then "watcher" 
                else "non_watcher" 
            end) as watch_status, 
            id, 
            name,
            academic_performance,
            screen_time_before_bed
        from teen_dataset
        order by screen_time_before_bed desc) t
    group by watch_status
    """
).show()

+------------+---------------+
|watch_status|avg_performance|
+------------+---------------+
|     watcher|          73.91|
| non_watcher|          74.97|
+------------+---------------+



### IV. Phone Purpose Distribution

In [33]:
df.select('phone_usage_purpose').distinct().show()

+-------------------+
|phone_usage_purpose|
+-------------------+
|          Education|
|             Gaming|
|           Browsing|
|              Other|
|       Social Media|
+-------------------+



In [37]:
spark.sql(
    """
    select 
        phone_usage_purpose,
        round(avg(academic_performance)) as avg_performance,
        round(avg(self_esteem)) as avg_esteem,
        count(*) as number_of_users
    from teen_dataset
    group by phone_usage_purpose 
    """
).show()

+-------------------+---------------+----------+---------------+
|phone_usage_purpose|avg_performance|avg_esteem|number_of_users|
+-------------------+---------------+----------+---------------+
|          Education|           75.0|       6.0|            602|
|             Gaming|           76.0|       6.0|            574|
|           Browsing|           75.0|       6.0|            627|
|              Other|           74.0|       6.0|            622|
|       Social Media|           75.0|       5.0|            575|
+-------------------+---------------+----------+---------------+



### V. Weekend vs. Weekday Usage (indicating binge usage)

In [46]:
spark.sql(
    """
    select 
        name,
        weekend_usage_hours,
        daily_usage_hours
    from teen_dataset
    where weekend_usage_hours > 2 * daily_usage_hours
    """
).show()

+------------------+-------------------+-----------------+
|              name|weekend_usage_hours|daily_usage_hours|
+------------------+-------------------+-----------------+
|   Shannon Francis|                8.7|              4.0|
|      Edward Avila|                9.1|              3.0|
|       John Cooper|                7.5|              3.0|
|  Michael Williams|                6.7|              3.0|
|  Patricia Johnson|                5.7|              2.7|
|     Bailey Flores|                9.3|              4.1|
|    Cameron Hansen|                6.2|              1.7|
|     Jessica Smith|               10.4|              3.3|
|       Brandi King|                5.8|              1.0|
|  Kathleen Hoffman|                7.6|              3.4|
|  Christine Hansen|               11.0|              4.1|
|       Maria Ochoa|                9.3|              3.8|
|     Brian Coleman|                5.3|              0.7|
|  Gregory Martinez|                6.7|              3.

### VI. Top Performers with High Usage

In [53]:
spark.sql(
    """
    select 
        id, 
        name, 
        gender, 
        daily_usage_hours, 
        academic_performance
    from 
        (select *
        from teen_dataset
        where daily_usage_hours > 6) t
    where academic_performance > 95
    order by daily_usage_hours desc
    """
).show()

+----+-------------------+------+-----------------+--------------------+
|  id|               name|gender|daily_usage_hours|academic_performance|
+----+-------------------+------+-----------------+--------------------+
|2671|   Deborah Gonzalez| Other|             11.5|                  96|
| 583|       Julia Rogers|Female|             10.5|                  97|
|1550|    Thomas Mckinney| Other|             10.3|                  98|
| 978|     Alyssa Jenkins|  Male|              9.4|                  97|
| 339|        Robert Hays| Other|              9.2|                  96|
| 596|      Brian Alvarez|Female|              9.1|                  99|
|2841|      Julia Terrell| Other|              9.1|                  96|
| 409|          Ian Riley| Other|              8.7|                  97|
|2980|      Travis Harris|  Male|              8.7|                  98|
| 754|    Matthew Gardner|Female|              8.5|                  98|
| 239|      Kelly Vasquez|  Male|              8.4|

### VII. Exercise vs. Social Media

In [58]:
spark.sql(
    """
    select 
        activity_status, 
        count(*) as no_of_staudents,
        round(avg(time_on_social_media), 2) as avg_social_media
    from
        (select 
            *,
            case 
                when exercise_hours >= 0 and exercise_hours < 1 then "not_active"
                when exercise_hours >= 1 and exercise_hours < 3 then "kind_of_active"
                when exercise_hours >= 3 then "active"
            end as activity_status
        from teen_dataset) t
    group by activity_status
    """
).show()

+---------------+---------------+----------------+
|activity_status|no_of_staudents|avg_social_media|
+---------------+---------------+----------------+
| kind_of_active|           1551|            2.47|
|     not_active|           1422|            2.53|
|         active|             27|             2.5|
+---------------+---------------+----------------+



### VII. Parental Control Effectiveness

In [60]:
df.select('parental_control').distinct().show()

+----------------+
|parental_control|
+----------------+
|               1|
|               0|
+----------------+



In [67]:
spark.sql(
    """
    select 
        control_status,
        round(avg(addiction_level), 2) as avg_addiction_level
    from 
        (select 
            *,
            case 
                when parental_control = 1 then 'parental_control'
                when parental_control = 0 then 'no_parental_control'
            end as control_status
        from teen_dataset) t
    group by control_status
    """
).show()

+-------------------+-------------------+
|     control_status|avg_addiction_level|
+-------------------+-------------------+
|   parental_control|               8.88|
|no_parental_control|               8.88|
+-------------------+-------------------+



### IX. High Anxiety – High Usage

In [80]:
spark.sql(
    """
    select
        id,
        name,
        age,
        gender,
        phone_checks_per_day,
        anxiety_level,
        dense_rank() over(order by anxiety_level desc) as rnk
    from teen_dataset
    where phone_checks_per_day > 100 and anxiety_level >= 7
    """
).show()

+---+--------------------+---+------+--------------------+-------------+---+
| id|                name|age|gender|phone_checks_per_day|anxiety_level|rnk|
+---+--------------------+---+------+--------------------+-------------+---+
|297|        Shawn Moreno| 19|  Male|                 131|           10|  1|
|901|    Tiffany Mcdonald| 19|  Male|                 126|           10|  1|
|303|Mrs. Jillian Penn...| 17|Female|                 109|           10|  1|
| 35|         Amy Carroll| 16|  Male|                 123|           10|  1|
|305|   Michelle Williams| 18|Female|                 125|           10|  1|
| 49|      Tyrone Cabrera| 18|  Male|                 119|           10|  1|
|421|          Tony Smith| 14|Female|                 114|           10|  1|
| 78|        Joshua Moore| 19| Other|                 140|           10|  1|
|449|    Lindsey Sheppard| 19|  Male|                 133|           10|  1|
|122|      Heather Wilson| 14|  Male|                 119|           10|  1|

### X. Depression vs. Gaming

In [85]:
spark.sql(
    """
    select 
        gamer_status, 
        count(*) as no_of_students,
        avg(depression_level) as avg_depression_lvl
    from
        (select
            *,
            case
                when time_on_gaming > 3 then 'gaymer_detected'
                else 'not_gaymer'
            end as gamer_status 
        from teen_dataset) t
    group by gamer_status
    """
).show()

+---------------+--------------+------------------+
|   gamer_status|no_of_students|avg_depression_lvl|
+---------------+--------------+------------------+
|gaymer_detected|           190| 5.342105263157895|
|     not_gaymer|          2810| 5.468327402135231|
+---------------+--------------+------------------+



### XI. Self Esteem Ranking

In [87]:
spark.sql(
    """
    select 
        school_grade, 
        round(avg(self_esteem), 2) as avg_esteem
    from teen_dataset
    group by school_grade
    order by avg_esteem
    """
).show()

+------------+----------+
|school_grade|avg_esteem|
+------------+----------+
|        10th|      5.42|
|        12th|      5.48|
|        11th|      5.49|
|         8th|      5.58|
|         7th|      5.63|
|         9th|      5.67|
+------------+----------+

