In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, corr, when


In [2]:
spark = SparkSession.builder.appName("Smartphone Usage Analysis").getOrCreate()

In [3]:
file_path = "Bigdata_Project.csv"  
data = spark.read.csv(file_path, header=True, inferSchema=True)

In [6]:
data.printSchema()
data.show(5)

root
 |-- Battery Efficiency(mAh/hr): integer (nullable = true)
 |-- User ID: integer (nullable = true)
 |-- Device Model: string (nullable = true)
 |-- Operating System: string (nullable = true)
 |-- App Usage Time (min/day): integer (nullable = true)
 |-- Screen On Time (hours/day): double (nullable = true)
 |-- Battery Drain (mAh/day): integer (nullable = true)
 |-- Number of Apps Installed: integer (nullable = true)
 |-- Data Usage (MB/day): integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- User Behavior Class: integer (nullable = true)

+--------------------------+-------+--------------+----------------+------------------------+--------------------------+-----------------------+------------------------+-------------------+---+------+-------------------+
|Battery Efficiency(mAh/hr)|User ID|  Device Model|Operating System|App Usage Time (min/day)|Screen On Time (hours/day)|Battery Drain (mAh/day)|Number of Apps Installed|Data Us

In [22]:
goal1 = data.groupBy("Age", "Gender").agg(
    avg(col("App Usage Time (min/day)")).alias("avg_usage_time"),
    count("User ID").alias("user_count")
)
goal1.show()

+---+------+------------------+----------+
|Age|Gender|    avg_usage_time|user_count|
+---+------+------------------+----------+
| 38|female|             360.5|         6|
| 56|female|214.83333333333334|         6|
| 48|  Male|             335.8|         5|
| 26|  Male|249.66666666666666|         6|
| 53|  Male| 207.1818181818182|        11|
| 47|  Male|368.77777777777777|         9|
| 32|  Male|249.55555555555554|         9|
| 18|  male|             152.0|         1|
| 32|female|223.33333333333334|         9|
| 36|female|             205.6|        10|
| 25|  Male|             173.0|        10|
| 24|  Male|             314.7|        10|
| 39|female| 312.6666666666667|         6|
| 24|female|            189.25|         4|
| 37|  Male|270.85714285714283|         7|
| 45|  Male|206.77777777777777|         9|
| 33|  Male|             248.6|         5|
| 49|  Male|             382.6|         5|
| 55|female|             351.8|        10|
| 40|  Male|             313.6|        10|
+---+------

In [10]:
goal2 = data.select(
    col("Battery Efficiency(mAh/hr)").alias("battery_efficiency"),
    col("Device Model").alias("device_model"),
    col("Operating System").alias("operating_system"),
    col("Screen On Time (hours/day)").alias("screen_on_time")
).groupBy("device_model", "operating_system").agg(
    avg("battery_efficiency").alias("avg_battery_efficiency"),
    avg("screen_on_time").alias("avg_screen_on_time")
)

goal2.show()

+------------------+----------------+----------------------+------------------+
|      device_model|operating_system|avg_battery_efficiency|avg_screen_on_time|
+------------------+----------------+----------------------+------------------+
|Samsung Galaxy S21|         Android|     290.8045112781955| 5.318045112781958|
|         OnePlus 9|         Android|    301.29323308270676| 5.241353383458647|
|         iPhone 12|             iOS|    303.06164383561645|5.4308219178082195|
|    Google Pixel 5|         Android|    306.90845070422534|  5.07605633802817|
|      Xiaomi Mi 11|         Android|    305.06849315068496| 5.293150684931508|
+------------------+----------------+----------------------+------------------+



In [12]:
goal3 = data.groupBy(
    col("Number of Apps Installed").alias("number_of_apps_installed"),
    col("Age").alias("age"),
    col("Gender").alias("gender")
).agg(
    avg(col("Data Usage (MB/day)")).alias("avg_data_usage")
)

goal3.show()

+------------------------+---+------+--------------+
|number_of_apps_installed|age|gender|avg_data_usage|
+------------------------+---+------+--------------+
|                      33| 58|  Male|         369.0|
|                      54| 55|  Male|         987.0|
|                      91| 21|  Male|        2387.0|
|                      15| 39|female|         275.0|
|                      64| 56|  Male|        1053.0|
|                      79| 50|  Male|        1164.0|
|                      35| 34|  Male|         503.0|
|                      66| 53|  Male|        1459.0|
|                      74| 40|  Male|        1400.0|
|                      61| 19|female|        1246.0|
|                      12| 42|  Male|         164.0|
|                      71| 47|  Male|        1096.0|
|                      66| 23|female|        1200.0|
|                      55| 36|female|         965.0|
|                      90| 29|female|        2069.0|
|                      12| 56|  Male|         

In [14]:
goal4 = data.groupBy(
    col("Operating System").alias("operating_system"),
    col("Age").alias("age"),
    col("Gender").alias("gender")
).count()

goal4 = goal4.withColumnRenamed("count", "os_preferences")

goal4.show()

+----------------+---+------+--------------+
|operating_system|age|gender|os_preferences|
+----------------+---+------+--------------+
|         Android| 49|female|            11|
|         Android| 42|  Male|            11|
|         Android| 25|  Male|             9|
|         Android| 43|female|             9|
|         Android| 38|  Male|             2|
|             iOS| 31|female|             1|
|             iOS| 21|  Male|             2|
|         Android| 35|female|             6|
|             iOS| 34|female|             4|
|             iOS| 43|female|             1|
|             iOS| 58|female|             2|
|         Android| 45|female|             8|
|             iOS| 46|  Male|             2|
|         Android| 54|  Male|             7|
|             iOS| 26|female|             1|
|         Android| 56|female|             4|
|             iOS| 22|female|             4|
|         Android| 37|female|             9|
|             iOS| 29|  Male|             3|
|         

In [16]:
goal5 = data.groupBy(
    col("User Behavior Class").alias("app_category"),
    col("Age").alias("age")
).agg(
    count(col("User ID")).alias("total_installs"),
    avg(col("App Usage Time (min/day)")).alias("avg_app_usage_time")
).orderBy("age")

goal5.show()

+------------+---+--------------+------------------+
|app_category|age|total_installs|avg_app_usage_time|
+------------+---+--------------+------------------+
|           3| 18|             1|             201.0|
|           2| 18|             4|             125.5|
|           1| 18|             1|              75.0|
|           5| 18|             2|             489.0|
|           4| 18|             3| 383.6666666666667|
|           2| 19|             1|             155.0|
|           1| 19|             3|              52.0|
|           3| 19|             3|215.33333333333334|
|           5| 19|             1|             488.0|
|           4| 19|             4|             384.5|
|           1| 20|             2|              33.0|
|           3| 20|             5|             231.2|
|           4| 20|             3| 354.3333333333333|
|           2| 20|             3|             136.0|
|           5| 20|             4|             528.5|
|           4| 21|             5|             

In [18]:
goal6 = data.groupBy(
    col("Age").alias("age"),
    col("Gender").alias("gender")
).agg(
    avg(col("Screen On Time (hours/day)")).alias("avg_screen_on_time"),
    avg(col("Data Usage (MB/day)")).alias("avg_data_usage"),
    avg(col("Battery Efficiency(mAh/hr)")).alias("avg_battery_efficiency")
)

goal6.show()

+---+------+------------------+------------------+----------------------+
|age|gender|avg_screen_on_time|    avg_data_usage|avg_battery_efficiency|
+---+------+------------------+------------------+----------------------+
| 38|female| 6.183333333333334|            1201.0|     297.6666666666667|
| 56|female| 4.433333333333333| 790.8333333333334|     322.6666666666667|
| 48|  Male| 6.360000000000001|            1242.6|                 272.2|
| 26|  Male| 5.283333333333334|1086.8333333333333|     282.8333333333333|
| 53|  Male| 4.545454545454545| 750.3636363636364|                 304.0|
| 47|  Male| 7.366666666666666|1311.7777777777778|    265.55555555555554|
| 32|  Male| 4.355555555555556| 790.5555555555555|     288.8888888888889|
| 18|  male|               3.7|             429.0|                 268.0|
| 32|female| 4.366666666666666|             729.0|     338.6666666666667|
| 36|female|              4.46|             731.8|                 278.8|
| 25|  Male|              3.41|       