In [4]:
import kagglehub

# Fetch the most recent published version of the CDC nutrition / physical
# activity / obesity dataset and keep the directory kagglehub returns.
dataset_handle = "spittman1248/cdc-data-nutrition-physical-activity-obesity"
path = kagglehub.dataset_download(dataset_handle)

# Echo the download location so the reader can see where the files live.
print(path)

/kaggle/input/cdc-data-nutrition-physical-activity-obesity


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, when, round, count
# Obtain the notebook's Spark session: getOrCreate() reuses a running session
# if one exists, otherwise it starts one under the given application name.
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .getOrCreate()
)
import matplotlib.pyplot as plt
from functools import reduce


In [29]:
# Load the survey CSV from the downloaded dataset directory. The first row is
# treated as the header and Spark infers each column's type from the data.
csv_file = path + "/Nutrition__Physical_Activity__and_Obesity_-_Behavioral_Risk_Factor_Surveillance_System.csv"
df = spark.read.options(header=True, inferSchema=True).csv(csv_file)
df.createOrReplaceTempView("my_table")

# Profile the data: print the distinct values of every column, one Spark SQL
# query per column. Backticks quote the column name inside the query.
columns = df.columns
for col_name in columns:
    print(f"Distinct values in '{col_name}':")
    distinct_values = spark.sql(f"SELECT DISTINCT `{col_name}` FROM my_table")
    distinct_values.show(truncate=False)


Distinct values in 'YearStart':
+---------+
|YearStart|
+---------+
|2015     |
|2013     |
|2014     |
|2012     |
|2011     |
|2016     |
+---------+

Distinct values in 'YearEnd':
+-------+
|YearEnd|
+-------+
|2015   |
|2013   |
|2014   |
|2012   |
|2011   |
|2016   |
+-------+

Distinct values in 'LocationAbbr':
+------------+
|LocationAbbr|
+------------+
|AZ          |
|SC          |
|LA          |
|MN          |
|NJ          |
|DC          |
|OR          |
|VA          |
|RI          |
|KY          |
|NH          |
|MI          |
|NV          |
|ID          |
|CA          |
|CT          |
|NE          |
|MT          |
|NC          |
|VT          |
+------------+
only showing top 20 rows

Distinct values in 'LocationDesc':
+--------------------+
|LocationDesc        |
+--------------------+
|Utah                |
|Hawaii              |
|Minnesota           |
|Ohio                |
|National            |
|Arkansas            |
|Oregon              |
|Texas               |
|North 

In [14]:
# Register the loaded DataFrame under the name used by the Spark SQL queries
# in the cells that follow (replaces any existing view with the same name).
health_view_name = "health_data"
df.createOrReplaceTempView(health_view_name)

In [15]:
# Mean adult-obesity percentage per location, highest first.
# NOTE(review): the query filters on Question only, so the average spans every
# stratification row present for that question (other cells filter on
# StratificationCategory1 values such as 'Gender' or 'Income') - confirm that
# mixing subgroup rows into one mean is intended.
state_obesity_sql = """
SELECT LocationDesc AS State, ROUND(AVG(Data_Value), 2) AS AvgObesityRate
FROM health_data
WHERE Question = 'Percent of adults aged 18 years and older who have obesity'
GROUP BY LocationDesc
ORDER BY AvgObesityRate DESC

"""
state_obesity_ranking = spark.sql(state_obesity_sql)
# show() with no arguments prints only the first 20 rows.
state_obesity_ranking.show()

+--------------+--------------+
|         State|AvgObesityRate|
+--------------+--------------+
| West Virginia|         35.18|
|   Mississippi|         35.11|
|      Arkansas|         34.56|
|     Louisiana|         34.39|
|       Alabama|         33.64|
|      Kentucky|          32.7|
|     Tennessee|         32.67|
|      Oklahoma|          32.4|
|Virgin Islands|         32.32|
|       Indiana|         31.84|
|  North Dakota|         31.48|
|South Carolina|         31.26|
|      Michigan|         31.24|
|        Kansas|          31.2|
|      Missouri|         31.19|
|     Wisconsin|         31.06|
|         Texas|          31.0|
|          Iowa|         30.69|
|          Ohio|         30.67|
|      Nebraska|         30.23|
+--------------+--------------+
only showing top 20 rows



In [16]:
# Mean adult-obesity percentage for each gender, using only the rows whose
# stratification category is 'Gender'.
gender_obesity_sql = """SELECT Stratification1 AS Gender, ROUND(AVG(Data_Value), 2) AS AvgObesity
FROM health_data
WHERE Question = 'Percent of adults aged 18 years and older who have obesity'
  AND StratificationCategory1 = 'Gender'
GROUP BY Stratification1
 """
gender_obesity = spark.sql(gender_obesity_sql)
gender_obesity.show()

+------+----------+
|Gender|AvgObesity|
+------+----------+
|Female|     28.51|
|  Male|     28.99|
+------+----------+



In [19]:
# Mean percentage of adults eating fruit less than once a day, per income
# bracket, ordered from the highest average to the lowest.
income_fruit_sql = """SELECT Stratification1 AS Income, ROUND(AVG(Data_Value), 2) AS AvgFruitUnderconsumption
FROM health_data
WHERE Question = 'Percent of adults who report consuming fruit less than one time daily'
  AND StratificationCategory1 = 'Income'
GROUP BY Stratification1
ORDER BY AvgFruitUnderconsumption DESC
"""
income_fruit_underconsumption = spark.sql(income_fruit_sql)
income_fruit_underconsumption.show()

+------------------+------------------------+
|            Income|AvgFruitUnderconsumption|
+------------------+------------------------+
| Less than $15,000|                   47.54|
| $15,000 - $24,999|                   43.24|
| $25,000 - $34,999|                   40.97|
| Data not reported|                   40.28|
| $35,000 - $49,999|                   39.69|
| $50,000 - $74,999|                   38.16|
|$75,000 or greater|                   34.79|
+------------------+------------------------+



In [21]:
# Mean percentage of adults eating vegetables less than once a day, per income
# bracket, ordered from the highest average to the lowest.
income_veg_sql = """SELECT Stratification1 AS Income, ROUND(AVG(Data_Value), 2) AS AvgVegUnderconsumption
FROM health_data
WHERE Question = 'Percent of adults who report consuming vegetables less than one time daily'
  AND StratificationCategory1 = 'Income'
GROUP BY Stratification1
ORDER BY AvgVegUnderconsumption DESC
"""
income_veg_underconsumption = spark.sql(income_veg_sql)
income_veg_underconsumption.show()

+------------------+----------------------+
|            Income|AvgVegUnderconsumption|
+------------------+----------------------+
| Less than $15,000|                 33.41|
| $15,000 - $24,999|                 28.28|
| Data not reported|                 26.38|
| $25,000 - $34,999|                 24.69|
| $35,000 - $49,999|                 22.17|
| $50,000 - $74,999|                 19.14|
|$75,000 or greater|                 15.34|
+------------------+----------------------+



In [22]:
# Mean adult-obesity percentage per education level, highest first, using only
# the rows whose stratification category is 'Education'.
spark.sql("""SELECT Stratification1 AS Education, ROUND(AVG(Data_Value), 2) AS AvgObesity
FROM health_data
WHERE Question = 'Percent of adults aged 18 years and older who have obesity'
  AND StratificationCategory1 = 'Education'
GROUP BY Stratification1
ORDER BY AvgObesity DESC
""").show(truncate=False)  # default show() cuts cells at 20 chars, which made the education labels unreadable

+--------------------+----------+
|           Education|AvgObesity|
+--------------------+----------+
|Less than high sc...|     33.03|
|High school graduate|     30.94|
|Some college or t...|     30.11|
|    College graduate|     22.79|
+--------------------+----------+



In [23]:
# Location with the single highest male adult-obesity value: take the maximum
# Data_Value per location over the male rows, sort descending, keep one row.
top_male_obesity_sql = """SELECT LocationDesc AS State, ROUND(MAX(Data_Value), 2) AS MaxObesityRate
FROM health_data
WHERE Question = 'Percent of adults aged 18 years and older who have obesity'
  AND StratificationCategory1 = 'Gender'
  AND Stratification1 = 'Male'
GROUP BY LocationDesc
ORDER BY MaxObesityRate DESC
LIMIT 1
"""
top_male_obesity_state = spark.sql(top_male_obesity_sql)
top_male_obesity_state.show()

+-------------+--------------+
|        State|MaxObesityRate|
+-------------+--------------+
|West Virginia|          37.9|
+-------------+--------------+



In [24]:
# Mean adult-overweight percentage for each survey start year, in
# chronological order.
# NOTE(review): no location or stratification filter is applied, so each
# yearly mean pools every row recorded for the question - confirm intended.
overweight_trend_sql = """SELECT YearStart, ROUND(AVG(Data_Value), 2) AS AvgOverweight
FROM health_data
WHERE Question = 'Percent of adults aged 18 years and older who have an overweight classification'
GROUP BY YearStart
ORDER BY YearStart
"""
overweight_by_year = spark.sql(overweight_trend_sql)
overweight_by_year.show()

+---------+-------------+
|YearStart|AvgOverweight|
+---------+-------------+
|     2011|        35.07|
|     2012|        35.21|
|     2013|        34.94|
|     2014|        34.71|
|     2015|        35.04|
|     2016|        34.61|
+---------+-------------+



In [25]:
# Mean adult-obesity percentage per age group, highest first, using only the
# rows whose stratification category is 'Age (years)'.
age_obesity_sql = """SELECT Stratification1 AS AgeGroup, ROUND(AVG(Data_Value), 2) AS AvgObesity
FROM health_data
WHERE Question = 'Percent of adults aged 18 years and older who have obesity'
  AND StratificationCategory1 = 'Age (years)'
GROUP BY Stratification1
ORDER BY AvgObesity DESC
"""
age_group_obesity = spark.sql(age_obesity_sql)
age_group_obesity.show()

+-----------+----------+
|   AgeGroup|AvgObesity|
+-----------+----------+
|    45 - 54|     33.85|
|    55 - 64|     33.56|
|    35 - 44|     32.44|
|    25 - 34|     27.16|
|65 or older|     26.76|
|    18 - 24|      16.4|
+-----------+----------+



In [26]:
# Side-by-side fruit and vegetable under-consumption per location. Each
# AVG(CASE WHEN ...) averages only the rows of its own question (the CASE
# yields NULL for the other question, and AVG ignores NULLs). Results are
# sorted by the fruit figure, highest first.
produce_underconsumption_sql = """SELECT
  LocationDesc AS State,
  ROUND(AVG(CASE WHEN Question = 'Percent of adults who report consuming fruit less than one time daily' THEN Data_Value END), 2) AS FruitUnderconsumption,
  ROUND(AVG(CASE WHEN Question = 'Percent of adults who report consuming vegetables less than one time daily' THEN Data_Value END), 2) AS VegUnderconsumption
FROM health_data
WHERE Question IN (
  'Percent of adults who report consuming fruit less than one time daily',
  'Percent of adults who report consuming vegetables less than one time daily'
)
GROUP BY LocationDesc
ORDER BY FruitUnderconsumption DESC
"""
produce_underconsumption_by_state = spark.sql(produce_underconsumption_sql)
# show() with no arguments prints only the first 20 rows.
produce_underconsumption_by_state.show()

+--------------+---------------------+-------------------+
|         State|FruitUnderconsumption|VegUnderconsumption|
+--------------+---------------------+-------------------+
|   Puerto Rico|                54.09|              23.65|
|      Oklahoma|                50.52|              26.31|
|   Mississippi|                 50.3|              31.37|
|      Arkansas|                48.66|              28.62|
|     Louisiana|                47.64|              32.24|
| West Virginia|                 47.6|              25.79|
|      Kentucky|                46.64|              25.54|
|       Alabama|                45.26|              26.26|
|South Carolina|                45.15|              27.12|
|     Tennessee|                45.02|              24.85|
|      Missouri|                44.35|              25.49|
|       Indiana|                43.15|              28.06|
|          Guam|                42.96|               26.5|
|       Georgia|                 42.9|              24.1

In [27]:
# The five locations with the lowest mean adult-obesity percentage.
# NOTE(review): as with the full ranking, no StratificationCategory1 filter is
# applied, so the mean pools every row for the question - confirm intended.
lowest_obesity_sql = """SELECT LocationDesc AS State, ROUND(AVG(Data_Value), 2) AS AvgObesity
FROM health_data
WHERE Question = 'Percent of adults aged 18 years and older who have obesity'
GROUP BY LocationDesc
ORDER BY AvgObesity ASC
LIMIT 5
"""
lowest_obesity_states = spark.sql(lowest_obesity_sql)
lowest_obesity_states.show()

+--------------------+----------+
|               State|AvgObesity|
+--------------------+----------+
|            Colorado|     21.82|
|District of Columbia|     23.68|
|              Hawaii|     24.36|
|       Massachusetts|     24.46|
|          California|     25.11|
+--------------------+----------+

