In [4]:
!pip install azure-storage-blob
!pip install boto3


Collecting azure-storage-blob
  Downloading azure_storage_blob-12.19.1-py3-none-any.whl (394 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m394.5/394.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-core<2.0.0,>=1.28.0 (from azure-storage-blob)
  Downloading azure_core-1.30.1-py3-none-any.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.4/193.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, azure-core, azure-storage-blob
Successfully installed azure-core-1.30.1 azure-storage-blob-12.19.1 isodate-0.6.1
Collecting boto3
  Downloading boto3-1.34.97-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB

In [5]:

import json
import boto3
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from io import StringIO
import pandas as pd
import numpy as np
import requests


In [6]:
# CSV download endpoint on data.cdc.gov for the dataset used in this notebook
url = "https://data.cdc.gov/api/views/hn4x-zwk7/rows.csv?accessType=DOWNLOAD"

# Fetch the CSV straight from the URL and parse it into the raw DataFrame
df_raw = pd.read_csv(filepath_or_buffer=url)

# Keep a CSV-text copy of the raw frame (row index left out)
csv_data = df_raw.to_csv(path_or_buf=None, index=False)

In [7]:
# Azure Functions
def azure_upload_blob(connect_str, container_name, blob_name, data):
    """Upload ``data`` to an Azure Storage blob, overwriting an existing blob.

    Args:
        connect_str: Connection string used to build the ``BlobServiceClient``.
        container_name: Name of the container that holds the blob.
        blob_name: Name of the blob to write.
        data: Payload passed unchanged to ``upload_blob``.

    Returns:
        None. A confirmation line is printed after the upload call returns.
    """
    # Use the service client as a context manager so it is closed as soon as
    # the upload finishes or fails, instead of lingering until garbage
    # collection.
    with BlobServiceClient.from_connection_string(connect_str) as service_client:
        target_blob = service_client.get_blob_client(container=container_name, blob=blob_name)
        target_blob.upload_blob(data, overwrite=True)
    print(f"Uploaded to Azure Blob: {blob_name}")

def azure_download_blob(connect_str, container_name, blob_name):
    """Download a blob from Azure Storage and return its full content.

    Args:
        connect_str: Connection string used to build the ``BlobServiceClient``.
        container_name: Name of the container that holds the blob.
        blob_name: Name of the blob to read.

    Returns:
        Whatever ``readall()`` yields for the blob's download stream, i.e. the
        complete blob content.
    """
    # Use the service client as a context manager so it is closed once the
    # content has been read; the read happens inside the block, before the
    # client is closed.
    with BlobServiceClient.from_connection_string(connect_str) as service_client:
        source_blob = service_client.get_blob_client(container=container_name, blob=blob_name)
        return source_blob.download_blob().readall()

In [8]:
# Preview the first rows of the raw frame to see which columns are available
df_raw.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2020,2020,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Race/Ethnicity,Hispanic,RACE,RACEHIS
1,2014,2014,GU,Guam,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(13.444304, 144.793731)",OWS,OWS1,Q036,VALUE,66,Education,High school graduate,EDU,EDUHSGRAD
2,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,,OWS,OWS1,Q036,VALUE,59,Income,"$50,000 - $74,999",INC,INC5075
3,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,,OWS,OWS1,Q037,VALUE,59,Income,Data not reported,INC,INCNR
4,2015,2015,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who achieve at least 300 min...,,Value,...,,PA,PA1,Q045,VALUE,59,Income,"Less than $15,000",INC,INCLESS15


In [9]:
# Columns that are not carried forward into the cleaned dataset
remove_columns = [
    'Data_Value_Unit', 'Data_Value_Type',
    'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
    'Total', 'Age(years)', 'Education', 'Gender', 'Income', 'Race/Ethnicity',
    'DataValueTypeID', 'Data_Value_Alt',
    'TopicID', 'Class', 'Datasource',
    'Sample_Size', 'Low_Confidence_Limit',
]

# Build the working frame from the raw one; df_raw itself is left untouched
df = df_raw.drop(labels=remove_columns, axis='columns')

In [10]:
# Old name -> new name. All renames are applied in a single pass, which is why
# 'StratificationCategory1' can be both a source label and a target label, and
# why 'TopicID' (the original column of that name was dropped earlier) can be
# reused for 'ClassID'.
column_renames = {
    'LocationAbbr': 'StateAbbr',
    'LocationDesc': 'State',
    'ClassID': 'TopicID',
    'StratificationCategoryId1': 'StratificationCategory1',
    'StratificationCategory1': 'StratificationCategory2',
    'Stratification1': 'Stratification',
    'StratificationID1': 'StratificationID',
}
df = df.rename(columns=column_renames)

In [11]:
# Drop every row that has a NaN in any of the remaining columns
df = df.dropna(axis=0, how='any')

In [12]:
def _geo_component(point, position):
    """Return one comma-separated part of a GeoLocation string as a float.

    The string is split on ',', the part at ``position`` is selected, and any
    surrounding parentheses are stripped before the float conversion.
    """
    return float(point.split(',')[position].strip('()'))


# First part -> Latitude, second part -> Longitude
df['Latitude'] = df['GeoLocation'].apply(_geo_component, position=0)
df['Longitude'] = df['GeoLocation'].apply(_geo_component, position=1)

# The combined text column is no longer needed once both parts are extracted
df = df.drop(columns='GeoLocation')

In [16]:
# Preview the working frame after the GeoLocation split.
# NOTE(review): this cell's execution count (16) is out of sequence with the
# neighbouring cells (12 and 13), so the saved output may not come from a
# top-to-bottom run; confirm with Restart Kernel -> Run All.
df.head(10)

Unnamed: 0,YearStart,YearEnd,StateAbbr,State,Topic,Question,Data_Value,High_Confidence_Limit,TopicID,QuestionID,LocationID,StratificationCategory2,Stratification,StratificationCategory1,StratificationID,Latitude,Longitude
1,2014,2014,GU,Guam,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,29.3,33.3,OWS,Q036,66,Education,High school graduate,EDU,EDUHSGRAD,13.444304,144.793731
5,2015,2015,GU,Guam,Physical Activity - Behavior,Percent of adults who achieve at least 150 min...,27.4,38.5,PA,Q044,66,Race/Ethnicity,Hispanic,RACE,RACEHIS,13.444304,144.793731
6,2012,2012,WY,Wyoming,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,48.5,64.9,OWS,Q037,56,Race/Ethnicity,American Indian/Alaska Native,RACE,RACENAA,43.235541,-108.10983
7,2012,2012,DC,District of Columbia,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,31.6,40.4,OWS,Q036,11,Education,Less than high school,EDU,EDUHS,38.890371,-77.031961
8,2015,2015,PR,Puerto Rico,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,38.1,43.8,PA,Q047,72,Income,"$25,000 - $34,999",INC,INC2535,18.220833,-66.590149
9,2011,2011,AL,Alabama,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,35.2,40.0,OWS,Q036,1,Age (years),25 - 34,AGEYR,AGEYR2534,32.840571,-86.631861
10,2015,2015,GU,Guam,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,30.5,35.6,PA,Q047,66,Education,High school graduate,EDU,EDUHSGRAD,13.444304,144.793731
11,2015,2015,RI,Rhode Island,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,40.2,47.4,OWS,Q037,44,Race/Ethnicity,Hispanic,RACE,RACEHIS,41.70828,-71.52247
13,2012,2012,WY,Wyoming,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,32.3,39.8,PA,Q047,56,Income,"Less than $15,000",INC,INCLESS15,43.235541,-108.10983
14,2020,2020,DE,Delaware,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,15.3,30.6,PA,Q047,10,Race/Ethnicity,Asian,RACE,RACEASN,39.008831,-75.577741


In [13]:
# Target column layout, grouped by theme (one group per line)
column_order = [
    # period
    'YearStart', 'YearEnd',
    # location
    'LocationID', 'StateAbbr', 'State', 'Latitude', 'Longitude',
    # topic and question
    'TopicID', 'Topic', 'QuestionID', 'Question',
    # stratification
    'StratificationID', 'StratificationCategory1', 'StratificationCategory2', 'Stratification',
    # values
    'Data_Value', 'High_Confidence_Limit',
]

# reindex lays the columns out in exactly this order
df = df.reindex(column_order, axis='columns')

In [14]:
# Remove the trailing 'High_Confidence_Limit' column by name instead of by
# position: slicing off "the last column" would silently drop a different
# column (e.g. 'Data_Value') if this cell were executed a second time.
# errors='ignore' turns a repeat run into a no-op rather than a KeyError.
df = df.drop(columns='High_Confidence_Limit', errors='ignore')

In [15]:
# Preview the cleaned frame in its final column order before uploading it
df.head(20)

Unnamed: 0,YearStart,YearEnd,LocationID,StateAbbr,State,Latitude,Longitude,TopicID,Topic,QuestionID,Question,StratificationID,StratificationCategory1,StratificationCategory2,Stratification,Data_Value
1,2014,2014,66,GU,Guam,13.444304,144.793731,OWS,Obesity / Weight Status,Q036,Percent of adults aged 18 years and older who ...,EDUHSGRAD,EDU,Education,High school graduate,29.3
5,2015,2015,66,GU,Guam,13.444304,144.793731,PA,Physical Activity - Behavior,Q044,Percent of adults who achieve at least 150 min...,RACEHIS,RACE,Race/Ethnicity,Hispanic,27.4
6,2012,2012,56,WY,Wyoming,43.235541,-108.10983,OWS,Obesity / Weight Status,Q037,Percent of adults aged 18 years and older who ...,RACENAA,RACE,Race/Ethnicity,American Indian/Alaska Native,48.5
7,2012,2012,11,DC,District of Columbia,38.890371,-77.031961,OWS,Obesity / Weight Status,Q036,Percent of adults aged 18 years and older who ...,EDUHS,EDU,Education,Less than high school,31.6
8,2015,2015,72,PR,Puerto Rico,18.220833,-66.590149,PA,Physical Activity - Behavior,Q047,Percent of adults who engage in no leisure-tim...,INC2535,INC,Income,"$25,000 - $34,999",38.1
9,2011,2011,1,AL,Alabama,32.840571,-86.631861,OWS,Obesity / Weight Status,Q036,Percent of adults aged 18 years and older who ...,AGEYR2534,AGEYR,Age (years),25 - 34,35.2
10,2015,2015,66,GU,Guam,13.444304,144.793731,PA,Physical Activity - Behavior,Q047,Percent of adults who engage in no leisure-tim...,EDUHSGRAD,EDU,Education,High school graduate,30.5
11,2015,2015,44,RI,Rhode Island,41.70828,-71.52247,OWS,Obesity / Weight Status,Q037,Percent of adults aged 18 years and older who ...,RACEHIS,RACE,Race/Ethnicity,Hispanic,40.2
13,2012,2012,56,WY,Wyoming,43.235541,-108.10983,PA,Physical Activity - Behavior,Q047,Percent of adults who engage in no leisure-tim...,INCLESS15,INC,Income,"Less than $15,000",32.3
14,2020,2020,10,DE,Delaware,39.008831,-75.577741,PA,Physical Activity - Behavior,Q047,Percent of adults who engage in no leisure-tim...,RACEASN,RACE,Race/Ethnicity,Asian,15.3


In [22]:

# Path to the JSON configuration file that holds the storage connection string
config_file_path = '/content/config.json'

# Load the JSON configuration file (explicit encoding so the read does not
# depend on the platform default)
with open(config_file_path, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

CONNECTION_STRING_AZURE_STORAGE = config["ConnectionString"]
CONTAINER_AZURE = "obesitybehavior"
blob_name = "obesitybehavior_data_cleaned.csv"

# Convert the cleaned DataFrame to CSV text; to_csv returns the text directly
# when no target path or buffer is given, so no intermediate StringIO is needed
data = df.to_csv(index=False)

# Create the BlobServiceClient as a context manager so it is closed once the
# upload has finished (or failed)
with BlobServiceClient.from_connection_string(CONNECTION_STRING_AZURE_STORAGE) as blob_service_client:
    # Get a blob client using the container name and blob name
    blob_client = blob_service_client.get_blob_client(container=CONTAINER_AZURE, blob=blob_name)

    # Upload the CSV data, replacing any existing blob of the same name
    blob_client.upload_blob(data, overwrite=True)

print(f"Uploaded {blob_name} to Azure Blob Storage in container {CONTAINER_AZURE}.")

Uploaded obesitybehavior_data_cleaned.csv to Azure Blob Storage in container obesitybehavior.
