In [33]:
import gc
import os
import shutil
from typing import List, Optional, Dict

import pandas as pd

In [None]:
# !pip3 install pandas
# !pip3 install pyarrow


### Data Fetch

In [34]:
def fetch_nsduh_data(year: int) -> Optional[pd.DataFrame]:
    """
    Download one year of NSDUH data and load it into a DataFrame.

    The year is substituted into a fixed SAMHSA URL template that points at a
    zipped, tab-separated file; pandas downloads and parses it in one call.

    Args:
        year (int): The survey year to download.

    Returns:
        Optional[pd.DataFrame]: The parsed data, or None when anything goes
        wrong (the error is printed rather than raised).
    """
    source_template = "https://www.datafiles.samhsa.gov/sites/default/files/field-uploads-protected/studies/NSDUH-{year}/NSDUH-{year}-datasets/NSDUH-{year}-DS0001/NSDUH-{year}-DS0001-bundles-with-study-info/NSDUH-{year}-DS0001-bndl-data-tsv.zip"

    try:
        return pd.read_csv(
            source_template.format(year=year),
            compression='zip',
            sep='\t',
            low_memory=False,
        )
    except Exception as err:
        # Best effort: report the failure and let the caller move on to the next year.
        print(f"An error occurred for year {year}: {err}")
    return None

In [35]:
def check_parquet_exists(years: List[int], output_dir: str) -> Dict[int, bool]:
    """
    Report, per year, whether a ``year=<year>`` partition is present on disk.

    Args:
        years (List[int]): The years to look up.
        output_dir (str): The directory that holds the ``year=<year>`` partitions.

    Returns:
        Dict[int, bool]: Maps each requested year to True when
        ``<output_dir>/year=<year>`` exists and False otherwise.
    """
    return {
        yr: os.path.exists(os.path.join(output_dir, f'year={yr}'))
        for yr in years
    }

In [36]:
def write_parquet(df: pd.DataFrame, year: int, output_dir: str, overwrite: bool = False) -> None:
    """
    Writes a DataFrame to Parquet format, partitioned by year.

    The data is written under ``<output_dir>/year=<year>/``. The caller's
    DataFrame is not modified: the ``year`` partition column is added to a
    shallow copy.

    Args:
        df (pd.DataFrame): The DataFrame to write.
        year (int): The year of the data.
        output_dir (str): The directory where Parquet files will be saved.
        overwrite (bool): If True, remove an existing partition for this year
            before writing. If False, leave an existing partition untouched and
            skip the write.

    Any error raised while removing or writing the partition is caught and
    printed rather than propagated, so a failure for one year does not stop
    the caller's loop.
    """
    try:
        year_dir = os.path.join(output_dir, f'year={year}')
        if os.path.exists(year_dir):
            if not overwrite:
                # Honour the documented "skip existing" contract: writing into a
                # partition that already holds data risks duplicating its rows.
                print(f"Data for year {year} already exists in {output_dir}. Skipping write.")
                return
            shutil.rmtree(year_dir)  # Remove existing directory to start fresh

        # A shallow copy shares the underlying data, so adding the partition
        # column here neither duplicates the frame nor alters the caller's object.
        partitioned = df.copy(deep=False)
        partitioned['year'] = year  # Add the year column for partitioning
        # Write data to Parquet format with partitioning
        partitioned.to_parquet(output_dir, partition_cols=['year'], index=False)

        print(f"Data for year {year} successfully saved to Parquet format in {output_dir}")
    except Exception as e:
        print(f"Error saving data to Parquet for year {year}: {e}")

In [37]:
def data_fetch(years_to_fetch: List[int], output_dir: str, overwrite: bool = False) -> None:
    """
    Download NSDUH data year by year and store each year as a Parquet partition.

    Only one year's DataFrame is held at a time: it is written, dropped, and
    garbage-collected before the next year is fetched.

    Args:
        years_to_fetch (List[int]): The years to download.
        output_dir (str): The directory that receives the ``year=<year>`` partitions.
        overwrite (bool): If True, re-download and rewrite years that are already
            on disk. If False, years whose partition directory exists are skipped.
    """
    try:
        for year in years_to_fetch:
            # Only touch the filesystem when overwrite is off, as the check is
            # irrelevant otherwise.
            needs_fetch = overwrite or not os.path.exists(
                os.path.join(output_dir, f'year={year}')
            )
            if not needs_fetch:
                print(f"Data for year {year} already exists. Skipping.")
                continue

            yearly_frame = fetch_nsduh_data(year)
            if yearly_frame is None:
                # The fetch helper has already reported the failure.
                continue

            print(f"Successfully fetched data for year: {year}")
            write_parquet(yearly_frame, year, output_dir, overwrite)
            del yearly_frame  # Remove the DataFrame from memory
            gc.collect()  # Force garbage collection

        print("All requested years processed.")
    except Exception as err:
        print(f"An unexpected error occurred in the data_fetch function: {err}")

In [38]:
def read_parquet(input_dir: str, years: Optional[List[int]] = None) -> Dict[int, pd.DataFrame]:
    """
    Reads Parquet files for specified years from the input directory.

    Each year is read from its own ``<input_dir>/year=<year>`` partition
    directory. Missing data is reported with a printed warning instead of an
    exception, so the result may contain fewer years than requested.

    Args:
        input_dir (str): The directory where Parquet files are stored.
        years (Optional[List[int]]): A list of years to read. If None, read all
            available years, i.e. every ``year=<int>`` entry found in input_dir,
            in ascending order.

    Returns:
        Dict[int, pd.DataFrame]: A dictionary with years as keys and pandas
        DataFrames as values. Empty when nothing could be read.

    NOTE(review): each frame is read from the partition directory itself, so it
    presumably does not carry a ``year`` column; the dictionary key is the year.
    Confirm if downstream code expects the column.
    """
    if years is not None:
        # The directory is only scanned when the caller did not name the years,
        # so a missing directory degrades to per-year warnings below.
        years_to_read = years
    elif os.path.isdir(input_dir):
        years_to_read = []
        for entry in os.listdir(input_dir):
            if not entry.startswith('year='):
                continue
            try:
                years_to_read.append(int(entry.split('=', 1)[1]))
            except ValueError:
                # Not an integer year partition (e.g. a stray backup folder); ignore it.
                continue
        years_to_read.sort()
    else:
        print(f"Warning: Input directory {input_dir} does not exist")
        years_to_read = []

    data_frames = {}
    for year in years_to_read:
        year_path = os.path.join(input_dir, f'year={year}')
        if os.path.exists(year_path):
            data_frames[year] = pd.read_parquet(year_path)
        else:
            print(f"Warning: No data found for year {year}")

    return data_frames

In [None]:
if __name__ == "__main__":
    # Example usage
    years = [2015, 2016, 2017, 2018, 2019]
    output_directory = "../data/DS/NSDUH"

    # Fetch and save data (years already on disk are skipped because overwrite=False)
    data_fetch(years, output_directory, overwrite=False)

    # Preview each saved year. Only one year is held in memory at a time: the
    # dictionary returned by read_parquet is dropped before the next year is read.
    eda_year = None
    for year in years:
        frames_by_year = read_parquet(output_directory, [year])
        if year in frames_by_year:
            print(f"Data for year {year}:")
            print(frames_by_year[year].head())
            eda_year = year  # remember the latest year that actually loaded
        del frames_by_year
        gc.collect()

    # The EDA cells below operate on a single DataFrame named `df`, so bind one
    # here instead of leaving the name deleted.
    # NOTE(review): using the most recent available year is an assumption; confirm
    # which year the EDA is meant to describe.
    if eda_year is not None:
        df = read_parquet(output_directory, [eda_year])[eda_year]
    else:
        print("Warning: no NSDUH data could be loaded; the EDA cells below need `df`.")

### EDA

In [None]:
# Preview the first five rows (the default for head()) of df.
df.head()

In [None]:
# (number of rows, number of columns) of df.
df.shape

In [None]:
# Print a summary of df: index range, column dtypes, non-null counts, memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for df's columns;
# by default only numeric columns are summarised when any are present.
df.describe()

In [7]:
# Save the first 100 rows as a small CSV sample, without the index column.
df.iloc[:100].to_csv(r"../data/sample_data.csv", index=False)

EDA done by Apurva Umredkar (50592382) using a new dataset

In [11]:
import pandas as pd
import re 
import warnings
# Suppress specific warnings
# These filters apply to every FutureWarning and UserWarning raised after this
# cell runs, regardless of which library emits them, so deprecation notices
# from pandas will not be shown either.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

In [12]:
# dataset URL
# Direct download link for the "walkable distance to public transportation"
# Excel workbook hosted on data.chhs.ca.gov.
walkingdist_url = r"https://data.chhs.ca.gov/dataset/5e391154-f07d-4e0c-ab1b-687a0c4c5d06/resource/d8d7d188-fbf8-413e-9a7f-639d3828c17e/download/walkable_distance_to_public_transportation.xlsx"

# read_excel accepts a URL, so the workbook is downloaded and parsed in one
# step; with no sheet_name given, the first sheet is read.
walkingdist2pt_data = pd.read_excel(walkingdist_url)
# Preview the first five rows of the raw data.
walkingdist2pt_data.head()

Unnamed: 0,ind_id,ind_definition,reportyear,race_eth_code,race_eth_name,geotype,geotypevalue,geoname,county_name,county_fips,...,rse,SAC_decile,SAC_RR,version,MTC_decile,MTC_RR,SD_decile,SD_RR,SC_decile,SC_RR
0,_x0035_1,Percent_x0020_of_x0020_population_x0020_residi...,_x0032_008,_x0033_,AfricanAm,CO,_x0030_6061,Placer,Placer,_x0030_6061,...,13.399974,,0.07046,2013-11-26 10:22:44,,,,,,
1,_x0035_1,Percent_x0020_of_x0020_population_x0020_residi...,_x0032_008,_x0031_,AIAN,CO,_x0030_6061,Placer,Placer,_x0030_6061,...,13.830066,,0.139059,2013-11-26 10:22:44,,,,,,
2,_x0035_1,Percent_x0020_of_x0020_population_x0020_residi...,_x0032_008,_x0032_,Asian,CO,_x0030_6061,Placer,Placer,_x0030_6061,...,9.217872,,0.033239,2013-11-26 10:22:44,,,,,,
3,_x0035_1,Percent_x0020_of_x0020_population_x0020_residi...,_x0032_008,_x0034_,Latino,CO,_x0030_6061,Placer,Placer,_x0030_6061,...,2.286029,,0.232768,2013-11-26 10:22:44,,,,,,
4,_x0035_1,Percent_x0020_of_x0020_population_x0020_residi...,_x0032_008,_x0037_,Multiple,CO,_x0030_6061,Placer,Placer,_x0030_6061,...,6.368321,,0.128243,2013-11-26 10:22:44,,,,,,


**Dataset description:** This table contains data on the percent of population residing within ½ mile of a major transit stop for four California regions and the counties, cities/towns, and census tracts within the regions. The percent was calculated using data from four metropolitan planning organizations (San Diego Association of Governments, Southern California Association of Governments, Metropolitan Transportation Commission, and Sacramento Council of Governments) and the U.S. Census Bureau. The table is part of a series of indicators in the Healthy Communities Data and Indicators Project of the Office of Health Equity. A strong and sustainable transportation system supports safe, reliable, and affordable transportation opportunities for walking, bicycling, and public transit, and helps reduce health inequities by providing more opportunities for access to healthy food, jobs, health care, education, and other essential services. Active and public transportation promote health by enabling individuals to increase their level of physical activity, potentially reducing the risk of heart disease and obesity, improving mental health, and lowering blood pressure. More information about the data table and a data dictionary can be found in the About/Attachments section.

In [13]:
# Data cleaning step 1
# the data seems to be XML encoded, which is messing with the data, let's fix that
def decode_xml_encoded_values(val):
    """
    Decode ``_xHHHH_`` escape sequences embedded in a string.

    Each ``_x`` + four hex digits + ``_`` run is replaced by the character with
    that code point (for example ``_x0035_`` becomes ``5`` and ``_x0020_``
    becomes a space). Values that are not strings are returned unchanged.

    Args:
        val: A single cell value of any type.

    Returns:
        The decoded string when val is a str, otherwise val itself.
    """
    if not isinstance(val, str):
        return val

    def _to_char(match):
        # Group 1 holds the four hex digits of the code point.
        return chr(int(match.group(1), 16))

    return re.sub(r'_x([0-9A-Fa-f]{4})_', _to_char, val)

# Decode every cell of the frame. DataFrame.applymap was deprecated in
# pandas 2.1 in favour of DataFrame.map, so prefer map where the installed
# pandas provides it and fall back to applymap on older versions.
# The check is made on the class, not the instance, so that a column named
# "map" could never be mistaken for the method.
if hasattr(pd.DataFrame, "map"):
    walkingdist2pt_data = walkingdist2pt_data.map(decode_xml_encoded_values)
else:
    walkingdist2pt_data = walkingdist2pt_data.applymap(decode_xml_encoded_values)
walkingdist2pt_data.head()

Unnamed: 0,ind_id,ind_definition,reportyear,race_eth_code,race_eth_name,geotype,geotypevalue,geoname,county_name,county_fips,...,rse,SAC_decile,SAC_RR,version,MTC_decile,MTC_RR,SD_decile,SD_RR,SC_decile,SC_RR
0,51,Percent of population residing within ½ mile o...,2008,3,AfricanAm,CO,6061,Placer,Placer,6061,...,13.399974,,0.07046,2013-11-26 10:22:44,,,,,,
1,51,Percent of population residing within ½ mile o...,2008,1,AIAN,CO,6061,Placer,Placer,6061,...,13.830066,,0.139059,2013-11-26 10:22:44,,,,,,
2,51,Percent of population residing within ½ mile o...,2008,2,Asian,CO,6061,Placer,Placer,6061,...,9.217872,,0.033239,2013-11-26 10:22:44,,,,,,
3,51,Percent of population residing within ½ mile o...,2008,4,Latino,CO,6061,Placer,Placer,6061,...,2.286029,,0.232768,2013-11-26 10:22:44,,,,,,
4,51,Percent of population residing within ½ mile o...,2008,7,Multiple,CO,6061,Placer,Placer,6061,...,6.368321,,0.128243,2013-11-26 10:22:44,,,,,,


In [14]:
# Print column dtypes and non-null counts to see which columns have missing
# values and which are still stored as generic objects.
walkingdist2pt_data.info()
# the definitions of the columns are given in the following link:
# https://data.chhs.ca.gov/dataset/5e391154-f07d-4e0c-ab1b-687a0c4c5d06/resource/a13e575e-abb8-47f6-bdc1-15bceb17befc/download/walkabledistancepublictransitdd.xlsx

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66010 entries, 0 to 66009
Data columns (total 28 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   ind_id          66010 non-null  object        
 1   ind_definition  66006 non-null  object        
 2   reportyear      66006 non-null  object        
 3   race_eth_code   66006 non-null  object        
 4   race_eth_name   66006 non-null  object        
 5   geotype         66006 non-null  object        
 6   geotypevalue    66006 non-null  object        
 7   geoname         65952 non-null  object        
 8   county_name     65943 non-null  object        
 9   county_fips     65943 non-null  object        
 10  region_name     66006 non-null  object        
 11  region_code     66006 non-null  object        
 12  pop_trans_acc   64824 non-null  float64       
 13  pop2010         66006 non-null  float64       
 14  p_trans_acc     64449 non-null  float64       
 15  LL

In [15]:
# Data cleaning step 2: drop columns that carry no information.
# Every row describes the same indicator ("Percent of population residing within
# ½ mile of a major transit stop") and ind_id is 51 throughout the dataset, so
# ind_id and ind_definition are irrelevant to the analysis.
# The result goes into a new frame; the decoded source frame is left as it is.
walkdist2pt_data_clean = walkingdist2pt_data.drop(columns=["ind_id", "ind_definition"])
walkdist2pt_data_clean.head()

Unnamed: 0,reportyear,race_eth_code,race_eth_name,geotype,geotypevalue,geoname,county_name,county_fips,region_name,region_code,...,rse,SAC_decile,SAC_RR,version,MTC_decile,MTC_RR,SD_decile,SD_RR,SC_decile,SC_RR
0,2008,3,AfricanAm,CO,6061,Placer,Placer,6061,Sacramento Area,8,...,13.399974,,0.07046,2013-11-26 10:22:44,,,,,,
1,2008,1,AIAN,CO,6061,Placer,Placer,6061,Sacramento Area,8,...,13.830066,,0.139059,2013-11-26 10:22:44,,,,,,
2,2008,2,Asian,CO,6061,Placer,Placer,6061,Sacramento Area,8,...,9.217872,,0.033239,2013-11-26 10:22:44,,,,,,
3,2008,4,Latino,CO,6061,Placer,Placer,6061,Sacramento Area,8,...,2.286029,,0.232768,2013-11-26 10:22:44,,,,,,
4,2008,7,Multiple,CO,6061,Placer,Placer,6061,Sacramento Area,8,...,6.368321,,0.128243,2013-11-26 10:22:44,,,,,,


In [None]:
# data cleaning step 3
# having race ethnicity code & name is redundant, we can save the codes in a dictionary and drop the column
# county name is a subset of geoname, redundant column should be dropped
