### Data Source

This project utilizes data from the **National Survey on Drug Use and Health, 2015 (NSDUH-2015)**, provided by the Substance Abuse and Mental Health Services Administration (SAMHSA). The dataset contains detailed information on drug use, mental health, and related factors among individuals in the United States for the year 2015.

- **Dataset**: [National Survey on Drug Use and Health, 2015 (NSDUH-2015)](https://www.datafiles.samhsa.gov/dataset/national-survey-drug-use-and-health-2015-nsduh-2015-ds0001)
- **Codebook (Description of Columns)**: [NSDUH 2015 Codebook](https://www.datafiles.samhsa.gov/sites/default/files/field-uploads-protected/studies/NSDUH-2015/NSDUH-2015-datasets/NSDUH-2015-DS0001/NSDUH-2015-DS0001-info/NSDUH-2015-DS0001-info-codebook.pdf)


In [None]:
import gc
import os
import shutil
from typing import List, Optional, Dict

import pandas as pd

In [2]:
# !pip3 install pandas
# !pip3 install pyarrow


### Data Fetch

In [3]:
def fetch_nsduh_data(year: int) -> Optional[pd.DataFrame]:
    """
    Downloads the NSDUH tab-separated data bundle for one year and loads it.

    The download URL is built by substituting ``year`` into a fixed SAMHSA
    URL template; the zipped TSV behind it is parsed directly by pandas.

    Args:
        year (int): The survey year to download.

    Returns:
        Optional[pd.DataFrame]: The loaded data, or None when anything goes
        wrong while building the URL, downloading, or parsing (the error is
        printed rather than raised).
    """
    source_template = "https://www.datafiles.samhsa.gov/sites/default/files/field-uploads-protected/studies/NSDUH-{year}/NSDUH-{year}-datasets/NSDUH-{year}-DS0001/NSDUH-{year}-DS0001-bundles-with-study-info/NSDUH-{year}-DS0001-bndl-data-tsv.zip"

    try:
        frame = pd.read_csv(
            source_template.format(year=year),
            sep='\t',
            compression='zip',
            low_memory=False,
        )
    except Exception as err:
        # Best-effort: report the failure and let the caller decide what to do.
        print(f"An error occurred for year {year}: {err}")
        return None
    return frame

In [4]:
def check_parquet_exists(years: List[int], output_dir: str) -> Dict[int, bool]:
    """
    Reports, per year, whether a ``year=<year>`` entry exists under output_dir.

    Args:
        years (List[int]): The years to look up.
        output_dir (str): The directory that holds the partitioned Parquet data.

    Returns:
        Dict[int, bool]: Maps each requested year to True when the path
                         ``<output_dir>/year=<year>`` exists, False otherwise.
    """
    return {
        year: os.path.exists(os.path.join(output_dir, f'year={year}'))
        for year in years
    }

In [5]:
def write_parquet(df: pd.DataFrame, year: int, output_dir: str, overwrite: bool = False) -> None:
    """
    Writes a DataFrame to Parquet format, partitioned by year.

    The partition for a year is expected at ``<output_dir>/year=<year>``.
    If that path already exists it is either left alone (``overwrite=False``)
    or removed and rewritten from scratch (``overwrite=True``).

    Note: a ``year`` column is added to ``df`` in place before writing, so the
    caller's DataFrame is modified. This avoids copying a large frame.

    Any error raised while removing or writing is caught and printed; the
    function never raises.

    Args:
        df (pd.DataFrame): The DataFrame to write.
        year (int): The year of the data.
        output_dir (str): The directory where Parquet files will be saved.
        overwrite (bool): If True, overwrite existing files. If False, skip existing files.
    """
    try:
        year_dir = os.path.join(output_dir, f'year={year}')
        if os.path.exists(year_dir):
            if not overwrite:
                # Writing again would add a second file to the same partition
                # and duplicate every row on read, so honour "skip existing".
                print(f"Data for year {year} already exists. Skipping.")
                return
            shutil.rmtree(year_dir)  # Remove existing directory to start fresh

        df['year'] = year  # Add the year column for partitioning
        # Write data to Parquet format with partitioning
        df.to_parquet(output_dir, partition_cols=['year'], index=False)

        print(f"Data for year {year} successfully saved to Parquet format in {output_dir}")
    except Exception as e:
        print(f"Error saving data to Parquet for year {year}: {e}")

In [6]:
def data_fetch(years_to_fetch: List[int], output_dir: str, overwrite: bool = False) -> None:
    """
    Downloads NSDUH data year by year and stores each year as Parquet.

    Years are handled strictly one at a time, and each DataFrame is released
    before the next download starts, so only one year is held in memory.

    Args:
        years_to_fetch (List[int]): The years to download.
        output_dir (str): The directory that receives the Parquet output.
        overwrite (bool): If False, a year whose ``year=<year>`` path already
            exists under output_dir is skipped without downloading. If True,
            every year is downloaded and handed to write_parquet with
            overwrite enabled.
    """
    try:
        for year in years_to_fetch:
            already_saved = (not overwrite) and os.path.exists(
                os.path.join(output_dir, f'year={year}')
            )
            if already_saved:
                print(f"Data for year {year} already exists. Skipping.")
                continue

            frame = fetch_nsduh_data(year)
            if frame is None:
                # The fetch helper already reported the failure; move on.
                continue

            print(f"Successfully fetched data for year: {year}")
            write_parquet(frame, year, output_dir, overwrite)
            # Drop the reference and collect now so the next year's download
            # does not overlap with this one in memory.
            del frame
            gc.collect()

        print("All requested years processed.")
    except Exception as err:
        print(f"An unexpected error occurred in the data_fetch function: {err}")

In [7]:
def read_parquet(input_dir: str, years: Optional[List[int]] = None) -> Dict[int, pd.DataFrame]:
    """
    Reads Parquet files for specified years from the input directory.

    Each year is read from ``<input_dir>/year=<year>``. Years whose path does
    not exist are reported with a warning and left out of the result.

    Args:
        input_dir (str): The directory where Parquet files are stored.
        years (Optional[List[int]]): A list of years to read. If None, read all
            available years, i.e. every ``year=<int>`` entry found in
            input_dir, in ascending order. Entries whose suffix is not an
            integer are ignored.

    Returns:
        Dict[int, pd.DataFrame]: A dictionary with years as keys and pandas DataFrames as values.
        Empty when input_dir does not exist or holds no matching data.
    """
    data_frames: Dict[int, pd.DataFrame] = {}

    if years is not None:
        years_to_read = years
    else:
        # Only scan the directory when the caller did not name the years.
        if not os.path.isdir(input_dir):
            print(f"Warning: Input directory {input_dir} does not exist")
            return data_frames

        discovered = []
        for entry in os.listdir(input_dir):
            if not entry.startswith('year='):
                continue
            try:
                discovered.append(int(entry.split('=', 1)[1]))
            except ValueError:
                # Not a numeric year partition; skip it instead of aborting the scan.
                continue
        # os.listdir order is arbitrary; sort so the result is deterministic.
        years_to_read = sorted(discovered)

    for year in years_to_read:
        year_path = os.path.join(input_dir, f'year={year}')
        if os.path.exists(year_path):
            data_frames[year] = pd.read_parquet(year_path)
        else:
            print(f"Warning: No data found for year {year}")

    return data_frames

In [8]:
# Survey years to process and the root of the partitioned Parquet output.
years = [2015, 2016, 2017, 2018, 2019]
output_directory = "../data/DS/NSDUH"

# Download each year and store it as Parquet; years already on disk are skipped.
data_fetch(years, output_directory, overwrite=False)

# Optional sanity check: preview what was saved, one year at a time so that
# only a single year's data is held in memory. Safe to remove if not needed.
for year in years:
    loaded = read_parquet(output_directory, [year])  # dict: year -> DataFrame
    if year in loaded:
        print(f"Data for year {year}:")
        print(loaded[year].head())
    # Release this year's data before loading the next one.
    del loaded
    gc.collect()

Successfully fetched data for year: 2015
Data for year 2015 successfully saved to Parquet format in ../data/DS/NSDUH
Successfully fetched data for year: 2016
Data for year 2016 successfully saved to Parquet format in ../data/DS/NSDUH
Successfully fetched data for year: 2017
Data for year 2017 successfully saved to Parquet format in ../data/DS/NSDUH
Successfully fetched data for year: 2018
Data for year 2018 successfully saved to Parquet format in ../data/DS/NSDUH
Successfully fetched data for year: 2019
Data for year 2019 successfully saved to Parquet format in ../data/DS/NSDUH
All requested years processed.
Data for year 2015:
   QUESTID2    FILEDATE  CIGEVER  CIGOFRSM  CIGWILYR  CIGTRY  CIGYFU  CIGMFU  \
0  25095143  02/15/2018        1        99        99      16    2014       1   
1  13005143  02/15/2018        1        99        99      15    9999      99   
2  67415143  02/15/2018        2        99        99     991    9991      91   
3  70925143  02/15/2018        2         3  

### Loading a second dataset: Study of Online Gaming and its effect on Mental Health (Anxiety).
**About this dataset**

This dataset consists of data collected as part of a survey among gamers worldwide. The questionnaire asked the kinds of questions that psychologists generally ask people who are prone to anxiety, social phobia, and low or no life satisfaction. It consists of several sets of questions asked as part of a psychological study. The original data was collated by Marian Sauter and Dejan Draschkow.

Kaggle dataset source: https://www.kaggle.com/datasets/divyansh22/online-gaming-anxiety-data

In [10]:
import io
import os
import zipfile

import pandas as pd
import requests

# URL to direct download the dataset
# We will need to later modify this approach to use Kaggle API with authentication key.
# Need to figure out to securely implement this approach - target for phase 2
# NOTE(review): the query string carries X-Goog-Date=20241008T143813Z and
# X-Goog-Expires=259200, so this signed link is presumably only valid for a few
# days after that date; expect the local fallback below to be used afterwards.

url = 'https://storage.googleapis.com/kaggle-data-sets/820200/1403222/compressed/GamingStudy_data.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241008%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241008T143813Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=1598505283cac2bfb3360235fe96f97ff4c110c3b2a70d1e04aa5c96f5d6ba3b7270ee85e73d7901c33feb8b3f872e9e212f9076f180a089c899c32cb879677885c9e55f22e2ea3f30a0dcd288ba9d759ef254388b2d752888679809085bb057c1f152ba976260333205e16131c02ab715ce1a2cc9b0fa06cf06206ed967fae11fdefbe37c30a9574fad339b2a83213ac9ef1400bd17a2415884d71c577e03afdf821478cfd03449d8fbce3779f83a8b323adf448995e59c37d8704a9327ad8614074685b89c149ca6cda7d4cd7c7c31fec916383d659745ac7f88f5786ed28bbbf2f1e0fd4e405765906c40239659f8aae9b044339ceed46aece7376b990210'

# Where a successful download is saved, and the older local copy that the
# original fallback pointed at.
downloaded_csv_path = r"../data/GamingStudy_data.csv"
legacy_csv_path = r"./data/gamedata.csv"

gaming_dat = None

# Try the download first. Network failures and corrupt archives are reported
# and treated like a non-200 response, so the local fallback still runs.
try:
    response = requests.get(url, timeout=60)

    # Check if the download was successful
    if response.status_code == 200:
        # Create a file-like object from the response content
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
            # Iterate over the files in the zip archive
            for file_name in zip_file.namelist():
                with zip_file.open(file_name) as extracted_file:
                    # Read the content of the extracted file
                    gaming_dat = pd.read_csv(extracted_file, encoding='ISO-8859-1')
                    # Make sure the target folder exists before saving the copy
                    os.makedirs(os.path.dirname(downloaded_csv_path), exist_ok=True)
                    gaming_dat.to_csv(downloaded_csv_path, index=False)
    else:
        print(f"Download failed with HTTP status {response.status_code}")
except (requests.RequestException, zipfile.BadZipFile) as err:
    print(f"Download failed: {err}")

if gaming_dat is not None:
    print('File downloaded and extracted successfully!')
else:
    print('Using the file stored locally in the folder named data')
    # Prefer the copy saved by an earlier successful download; otherwise fall
    # back to the legacy location.
    local_csv_path = downloaded_csv_path if os.path.exists(downloaded_csv_path) else legacy_csv_path
    gaming_dat = pd.read_csv(local_csv_path, encoding='ISO-8859-1')

gaming_dat.head()

File downloaded and extracted successfully!


Unnamed: 0,S. No.,Timestamp,GAD1,GAD2,GAD3,GAD4,GAD5,GAD6,GAD7,GADE,...,Birthplace,Residence,Reference,Playstyle,accept,GAD_T,SWL_T,SPIN_T,Residence_ISO3,Birthplace_ISO3
0,1,42052.00437,0,0,0,0,1,0,0,Not difficult at all,...,USA,USA,Reddit,Singleplayer,Accept,1,23,5.0,USA,USA
1,2,42052.0068,1,2,2,2,0,1,0,Somewhat difficult,...,USA,USA,Reddit,Multiplayer - online - with strangers,Accept,8,16,33.0,USA,USA
2,3,42052.0386,0,2,2,0,0,3,1,Not difficult at all,...,Germany,Germany,Reddit,Singleplayer,Accept,8,17,31.0,DEU,DEU
3,4,42052.06804,0,0,0,0,0,0,0,Not difficult at all,...,USA,USA,Reddit,Multiplayer - online - with online acquaintanc...,Accept,0,17,11.0,USA,USA
4,5,42052.08948,2,1,2,2,2,3,2,Very difficult,...,USA,South Korea,Reddit,Multiplayer - online - with strangers,Accept,14,14,13.0,KOR,USA
