# Past DHS Data Dowload Script
This notebook should only be run only once to dowload all past dhs data relevant to Plan-EO. All other DHS data will be downloaded in a different script. 
We will have another script for updating the new DHS data as it comes.

This notebook may take a long time to run due to the volume of data. (Although during tests it is quite fast)

The notebook will run through the dowload URLS provided by DHS for all past surveys and output stata files or flat files for GPS data to their designated country folder and appropriate survey subfolders

# Setup

## Import Packages

In [2]:
#import packages
import pandas as pd
import numpy as np
import requests
import zipfile
import os
from pathlib import Path
import re
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import sys
from bs4 import BeautifulSoup

## Connect to DHS Site and Log In

In [3]:
# Configure Chrome to run in headless mode
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no UI)
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model (needed for some HPC environments)
chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource issues

# Initialize WebDriver
driver = webdriver.Chrome(options=chrome_options)
#open login page
driver.get("https://dhsprogram.com/data/dataset_admin/")

#wait for page to load
wait = WebDriverWait(driver, 10)

#locate email and password fields and enter credentials
email_field = driver.find_element(By.NAME, "UserName")
password_field = driver.find_element(By.NAME, "UserPass")

#input email and password
#I couldn't figure out how to get an .env hidden file on rivanna so the email and password are directly in the notebook
email_field.send_keys("joshcolston@gmail.com")
password_field.send_keys("Pl@n-EO2025")  
password_field.send_keys(Keys.RETURN)  # Press Enter to submit

time.sleep(5)

# Now on the project selection page

# Locate the dropdown element 
dropdown = Select(driver.find_element(By.NAME, "proj_id")) 
dropdown.select_by_value("205516") 
#print(driver.page_source)


# Wait for navigation
time.sleep(3)

wait = WebDriverWait(driver, 5)

# Scroll to the "Download Manager" button
download_button = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@type='submit' and @value='Download Manager']")))

# Scroll to the button
driver.execute_script("arguments[0].scrollIntoView(true);", download_button)

# Explicitly wait for the button to be clickable
download_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@type='submit' and @value='Download Manager']")))

# Try clicking the button
try:
    download_button.click()
except Exception as e:
    print(f"Error clicking the button: {e}")
    
    # If normal click fails, try clicking via JavaScript as a fallback
    driver.execute_script("arguments[0].click();", download_button)


# List of checkbox values (adjusted from labels to actual checkbox values)
checkbox_values = ["KR", "CR", "IR", "GE", "FL", "dt", "all-dhs"]

#save cookies from session to use with api call
selenium_cookies = driver.get_cookies()

#transfer cookies to the session for the api call
session = requests.Session()
for cookie in selenium_cookies:
    session.cookies.set(cookie['name'], cookie['value'])

## Read in Dowlnoad URLS

In [4]:
#read in the list of urls to dowload files for stata and gps files

# specify file path
file_path_statas = "DHS_files/urlslist_KR_IR_HR_stata.txt"
file_path_gps = "DHS_files/urlslist_GPS_flat.txt"
file_path_nonplaneo = "DHS_files/urlslist_DHS_all_not_planeo_stata.txt"


# Initialize an empty list to store URLs
urls = []

# Function to read URLs from a file
def read_urls_from_file(file_path):
    try:
        with open(file_path, 'r') as file:
            # Split the content into lines and remove any empty lines
            return [line.strip() for line in file.readlines() if line.strip()]
    except FileNotFoundError:
        print(f"Failed to retrieve data: File not found at {file_path}")
        return []
    except Exception as e:
        print(f"An error occurred while reading {file_path}: {e}")
        return []

# Read URLs from both files and combine them
urls.extend(read_urls_from_file(file_path_statas))
urls.extend(read_urls_from_file(file_path_gps))
urls.extend(read_urls_from_file(file_path_nonplaneo))

# Print the total number of URLs found and the first few URLs to verify
print(f"Found {len(urls)} URLs")
print("\nFirst 3 URLs:")
for url in urls[:3]:
    print(url)

Found 2921 URLs

First 3 URLs:
https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AFHR71DT.zip&Tp=1&Ctry_Code=AF&surv_id=471&dm=1&dmode=nm
https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AFIR71DT.zip&Tp=1&Ctry_Code=AF&surv_id=471&dm=1&dmode=nm
https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AFKR71DT.zip&Tp=1&Ctry_Code=AF&surv_id=471&dm=1&dmode=nm


In [5]:
# read in file of remaining urls for non stata file formats
file_path_not_stata = "DHS_files/urlslist_DHS_all_not_stata.txt"
# Initialize an empty list to store URLs
urls2 = []

#read in URLS
urls2.extend(read_urls_from_file(file_path_not_stata))

             
# Print the total number of URLs found and the first few URLs to verify
print(f"Found {len(urls2)} URLs")
print("\nFirst 3 URLs:")
for url in urls2[:3]:
    print(url)            

Found 9023 URLs

First 3 URLs:
https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AFBR71FL.zip&Tp=1&Ctry_Code=AF&surv_id=471&dm=1&dmode=nm
https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AFBR71SD.zip&Tp=1&Ctry_Code=AF&surv_id=471&dm=1&dmode=nm
https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AFBR71SV.zip&Tp=1&Ctry_Code=AF&surv_id=471&dm=1&dmode=nm


## Connect to APIs to Get Necessary Metadata

In [54]:
# get country names corresponding to each 2 letter country code from the DHS API

#refresh page to make sure api connection works

# Refresh the page to get a new session
driver.refresh()  
# Get new cookies
selenium_cookies = driver.get_cookies() 
# Clear old cookies
session.cookies.clear()  
# Update with fresh cookies
for cookie in selenium_cookies:
    session.cookies.set(cookie['name'], cookie['value'])  


#URL for country endpoint
country_url = "http://api.dhsprogram.com/rest/dhs/countries"

#read content from URL
r_country = session.get(country_url)

#check status to seee if we could access API
if r_country.status_code == 200:
    #get JSON data from API
    countries = json.loads(r_country.text)
    #convert JSON to dataframe
    countries_df = pd.DataFrame(countries['Data'])

#return error if we cannot access API
else:
    print(f"Failed to retrieve data: {r_country.status_code}")
                                

In [55]:
#extract just DHS country code and country name from the countries dataframe
country_code_name = countries_df[['DHS_CountryCode', 'CountryName', 'ISO3_CountryCode']]
country_code_name.head()

Unnamed: 0,DHS_CountryCode,CountryName,ISO3_CountryCode
0,AF,Afghanistan,AFG
1,AL,Albania,ALB
2,AO,Angola,AGO
3,AM,Armenia,ARM
4,AZ,Azerbaijan,AZE


In [8]:
# query survey API to get the correct survey information for file storage

driver.refresh()  # Refresh the page to get a new session
selenium_cookies = driver.get_cookies()  # Get new cookies
session.cookies.clear()  # Clear old cookies
for cookie in selenium_cookies:
    session.cookies.set(cookie['name'], cookie['value'])  # Update with fresh cookies


#URL for survey endpoint
survey_url = "https://api.dhsprogram.com/rest/dhs/surveys"



#read content from URL
r_survey = session.get(survey_url)

#check status to seee if we could access API
if r_survey.status_code == 200:
    #get JSON data from API
    survey = json.loads(r_survey.text)
    #convert JSON to dataframe
    survey_df = pd.DataFrame(survey['Data'])
#return error if we cannot access API
else:
    print(f"Failed to retrieve data: {r_survey.status_code}")

In [9]:
survey_df_info = survey_df[['SurveyNum', 'DHS_CountryCode', 'SurveyId']]
survey_df_info.head()

Unnamed: 0,SurveyNum,DHS_CountryCode,SurveyId
0,471,AF,AF2015DHS
1,327,AL,AL2008DHS
2,525,AL,AL2017DHS
3,282,AO,AO2006MIS
4,395,AO,AO2011MIS


In [10]:
# get data from datasets enpoint because survey endpoint was missing some information

#URL for dataset endpoint
dataset_url = "https://api.dhsprogram.com/rest/dhs/datasets"



#read content from URL
r_dataset = session.get(dataset_url)

#check status to seee if we could access API
if r_dataset.status_code == 200:
    #get JSON data from API
    dataset = json.loads(r_dataset.text)
    #convert JSON to dataframe
    dataset_df = pd.DataFrame(dataset['Data'])
#return error if we cannot access API
else:
    print(f"Failed to retrieve data: {r_dataset.status_code}")

In [11]:
dataset_df_i = dataset_df[['SurveyNum', 'DHS_CountryCode', 'SurveyId']].drop_duplicates()
dataset_df_i.head()

Unnamed: 0,SurveyNum,DHS_CountryCode,SurveyId
0,348,AF,AF2010OTH
21,471,AF,AF2015DHS
56,543,AF,AF2018SPA
113,327,AL,AL2008DHS
145,525,AL,AL2017DHS


In [12]:
# filter out survey numbers not present in df from survey endpoint
dataset_filtered = dataset_df_i[~dataset_df_i['SurveyNum'].isin(survey_df_info['SurveyNum'])]

#concatenate filtered df with survey df
dataset_df_info = pd.concat([survey_df_info, dataset_filtered], ignore_index = True)
dataset_df_info.head()

Unnamed: 0,SurveyNum,DHS_CountryCode,SurveyId
0,471,AF,AF2015DHS
1,327,AL,AL2008DHS
2,525,AL,AL2017DHS
3,282,AO,AO2006MIS
4,395,AO,AO2011MIS


# Functions to Download and Extract Data

In [13]:
# function to extract country code from the download URL
def extract_country_code(url):
    # Extract country code from URL (assuming format like "Filename=AFHR71FL.zip" where AF is country code)
    match = re.search(r'Filename=([A-Z]{2})', url)
    if match:
        #return the country code
        return match.group(1)
    #if it was unable to extract the country code, return unknown
    return "unknown"

In [14]:
#function to extract full file name from download URL
def extract_file_name(url):
    match = re.search (r'Filename=([A-Za-z0-9_]+\.zip)', url)
    if match:
        #return the filename
        return match.group(1)
    #if it was unable to extract the country code, return unknown
    return "unknown"

In [15]:
#extract survey number from the download URL
def extract_survey_num(url):
    # Extract surv_id from URL
    match = re.search(r'surv_id=(\d+)', url)
    if match:
        # Return the extracted survey ID
        return int(match.group(1))
    # If unable to extract the survey ID, return "unknown"
    return "unknown"

In [16]:
# get the Indian State codes

ind_state_codes = pd.read_csv('DHS_files/Ind_state_codes.csv')[['Code','State Name']]
ind_state_codes = ind_state_codes.fillna('NA')
ind_state_codes.head()


Unnamed: 0,Code,State Name
0,AP,Andhra Pradesh
1,AS,Assam
2,DL,Delhi
3,GO,Goa
4,HR,Haryana


In [17]:
# add indian state codes to the Country_code_name data frame

#make data frame with India country name and ISO3 country code
new_rows = pd.DataFrame({'DHS_CountryCode' : ind_state_codes['Code'].tolist(),
                         'CountryName' : 'India',
                         'ISO3_CountryCode' : 'IND'})

#concatenate with country code name
country_code_name = pd.concat([country_code_name, new_rows], ignore_index = True)

In [18]:
#function to download the data

def download_and_extract(url):
    #try to download the file
    try:
        #download file
        response = session.get(url)
        # Debug information
        print(f"Response status code: {response.status_code}")
        #print if we are having issues
        if response.status_code != 200:
            print(f"Response headers: {response.headers}")
            print(f"Content type: {response.headers.get('content-type')}")
        else:
            print("Successfull request to server")
        
        if response.status_code != 200:
            print(f"Failed to download: {url}")
            return
        
        #check status code and return error if the download is not available
        if response.status_code != 200:
            print(f"failed to download: {url}")
            return
        
        if 'text/html' in response.headers.get('content-type', ''):
            print(f"Received HTML instead of zip file. Content: {response.text[:200]}")
            return
        
        #extract the country code from the URL
        country_code = extract_country_code(url)
        
        #get country name from dataframe of country codes and names
        country_name = country_code_name[country_code_name['DHS_CountryCode'] == country_code]['CountryName'].iloc[0]
        # if we can't find the country name print error and return
        if pd.isna(country_name):
            print(f'No matching country name for country code from: {url}')
            return
        
        
        #get 3 letter country code
        ISO3_country_code = country_code_name[country_code_name['DHS_CountryCode'] == country_code]['ISO3_CountryCode'].iloc[0]

        #extract file name from url
        file_name = extract_file_name(url)
        #print error is file uanble to be extracted
        if file_name =='unknown':
            print(f'Failed to extract filename from: {url}')

        #extract survey number from url
        survey_num = extract_survey_num(url)

        #find corresponding SurveyID
        survey_id_row = dataset_df_info[dataset_df_info['SurveyNum']== survey_num]
        if not survey_id_row.empty:
            original_survey_id = survey_id_row['SurveyId'].iloc[0]
            #replace the first 2 letters of the SurveyID with the IS03 country code
            survey_id = ISO3_country_code + original_survey_id[2:]
        #print error if survey Id not found
        else:
            print(f"survey ID not found for file: {file_name}")
            return
        
        #format directory name as XXX_Country_Name
        dir_name = f"{ISO3_country_code}_{'_'.join(country_name.split())}"

        #create path to survey_data directory
        survey_dir = Path("..") / "Individual_country_data"/ dir_name / "03_Survey_data" / survey_id
        
        # Check if the directory exists, and create it if it doesn't
        survey_dir.mkdir(parents=True, exist_ok=True)
        
        #save zip file temporarily
        zip_path = survey_dir /"temp.zip"
        with open(zip_path, 'wb') as f:
            f.write(response.content)

        #create a sub directory named for the zip file
        zip_subdir = survey_dir / file_name[:-4] #removes the .zip from the file name

        #create a subdirectory for the extracted contents
        zip_subdir.mkdir(parents = True, exist_ok = True)
        
        #extract zip file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(zip_subdir)
            
        #remove temporary zip file
        zip_path.unlink()
        
        #print success message
        print(f"Successfully processed {file_name} file from: {url}")
    
    #print exception message
    except Exception as e:
        print(f"Error processing {url}: {str(e)}")

### New Function to Download the Remaining Error Files

In [19]:
#function to download the missing data
def download_and_extract_missing(url, merged_df):
    #try to download the file
    try:
        #download file
        response = session.get(url)
        # Debug information
        print(f"Response status code: {response.status_code}")
        #print if we are having issues
        if response.status_code != 200:
            print(f"Response headers: {response.headers}")
            print(f"Content type: {response.headers.get('content-type')}")
        else:
            print("Successfull request to server")
        
        if response.status_code != 200:
            print(f"Failed to download: {url}")
            return
        
        #check status code and return error if the download is not available
        if response.status_code != 200:
            print(f"failed to download: {url}")
            return
        
        if 'text/html' in response.headers.get('content-type', ''):
            print(f"Received HTML instead of zip file. Content: {response.text[:200]}")
            return
        
       
        #extract file name from url
        file_name = extract_file_name(url)
        #print error is file uanble to be extracted
        if file_name =='unknown':
            print(f'Failed to extract filename from: {url}')
    
        #look up matching row in merged_df table
        survey_id_row = merged_df[merged_df['file_name']== file_name]
        
        
        ISO3_country_code = survey_id_row['ISO3'].iloc[0]
        country_name = survey_id_row['Country'].iloc[0]
        survey_id = survey_id_row['folder'].iloc[0]
        
    
        #format directory name as XXX_Country_Name
        dir_name = f"{ISO3_country_code}_{'_'.join(country_name.split())}"
        
        
        #create path to survey_data directory
        survey_dir = Path("..") / "Individual_country_data"/ dir_name / "03_Survey_data" / survey_id
        
        # Check if the directory exists, and create it if it doesn't
        survey_dir.mkdir(parents=True, exist_ok=True)
        
        #save zip file temporarily
        zip_path = survey_dir /"temp.zip"
        with open(zip_path, 'wb') as f:
            f.write(response.content)

        #create a sub directory named for the zip file
        zip_subdir = survey_dir / file_name[:-4] #removes the .zip from the file name

        #create a subdirectory for the extracted contents
        zip_subdir.mkdir(parents = True, exist_ok = True)
        
        #extract zip file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(zip_subdir)
            
        #remove temporary zip file
        zip_path.unlink()
        
        #print success message
        print(f"Successfully processed {file_name} file from: {url}")
    
    #print exception message
    except Exception as e:
        print(f"Error processing {url}: {str(e)}")

# Function to Read Error Output Files

In [20]:
def parse_error_log(file_path):
    
    # Read the text file
    with open(file_path, "r") as file:
        lines = file.readlines()
        
    # Lists to store extracted data
    data = []
    url, response_code, status, error = None, None, None, None

    for line in lines:
        line = line.strip()

        # Detect processing line and extract URL
        if line.startswith("processing "):
            url = line.split("processing ")[1]
            response_code, status, error = None, None, None  # Reset for new entry

        # Extract response code
        elif "Response status code:" in line:
            match = re.search(r"Response status code: (\d+)", line)
            if match:
                response_code = match.group(1)

        # Detect success or failure
        elif "Successfully processed" in line:
            status = "Success"
        elif "Error processing" in line or "survey ID not found" in line:
            status = "Failure"
            error = line  

        # If we have a complete entry, store it
        if url and response_code and status:
            data.append([url, response_code, status, error])
            url, response_code, status, error = None, None, None, None  

    #return it as a dataframe
    return pd.DataFrame(data, columns=["URL", "Response Code", "Status", "Error"])

# Download Stata and GPS Files

**Do Not Rerun This Code**

## Test Code

In [None]:
test_url = "https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AFHR71DT.zip&Tp=1&Ctry_Code=AF&surv_id=471&dm=1&dmode=nm"
download_and_extract(test_url)

In [None]:
#test 5 at a time:
for url in urls[1:6]:
    print(f"processing {url}")
    download_and_extract(url)

## Cycle Through All Download Links 1

**Do Not Rerun This Code**

In [16]:
#cycle over the rest of the urls
for url in urls[6:len(urls)+1]:
    print(f"processing {url}")
    download_and_extract(url)

processing https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AOIR51DT.zip&Tp=1&Ctry_Code=AO&surv_id=282&dm=1&dmode=nm
Response status code: 200
Successfull request to server
Successfully processed AOIR51DT.zip file from: https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AOIR51DT.zip&Tp=1&Ctry_Code=AO&surv_id=282&dm=1&dmode=nm
processing https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AOIR62DT.zip&Tp=1&Ctry_Code=AO&surv_id=395&dm=1&dmode=nm
Response status code: 200
Successfull request to server
Successfully processed AOIR62DT.zip file from: https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AOIR62DT.zip&Tp=1&Ctry_Code=AO&surv_id=395&dm=1&dmode=nm
processing https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=AOIR71DT.zip&Tp=1&Ctry_Code=AO&surv_id=477&dm=1&dmode=nm
Response status code: 200
Successfull request to server
Successfully processed AOIR71DT.zip file from: htt

## Second Download Attempt to Handle Failed Downloads

**Do Not Rerun This Code**

In [None]:
# cycle through the list of failures agian, now that function have been updated
#save output of function to file

with open("DHS_Error_logs/error_output_2.txt", "a") as f:  # Use "a" if you want to append instead of overwrite
    original_stdout = sys.stdout
    sys.stdout = f  # Redirect stdout to file
    
    for url in fails:
        print(f"processing {url}")
        download_and_extract(url)

    sys.stdout = original_stdout  # Restore stdout

print("Function outputs saved to error_output_2.txt.")


## Third Download Attempt to Handle Failed Downloads

**Do Not Rerun This Code**

In [None]:
#run through fails2 URLS with updated dataset_df_info df that has added survey IDs and return output to text file
with open("DHS_Error_logs/error_output_3.txt", "a") as f:  # Use "a" if you want to append instead of overwrite
    original_stdout = sys.stdout
    sys.stdout = f  # Redirect stdout to file
    
    for url in fails2:
        print(f"processing {url}")
        download_and_extract(url)

    sys.stdout = original_stdout  # Restore stdout

print("Function outputs saved to output_3.txt.")

## Fourth Download Attempt to Handle Failed Downloads
**Do Not Rerun This Code**

In [78]:
#run through fails3 URLS with updated indian states df  and return output to text file
with open("DHS_Error_logs/error_output_4.txt", "a") as f: 
    original_stdout = sys.stdout
    sys.stdout = f  # Redirect stdout to file
    
    for url in fails3:
        print(f"processing {url}")
        download_and_extract(url)

    sys.stdout = original_stdout  # Restore stdout

print("Function outputs saved to error_output_4.txt.")

Function outputs saved to error_output_4.txt.


## Fith Download Attempt to Handle Failed Downloads
**Do Not Rerun Code**

In [109]:
# run it again to correctly read in all indian states
#run through fails4 URLS with updated indian states df  and return output to text file
with open("DHS_Error_logs/error_output_5.txt", "a") as f: 
    original_stdout = sys.stdout
    sys.stdout = f  # Redirect stdout to file
    
    for url in fails4:
        print(f"processing {url}")
        download_and_extract(url)

    sys.stdout = original_stdout  # Restore stdout

print("Function outputs saved to error_output_5.txt.")

Function outputs saved to error_output_5.txt.


## Test Missing Download Function

In [64]:
missing_urls = merged_df['URL'].tolist()
missing_urls

['https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=MLHR61DT.zip&Tp=1&Ctry_Code=ML&surv_id=387&dm=1&dmode=nm',
 'https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=OSIR01DT.zip&Tp=1&Ctry_Code=OS&surv_id=8&dm=1&dmode=nm',
 'https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=OSKR01DT.zip&Tp=1&Ctry_Code=OS&surv_id=8&dm=1&dmode=nm',
 'https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=TRHR71DT.zip&Tp=1&Ctry_Code=TR&surv_id=548&dm=1&dmode=nm',
 'https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=TRIR71DT.zip&Tp=1&Ctry_Code=TR&surv_id=548&dm=1&dmode=nm',
 'https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=TRKR71DT.zip&Tp=1&Ctry_Code=TR&surv_id=548&dm=1&dmode=nm',
 'https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=HTGE6AFLSR.zip&Tp=2&Ctry_Code=HT&surv_id=442&dm=1&dmode=nm',
 'https://dhsprogram.com/customcf/legacy/data/download_da

In [84]:
test_url = 'https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=MLHR61DT.zip&Tp=1&Ctry_Code=ML&surv_id=387&dm=1&dmode=nm'
download_and_extract_missing(test_url)

Response status code: 200
Successfull request to server
Successfully processed MLHR61DT.zip file from: https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=MLHR61DT.zip&Tp=1&Ctry_Code=ML&surv_id=387&dm=1&dmode=nm


## Download Remaining Failed Files (6th Attempt)
**Do Not Rerun Code**

In [86]:
#run through the new download_and_extract functions for manually fixed download errors and output to text file
with open("DHS_Error_logs/error_output_6.txt", "a") as f: 
    original_stdout = sys.stdout
    sys.stdout = f  # Redirect stdout to file
    
    for url in missing_urls[1:len(missing_urls)+1]:
        print(f"processing {url}")
        download_and_extract_missing(url, merged_df)

    sys.stdout = original_stdout  # Restore stdout

print("Function outputs saved to error_output_6.txt.")

Function outputs saved to error_output_6.txt.


# Read Error Logs and Address Download Errors of Stata and GPS Files

## Deal with Download Errors 1

In [21]:
df = parse_error_log("DHS_Error_logs/error_output_1")

In [22]:
#get the urls for the failures and save to a list
fails = df[df['Status'] == 'Failure']['URL'].tolist()


## Deal with Download Errors 2

In [23]:
df2 = parse_error_log("DHS_Error_logs/error_output_2.txt")

In [24]:
remaining_failures = len(df2[df2['Status'] == 'Failure'])
print(f'There are still {remaining_failures} files that have failed to download.')

There are still 245 files that have failed to download.


In [25]:
#get list of urls that failed to download correctly
fails2 = df2[df2['Status'] == 'Failure']['URL'].tolist()

In [26]:
fails2[:5]

['https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=DRHR5ADT.zip&Tp=1&Ctry_Code=DR&surv_id=331&dm=1&dmode=nm',
 'https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=DRHR6ADT.zip&Tp=1&Ctry_Code=DR&surv_id=490&dm=1&dmode=nm',
 'https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=DRIR5ADT.zip&Tp=1&Ctry_Code=DR&surv_id=331&dm=1&dmode=nm',
 'https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=DRIR6ADT.zip&Tp=1&Ctry_Code=DR&surv_id=490&dm=1&dmode=nm',
 'https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename=DRKR5ADT.zip&Tp=1&Ctry_Code=DR&surv_id=331&dm=1&dmode=nm']

In [27]:
# check missing survey numbers
survey_num_miss = []
for url in fails2:
    survey_num_miss.append(extract_survey_num(url))

#print the missing survey numbers
print(len(set(survey_num_miss))) 

survey_num_miss_unique = set(survey_num_miss)
    

33


In [28]:
# check for matches between the missing survey numbers and the survey numbers from the "surveys" API endpoint
exists = dataset_df_info[dataset_df_info['SurveyNum'].isin(survey_num_miss_unique)]['SurveyNum'].tolist()
print(f'The following survey ids should have downloaded correctly: {exists}')

The following survey ids should have downloaded correctly: [50, 156, 331, 490]


## Deal with Download Errors 3

In [29]:
# check the error_output_3 for errors

df3 = parse_error_log("DHS_Error_logs/error_output_3.txt")

In [30]:
fails3 = df3[df3['Status'] == 'Failure']['URL'].tolist()
len(fails3)

230

In [31]:
#check missing survey numbers again
survey_num_miss = []
for url in fails3:
    survey_num_miss.append(extract_survey_num(url))

#print the missing survey numbers
print(len(set(survey_num_miss))) 

survey_num_miss_unique = set(survey_num_miss)
print(survey_num_miss_unique)

31
{513, 387, 133, 8, 10, 400, 401, 532, 535, 281, 156, 544, 548, 422, 166, 424, 552, 561, 50, 442, 444, 67, 458, 75, 460, 335, 84, 213, 347, 221, 496}


In [32]:
#extract countries from failed downloads
country_code_miss =[]

for url in fails3:
    country_code_miss.append(extract_country_code(url))

unique_ccs = (set(country_code_miss))

missing_countries = []
no_code = []

for cc in unique_ccs:
    filters = country_code_name[country_code_name['DHS_CountryCode'] == cc]
    if filters.empty:
        print(f"country code {cc} not found")
        no_code.append(cc)
    else:
        missing_countries.append(filters['CountryName'].iloc[0])
    
no_code

[]

## Deal with Download Errors 4

In [33]:
# check the error_output_4 for errors

df4 = parse_error_log("DHS_Error_logs/error_output_4.txt")

# Display the number of failures
print(len(df4[df4['Status'] == 'Failure']))

64


In [34]:
fails4 = df4[df4['Status'] == 'Failure']['URL'].tolist()

Errors for a few reasons still:

1: missread in 'NA' indian state code as NaN  
2: some surveys can't find ID   
3: some of these are 'Special Surveys'  


## Deal With Download Errors 5

In [35]:
#check error output

df5 = parse_error_log("DHS_Error_logs/error_output_5.txt")


# Display the number of failures
print(len(df5[df5['Status'] == 'Failure']))

56


In [36]:
fails5 = df5[df5['Status'] == 'Failure']
fails5.head()

Unnamed: 0,URL,Response Code,Status,Error
3,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: MLHR61DT.zip
4,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: OSIR01DT.zip
5,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: OSKR01DT.zip
6,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: TRHR71DT.zip
7,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: TRIR71DT.zip


In [37]:
country_fails = []

for url in fails5['URL']:
    country_fails.append(extract_country_code(url))

In [38]:
len(country_fails)

56

In [39]:
c_name = []
for cc in country_fails:
    c_name.append(country_code_name[country_code_name['DHS_CountryCode'] == cc]['CountryName'].iloc[0])
len(c_name)

56

### Manually Fix Remaining Download Errors

The remianing download errors are due to survey numbers in the download links not being present in the surveys API.   
Other methods to try to match these surveys to their metadata also failed.  
I had to fix this by manually searching through the DHS website to match file names to get the respective survey year and survey type.  
Interestingly, the surveys that were missing survey IDs were frequently special surveys or SPA geographic survey files. I do not know why this is the case it is just a pattern I found.

In [40]:
files = []
cc = []
survey_id = []

for url in fails5['URL']:
    files.append(extract_file_name(url))
    dhscc = extract_country_code(url)
    cc.append(country_code_name[country_code_name['DHS_CountryCode'] == dhscc]['ISO3_CountryCode'].iloc[0])
    survey_id.append(extract_survey_num(url))


missing_df = pd.DataFrame({'file_name':files, 'Country':c_name})

#ISO3 = country_code_name[country_code_name['DHS_CountryCode'] == cc]['ISO3_CountryCode'].iloc[0]
missing_df['ISO3'] = cc
missing_df['Survey_num'] = survey_id

missing_df.sort_values(by = 'Country').head()

Unnamed: 0,file_name,Country,ISO3,Survey_num
6,HTGE6AFLSR.zip,Haiti,HTI,442
7,HTGE7BFLSR.zip,Haiti,HTI,532
24,IDOD7ADT.zip,Indonesia,IDN,544
23,IDOD6CDT.zip,Indonesia,IDN,460
22,IDOD5ADT.zip,Indonesia,IDN,335


In [41]:
#read in missing files dataset
#this is a dataset i created by hand for surveys who's survey ID was not available for some reason
missing_data = pd.read_csv('DHS_files/Missing_files.csv')
missing_data.head()

Unnamed: 0.1,Unnamed: 0,file_name,Country,ISO3,folder,Survey_num
0,6,HTGE6AFLSR.zip,Haiti,HTI,HTI2013SPA,442
1,7,HTGE7BFLSR.zip,Haiti,HTI,HTI2017SPA,532
2,24,IDOD7ADT.zip,Indonesia,IDN,IND2017Special,544
3,23,IDOD6CDT.zip,Indonesia,IDN,IND2012Special,460
4,22,IDOD5ADT.zip,Indonesia,IDN,IND2007Special,335


In [42]:
#combine the 2 data frames based on file names

#get the file names
files = []
for url in fails5['URL']:
    files.append(extract_file_name(url))
    
#add file name column to fails5
fails5['file_name'] = files

#combine the 2 dataframes
merged_df = pd.merge(fails5, missing_data, on = "file_name")
merged_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fails5['file_name'] = files


Unnamed: 0.1,URL,Response Code,Status,Error,file_name,Unnamed: 0,Country,ISO3,folder,Survey_num
0,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: MLHR61DT.zip,MLHR61DT.zip,0,Mali,MLI,MLI2010Special,387
1,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: OSIR01DT.zip,OSIR01DT.zip,1,Nigeria (Ondo State),NGA,NGA1986Special,8
2,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: OSKR01DT.zip,OSKR01DT.zip,2,Nigeria (Ondo State),NGA,NGA1986Special,8
3,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: TRHR71DT.zip,TRHR71DT.zip,3,Turkey,TUR,TUR2018DHS,548
4,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: TRIR71DT.zip,TRIR71DT.zip,4,Turkey,TUR,TUR2018DHS,548


# Download Non-Stata Files

**Do Not Rerun This Code**

## First Runthrough: Cycle Through All Download Links

**Do Not Rerun This Code**

In [46]:
#cycle through non stata file download links and output errors to text files
with open("DHS_Error_logs/error_log1_DHS.txt", "a") as f: 
    original_stdout = sys.stdout
    sys.stdout = f  # Redirect stdout to file
    
    for url in urls2:
        print(f"processing {url}")
        download_and_extract(url)

    sys.stdout = original_stdout  # Restore stdout

print("Function outputs saved to error_log1_DHS.txt")

Function outputs saved to error_log1_DHS.txt


## Second Runthrough: Download with "Missing" download function

**Do Not Rerun This Code**

In [56]:
#cycle through non stata file download links and output errors to text files
with open("DHS_Error_logs/error_log2_DHS.txt", "a") as f: 
    original_stdout = sys.stdout
    sys.stdout = f  # Redirect stdout to file
    
    for url in missing_urls2:
        print(f"processing {url}")
        download_and_extract_missing(url, error_df)

    sys.stdout = original_stdout  # Restore stdout

print("Function outputs saved to error_log2_DHS.txt")

Function outputs saved to error_log2_DHS.txt


# Read Error Logs and Address Download Errors of Non-Stata Files

## Deal With Download Errors 1

In [43]:
# read in download log 
error1 = parse_error_log("DHS_Error_logs/error_log1_DHS.txt")

In [44]:
failed_num = len(error1[error1['Status'] == 'Failure'])
print(f"There were {failed_num} files that did not download.")

There were 147 files that did not download.


In [45]:
error1_fails = error1[error1['Status'] == 'Failure']
error1_fails.head()

Unnamed: 0,URL,Response Code,Status,Error
4244,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: IDOD4A.zip
4245,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: IDOD4AFL.zip
4246,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: IDOD4ASD.zip
4247,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: IDOD4ASV.zip
4248,https://dhsprogram.com/customcf/legacy/data/do...,200,Failure,survey ID not found for file: IDOD5AFL.zip


The source of the error for all of these files is "survey ID not found". This likely means we are facing the same issue as before with missing metadata.   
I will start by seeing if any of the survey IDs missing here are the same as the ones from above. 

In [46]:
# get list of remaining URLs to download
missing_urls2 = error1_fails['URL'].tolist()

In [47]:
#get country codes

country_fails = []

for url in missing_urls2:
    country_fails.append(extract_country_code(url))

In [48]:
#get country names
c_name = []
for cc in country_fails:
    c_name.append(country_code_name[country_code_name['DHS_CountryCode'] == cc]['CountryName'].iloc[0])
len(c_name)

147

In [49]:
files = []
cc = []
survey_id = []

for url in error1_fails['URL']:
    files.append(extract_file_name(url))
    dhscc = extract_country_code(url)
    cc.append(country_code_name[country_code_name['DHS_CountryCode'] == dhscc]['ISO3_CountryCode'].iloc[0])
    survey_id.append(extract_survey_num(url))


error_df = pd.DataFrame({'file_name':files, 'Country':c_name})

#ISO3 = country_code_name[country_code_name['DHS_CountryCode'] == cc]['ISO3_CountryCode'].iloc[0]
error_df['ISO3'] = cc
error_df['Survey_num'] = survey_id

error_df.sort_values(by = 'Country').head(5)



Unnamed: 0,file_name,Country,ISO3,Survey_num
0,IDOD4A.zip,Indonesia,IDN,221
13,IDOD7ASV.zip,Indonesia,IDN,544
12,IDOD7ASD.zip,Indonesia,IDN,544
11,IDOD7AFL.zip,Indonesia,IDN,544
10,IDOD6CSV.zip,Indonesia,IDN,460


In [50]:
#missing values are for OS which is nigeria ondo state

#check country code for nigeria
country_code_name[country_code_name['CountryName'] == 'Nigeria']

Unnamed: 0,DHS_CountryCode,CountryName,ISO3_CountryCode
61,NG,Nigeria,NGA


In [51]:
# Replace empty strings with NaN
error_df['Country'].replace('', np.nan, inplace=True)
error_df['ISO3'].replace('', np.nan, inplace = True)

# Fill NaN values with nigeria
error_df['Country'].fillna('Nigeria', inplace=True)
error_df['ISO3'].fillna('NGA', inplace = True)

In [52]:
error_df.sort_values(by = 'Country').head(5)

Unnamed: 0,file_name,Country,ISO3,Survey_num
0,IDOD4A.zip,Indonesia,IDN,221
13,IDOD7ASV.zip,Indonesia,IDN,544
12,IDOD7ASD.zip,Indonesia,IDN,544
11,IDOD7AFL.zip,Indonesia,IDN,544
10,IDOD6CSV.zip,Indonesia,IDN,460


In [53]:
#add the folder name to the error df
error_df = error_df.merge(missing_data[['Survey_num', 'folder']], on='Survey_num', how='left')
error_df

Unnamed: 0,file_name,Country,ISO3,Survey_num,folder
0,IDOD4A.zip,Indonesia,IDN,221,IND2002Special
1,IDOD4AFL.zip,Indonesia,IDN,221,IND2002Special
2,IDOD4ASD.zip,Indonesia,IDN,221,IND2002Special
3,IDOD4ASV.zip,Indonesia,IDN,221,IND2002Special
4,IDOD5AFL.zip,Indonesia,IDN,335,IND2007Special
...,...,...,...,...,...
501,UZML49SD.zip,Uzbekistan,UZB,213,UZB2002Special
502,UZML49SV.zip,Uzbekistan,UZB,213,UZB2002Special
503,UZML49SV.zip,Uzbekistan,UZB,213,UZB2002Special
504,UZML49SV.zip,Uzbekistan,UZB,213,UZB2002Special


## Check Error Log 2 for Remaining Download Errors

In [58]:
# read in download log 
error2 = parse_error_log("DHS_Error_logs/error_log2_DHS.txt")
error2.head()

Unnamed: 0,URL,Response Code,Status,Error
0,https://dhsprogram.com/customcf/legacy/data/do...,200,Success,
1,https://dhsprogram.com/customcf/legacy/data/do...,200,Success,
2,https://dhsprogram.com/customcf/legacy/data/do...,200,Success,
3,https://dhsprogram.com/customcf/legacy/data/do...,200,Success,
4,https://dhsprogram.com/customcf/legacy/data/do...,200,Success,


In [61]:
#check for failed downloads
error2[error2['Status'] == 'Failure']

Unnamed: 0,URL,Response Code,Status,Error


There are no more download errors, so all past DHS data has been downloaded successfully.

# Quit Driver

In [62]:
driver.quit()