### Get Scripts From ICPSR

In [None]:
import os
import getpass
import time
import requests
import pandas as pd
import zipfile
import shutil
import retrying
from retrying import retry
import concurrent.futures
from tqdm import tqdm
from zipfile import ZipFile
from bs4 import BeautifulSoup

In [None]:
def _is_nonempty_zip(payload):
    """Return True if *payload* (bytes) is a readable ZIP archive with at least one member."""
    import io
    import zipfile

    buffer = io.BytesIO(payload)
    if not zipfile.is_zipfile(buffer):
        return False
    try:
        with zipfile.ZipFile(buffer) as archive:
            return bool(archive.namelist())
    except zipfile.BadZipFile:
        return False


def icpsr_download(file_id, email=None, password=None, reset=False, download_dir="icpsr_data", msg=True, delay=10):
    """Download openICPSR project archives into ``download_dir``.

    Each project is requested through a freshly logged-in session and, when
    the response is a real, non-empty ZIP archive, saved as
    ``ICPSR_<id zero-padded to 5 digits>.zip``. Responses that are not usable
    archives (HTTP errors, HTML pages, empty zips) are NOT written, so a later
    existence check will retry them instead of treating them as done.

    Args:
        file_id: Iterable of project IDs, or a single ID (int or str).
        email: Account email. Falls back to the ``icpsr_email`` environment
            variable, then to an interactive prompt.
        password: Account password. Falls back to the ``icpsr_password``
            environment variable, then to a hidden interactive prompt.
        reset: If true, ignore the ``email``/``password`` arguments and
            resolve both from the environment or the prompts.
        download_dir: Directory that receives the zip files; created if missing.
        msg: If true, print a progress line before each download.
        delay: Seconds to pause after each project to throttle requests.

    Returns:
        list[str]: Paths of the archives that were actually written.

    Raises:
        RuntimeError: If the login page does not contain the expected form.
    """
    import warnings

    # Detect login info
    if reset:
        email = password = None

    if email is None:
        email = os.getenv("icpsr_email")
        if not email:
            email = input("ICPSR requires your user account information. Please enter your email address:\n")

    if password is None:
        password = os.getenv("icpsr_password")
        if not password:
            password = getpass.getpass("Please enter your ICPSR password:\n")

    # exist_ok avoids a check-then-create race when several threads call this at once.
    os.makedirs(download_dir, exist_ok=True)

    # Accept a bare ID as well as an iterable; iterating a str would yield characters.
    if isinstance(file_id, str) or not hasattr(file_id, "__iter__"):
        file_id = [file_id]

    saved_paths = []
    for item in file_id:
        if msg:
            print(f"Downloading ICPSR file: {item} ({str(time.time())})")

        url = f"https://www.openicpsr.org/openicpsr/project/{item}/version/V1/download/project?dirPath=/openicpsr/{item}/fcr:versions/V1"

        with requests.Session() as session:
            # The login form's action URL is generated per session, so fetch the page first.
            r = session.get('https://login.icpsr.umich.edu/realms/icpsr/protocol/openid-connect/auth?client_id=openicpsr-web-prod&response_type=code&login=true&redirect_uri=https://www.openicpsr.org/openicpsr/oauth/callback')
            # Name the parser explicitly so the result does not depend on which parsers are installed.
            soup = BeautifulSoup(r.content, "html.parser")
            login_form = soup.find('form', id='kc-form-login')
            if login_form is None:
                raise RuntimeError(
                    "ICPSR login form 'kc-form-login' not found "
                    f"(HTTP {r.status_code}); cannot authenticate."
                )
            login_url = login_form.get('action')

            login_data = {'username': email, 'password': password}
            session.post(login_url, data=login_data)

            response = session.get(url)

        file_name = f"ICPSR_{str(item).zfill(5)}.zip"
        file_path = os.path.join(download_dir, file_name)

        # Only persist genuine archives; anything else would become a "fake download".
        if response.ok and _is_nonempty_zip(response.content):
            with open(file_path, 'wb') as f:
                f.write(response.content)
            saved_paths.append(file_path)
        else:
            warnings.warn(
                f"ICPSR project {item}: response was not a usable zip archive "
                f"(HTTP {response.status_code}, {len(response.content)} bytes); nothing saved."
            )

        # requests is synchronous, so the download is already complete here;
        # the pause only spaces out successive requests.
        time.sleep(delay)

    return saved_paths

In [None]:
# Load ICPSR credentials from a local file containing "email,password" lines.
# Both names default to None so that icpsr_download falls back to environment
# variables / prompts when the file has no usable line.
email = password = None

with open("icpsr_creds.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# If several lines are present, the last non-blank one wins.
for line in lines:
    line = line.strip()
    if not line:
        # Blank lines would otherwise fail to unpack into two values.
        continue
    # Split on the first comma only so the password itself may contain commas.
    email, password = (part.strip() for part in line.split(",", 1))

In [None]:
# Load the repository table; its "ID" column supplies the project IDs to download.
icpsr = pd.read_csv("../data/aea_icpsr_repos.csv")
icpsr

# Only download projects whose zip file is not already on disk.
for file_id in icpsr["ID"][:100]:
    file_name = f"ICPSR_{str(file_id).zfill(5)}.zip"
    file_path = os.path.join("icpsr_data", file_name)

    if os.path.exists(file_path):
        continue

    # The credentials are stored in `email`/`password`; `username` is not defined.
    icpsr_download([file_id], email=email, password=password)

In [None]:
# unzip_and_filter('icpsr_data/ICPSR_193216.zip', 'icpsr_data/')
#icpsr_download([193216], unzip=True, email=username, password=password, delete_zip=True)

download_dir = "icpsr_data/"

# Fetch the first ten projects, skipping any that already have a zip file or
# an extracted folder in the download directory.
for file_id in icpsr["ID"][0:10]:
    file_name = f"ICPSR_{str(file_id).zfill(5)}"
    zip_file_path = os.path.join(download_dir, f"{file_name}.zip")
    folder_path = os.path.join(download_dir, file_name)

    if any(map(os.path.exists, (zip_file_path, folder_path))):
        print(f"File or folder {file_name} already exists. Skipping download.")
        continue

    icpsr_download(
        [file_id],
        email=email,
        password=password,
        download_dir=download_dir,
    )
    print(f"File {file_name} downloaded successfully.")

In [None]:
# Function to download a single file
def download_file(file_id, email, password, download_dir):
    """Download one ICPSR project unless it is already present.

    A project counts as present when either ``ICPSR_<id>.zip`` or a folder
    named ``ICPSR_<id>`` (ID zero-padded to five digits) exists inside
    ``download_dir``. Otherwise ``icpsr_download`` is called for that single
    ID with the given credentials.

    Returns:
        str: A status message saying whether the project was skipped or downloaded.
    """
    stem = f"ICPSR_{str(file_id).zfill(5)}"
    candidates = (
        os.path.join(download_dir, f"{stem}.zip"),
        os.path.join(download_dir, stem),
    )

    # Guard clause: nothing to do when the archive or its extracted folder exists.
    if any(os.path.exists(path) for path in candidates):
        return f"File or folder {stem} already exists. Skipping download."

    icpsr_download(
        [file_id],
        email=email,
        password=password,
        download_dir=download_dir,
    )
    return f"File {stem} downloaded successfully."

# Download every project in the table using a pool of five worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Schedule one download_file call per project ID, all targeting "icpsr_data".
    futures = list(
        executor.submit(download_file, file_id, email, password, "icpsr_data")
        for file_id in icpsr["ID"]
    )

    # Block until every scheduled download has finished (or failed).
    concurrent.futures.wait(futures, return_when=concurrent.futures.ALL_COMPLETED)

    # Report the status messages in submission order; result() re-raises
    # any exception raised inside the corresponding worker.
    for future in futures:
        print(future.result())

# Delete fake downloads: saved "archives" whose exact byte size marks them as
# not being real data (0 bytes = empty file, 22 bytes = empty ZIP archive; the
# other sizes are presumably saved error/login pages -- verify before reuse).
# These are shell commands: run them in a terminal, or prefix with `!` in a notebook cell.
# Restricted to regular files named ICPSR_*.zip so unrelated files or empty
# directories that happen to have one of these sizes are never removed.
find ./ -type f -name 'ICPSR_*.zip' \( -size 0c -o -size 22c -o -size 672c -o -size 9990c -o -size 65002c \) -delete