### Unzip and Only Keep Scripts

In [1]:
import os
import glob 
import os
import zipfile
import shutil
import logging

# Send INFO-and-above records to unzip_log.txt, each prefixed with a
# timestamp and the level name.
logging.basicConfig(
    filename='unzip_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [4]:
def is_potential_zip(file_path):
    """Report whether a file begins with the two-byte ``PK`` signature.

    Only the first two bytes are inspected, so a True result means the file
    *might* be a zip archive, not that it is a well-formed one.

    Args:
        file_path (str): Path of the file to inspect.

    Returns:
        bool: True when the first two bytes are ``b'PK'``, otherwise False
        (including for files shorter than two bytes).
    """
    zip_signature = b'PK'
    with open(file_path, 'rb') as stream:
        leading_bytes = stream.read(len(zip_signature))
    return leading_bytes == zip_signature

def _keep_only_scripts(output_dir, script_exts=(".r", ".py")):
    """Reduce an extracted archive to its script files, flattened to one level.

    Every file whose name does not end with one of ``script_exts``
    (case-insensitive) is deleted, every script found in a subfolder is moved
    to the top of ``output_dir``, and all subfolders are then removed.

    When two scripts share a file name, the first one encountered is kept and
    later ones are skipped with a warning (they disappear with their folder).

    Args:
        output_dir (str): Directory the archive was extracted into.
        script_exts (tuple[str, ...]): Lower-case suffixes of files to keep.
    """
    # An archive with no members extracts nothing, so the directory may not
    # exist at all; there is nothing to tidy in that case.
    if not os.path.isdir(output_dir):
        return

    # os.walk (rather than glob("*")) lists only files, and includes dotfiles,
    # so directories are never passed to os.remove and hidden files are pruned.
    for root, _dirs, files in os.walk(output_dir):
        for name in files:
            src_path = os.path.join(root, name)

            if not name.lower().endswith(script_exts):
                try:
                    os.remove(src_path)
                    logging.info(f"Deleted non .r/.py file: {src_path}")
                except OSError as e:
                    logging.error(f"Error deleting file {src_path}: {e}")
                continue

            if root == output_dir:
                # Already at the top level; nothing to move.
                continue

            dest_path = os.path.join(output_dir, name)
            # shutil.move onto an existing *file* path overwrites it silently,
            # so the duplicate check has to be made explicitly beforehand.
            if os.path.lexists(dest_path):
                logging.warning(f"Skipping duplicate file: {name}")
                continue

            try:
                shutil.move(src_path, dest_path)
            except OSError:  # shutil.Error is a subclass of OSError
                logging.error(f"Error moving file: {name}", exc_info=True)

    # Collect the subfolders first so nothing is deleted while it is being
    # iterated over.
    with os.scandir(output_dir) as entries:
        subfolders = [e.path for e in entries if e.is_dir(follow_symlinks=False)]

    for subfolder in subfolders:
        try:
            shutil.rmtree(subfolder)
        except OSError as e:
            logging.error(f"Error removing folder {subfolder}: {e}")


def unzip_and_move_files(folder_path, target_folder):
    """Unzips files in a folder, deletes non .r/.py files, and moves .r/.py files.

    Each ``*.zip`` file (extension matched case-insensitively) directly inside
    ``folder_path`` is extracted into ``target_folder/<archive name without
    extension>``. That directory is then reduced to the archive's ``.r`` /
    ``.py`` files, all placed at its top level with subfolders removed.

    Archives are skipped (and the reason logged) when their output directory
    already exists, when they do not start with the zip signature, or when
    ``zipfile`` rejects them as corrupt. A corrupt archive does not stop the
    remaining archives from being processed.

    Args:
        folder_path (str): Path to the folder containing zipped files.
        target_folder (str): Path to the target folder where files will be
            unzipped. It is created if it does not exist yet.
    """
    os.makedirs(target_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not filename.lower().endswith(".zip"):
            continue

        zip_path = os.path.join(folder_path, filename)
        base_name = os.path.splitext(filename)[0]
        output_dir = os.path.join(target_folder, base_name)

        if os.path.lexists(output_dir):
            logging.info(f"File {base_name} already exists in {target_folder}. Skipping extraction for {filename}")
            continue

        if not is_potential_zip(zip_path):
            logging.warning(f"File {filename} may not be a valid zip file. Skipping.")
            continue

        logging.info(f"Processing file: {filename}")
        try:
            # BadZipFile is raised when the archive is opened, so the
            # constructor has to sit inside the try as well.
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(output_dir)
        except zipfile.BadZipFile:
            logging.error(f"Error: Bad zip file: {filename}")
            # Drop any partial extraction; otherwise the next run would see
            # the directory and skip this archive as already extracted.
            shutil.rmtree(output_dir, ignore_errors=True)
            continue

        _keep_only_scripts(output_dir)

In [None]:
# Read the archives from icpsr_data/ and write the extracted scripts
# under icpsr_files/.
folder_path, target_folder = "icpsr_data/", "icpsr_files/"
unzip_and_move_files(folder_path=folder_path, target_folder=target_folder)

# find target_folder/ -name "*.R" -o -name "*.py" | wc -l

# Count the paths under icpsr_files whose names match *.r or *.py,
# case-insensitively (-iname).
find icpsr_files \( -iname "*.r" -o -iname "*.py" \) | wc -l
# Preview step: long listing of the current directory sorted by size (-S) in
# reverse (-r), so the last 10 rows are the largest entries; awk prints
# field 9, the name column in the usual `ls -l` layout.
ls -lSrh | tail -n 10 | awk '{print $9}'
# Same selection piped to rm; xargs -p asks for confirmation before running it.
# NOTE(review): field 9 cuts off names that contain spaces, and ls lists the
# current directory -- confirm both before answering yes.
ls -lSrh | tail -n 10 | awk '{print $9}' | xargs -p rm