In [2]:
import pandas as pd
from datetime import datetime as dt, timedelta
import calendar
import requests
import os
import zipfile
import shutil



## 1. Download zip-data for platforms and months of interest

Quelle: https://transparency.dsa.ec.europa.eu/data-download?from_date=&to_date=&uuid=6d8da567-b559-49c6-8d0b-dbc8c16e5a8a

In [3]:
def get_all_dates_in_month(year, month):
    """for a given year and month, returns all dates in that month"""
    _, num_days = calendar.monthrange(year, month)
    return [dt(year, month, day).strftime('%Y-%m-%d') for day in range(1, num_days + 1)]

In [4]:
def fetch_daily_reports(date=dt.today().strftime('%Y-%m-%d'), platform="Instagram", output_path="", light=False):
    """for a given date (default: today) and a given platform (default: Instagram), fetch the zipped daily reports from the dsa transparency database"""

    base_url = 'https://dsa-sor-data-dumps.s3.eu-central-1.amazonaws.com'
    url = base_url + f'/sor-{platform.lower()}-{date}{"-light" if light else "-full"}.zip'
    response = requests.get(url)

    if response.status_code == 200:
        save_path = output_path+f'sor-{platform}-{date}{"-light" if light else ""}.zip'
        with open(save_path, 'wb') as f:
            f.write(response.content)
    else:
        return f"Failed to retrieve data for {date}. Status code: {response.status_code}"

In [7]:
platforms = ["Facebook", "X", "Tiktok", "Youtube", "Linkedin", "Instagram"]
months = [10, 11, 12]
year = 2024
my_path = "../data/raw/zipped/"

In [8]:
for month in months:
    for date in get_all_dates_in_month(year, month):
        for platform in platforms:
            fetch_daily_reports(date=date, platform=platform, output_path=my_path, light=True)

KeyboardInterrupt: 

## 2. Unzip the data, merge the csvs

A report about a content-moderations is calles statement of reason. The platforms publish these statements of reasons on a daily basis. For each day, they publish a zip file that contains several layers of sub-zip-folders. Those contain several csv files each. Many of the csv files are empty, some contain the data.

We first download all zip files for given platforms and months. Then we unzip the folder and merge the csv files. It is important to delete the original folders during this process. Otherwise the filesystem will be full quickly.

In [1]:
def unzip_nested_zip(zip_file, extract_to):
    """unzips a (nested) zip file, stores all contained csvs in an array and returns it"""
    csv_files = []
    with zipfile.ZipFile(zip_file, 'r') as z:
        z.extractall(extract_to)
        for file_name in z.namelist():
            file_path = os.path.join(extract_to, file_name)
            if zipfile.is_zipfile(file_path):
                nested_extract_to = os.path.join(extract_to, os.path.splitext(file_name)[0])
                os.makedirs(nested_extract_to, exist_ok=True)
                csv_files.extend(unzip_nested_zip(file_path, nested_extract_to))
            elif file_name.endswith('.csv'):
                csv_files.append(file_path)
    return csv_files

In [36]:
def merge_csv_files(csv_files, output_csv_path):
    """given an array of csvs, merges all csv files into one and saves it to the output path"""
    df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    combined_df.to_csv(output_csv_path, index=False)

In [37]:
def process_zip(zip_file_path, output_folder):
    """processes a zip file by unzipping it, merging all csvs, saving the merged csvs to the output folder, and deleting the original files"""
    zip_name = os.path.splitext(os.path.basename(zip_file_path))[0]
    extracted_folder = os.path.join(output_folder, 'extracted', zip_name) #tmp folder to store extracted files
    os.makedirs(extracted_folder, exist_ok=True)
    csv_files = unzip_nested_zip(zip_file_path, extracted_folder)
    output_csv_path = os.path.join(output_folder, f'{zip_name}.csv')
    merge_csv_files(csv_files, output_csv_path)
    #check: landen die Daten im Papierkorb? Wenn ja was anderes als shutil nutzen, oder als Parameter mitgeben
    #oder besser über os package
    shutil.rmtree(extracted_folder, ignore_errors=True) #clean up extracted folder

In [38]:
def process_all_zips_in_folder(zip_folder, output_folder):
    """processes all zip files in a given folder, unzips them on several layers, """
    os.makedirs(output_folder, exist_ok=True)
    for root, _, files in os.walk(zip_folder):
        for file in files:
            if not os.path.isfile(output_folder+"/"+file.strip('.zip')+'.csv'):
                if file.endswith('.zip'):
                    zip_file_path = os.path.join(root, file)
                    process_zip(zip_file_path, output_folder)
            else:
                print("file ",output_folder+"/"+file.strip('.zip')+'.csv', "already exists, skipping.")

In [39]:
zip_folder = '../data/raw/zipped'
output_folder = '../data/raw/merged_csv'
process_all_zips_in_folder(zip_folder, output_folder)

  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_list = [pd.read_csv(csv_file) for csv_file in csv_files]
  df_lis