In [8]:
from datasets import load_dataset
from huggingface_hub import hf_hub_download
import json
import os


from huggingface_hub import list_repo_tree
# Fetch every .arrow / .parquet file of the BAAI/TACO dataset repo into the local
# Hugging Face cache and record the cached paths in `downloaded_paths`.
#
# `downloaded_paths` is bound BEFORE the try block so that it always reflects this
# run: if the repo listing fails, later cells see an empty list instead of an
# undefined name or a stale list left in the kernel by an earlier execution.
downloaded_paths = []
try:
    files = list_repo_tree("BAAI/TACO", repo_type="dataset", recursive=True)
    print("\nAvailable files:")
    file_list = [file.path for file in files if file.path.endswith(('.arrow', '.parquet'))]
    for file_path in file_list[:20]:  # Show first 20
        print(f"  {file_path}")

    # Download every matching file (.arrow and .parquet), in listing order.
    # A file that fails to download is reported and skipped, so positions in
    # `downloaded_paths` do not necessarily line up with positions in `file_list`.
    print(f"\nDownloading {len(file_list)} files...")
    for file_path in file_list:
        try:
            local_path = hf_hub_download(
                repo_id="BAAI/TACO",
                filename=file_path,
                repo_type="dataset"
            )
            downloaded_paths.append(local_path)
            print(f"✓ Downloaded: {file_path}")
            print(f"  Location: {local_path}")
        except Exception as e:
            # Best effort: keep going so one bad file does not abort the rest.
            print(f"✗ Failed to download {file_path}: {e}")

    # Show summary of download locations
    print(f"\n{'='*80}")
    print("DOWNLOAD SUMMARY:")
    print(f"{'='*80}")
    print(f"Total files downloaded: {len(downloaded_paths)}")
    if downloaded_paths:
        print(f"Base location: {os.path.dirname(downloaded_paths[0])}")

except Exception as e:
    # Listing (or the summary) failed; `downloaded_paths` keeps whatever was
    # fetched so far, possibly nothing.
    print(f"Error in alternative approach: {e}")

Downloading TACO dataset... This may take a few minutes.
Error downloading dataset: Dataset scripts are no longer supported, but found TACO.py

Trying alternative approach...

Available files:
  ALL/test-00000-of-00001.parquet
  ALL/train-00000-of-00009.parquet
  ALL/train-00001-of-00009.parquet
  ALL/train-00002-of-00009.parquet
  ALL/train-00003-of-00009.parquet
  ALL/train-00004-of-00009.parquet
  ALL/train-00005-of-00009.parquet
  ALL/train-00006-of-00009.parquet
  ALL/train-00007-of-00009.parquet
  ALL/train-00008-of-00009.parquet
  test/data-00000-of-00001.arrow
  train/data-00000-of-00009.arrow
  train/data-00001-of-00009.arrow
  train/data-00002-of-00009.arrow
  train/data-00003-of-00009.arrow
  train/data-00004-of-00009.arrow
  train/data-00005-of-00009.arrow
  train/data-00006-of-00009.arrow
  train/data-00007-of-00009.arrow
  train/data-00008-of-00009.arrow

Downloading 20 files...
✓ Downloaded: ALL/test-00000-of-00001.parquet
  Location: /home/sanja/.cache/huggingface/hub/d

In [56]:
import pandas as pd
import ast
import os

output_path = "taco_cleaned.csv"


def _parse_solutions(val):
    """Decode one raw ``solutions`` cell into a list, or return None.

    Accepted inputs:
      * a list or tuple (returned as a list);
      * any non-string object exposing ``tolist()`` (array-like values);
      * a string holding a serialized list. JSON is tried first, then a
        Python literal via ``ast.literal_eval`` as a fallback.

    Returns None for empty strings, unparseable strings, and anything that
    does not decode to a list/tuple (None, NaN, numbers, dicts, ...).
    """
    if isinstance(val, str):
        text = val.strip()
        if not text:
            return None
        try:
            val = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError
            try:
                val = ast.literal_eval(text)
            except Exception:
                return None
    elif not isinstance(val, (list, tuple)) and callable(getattr(val, "tolist", None)):
        val = val.tolist()
    return list(val) if isinstance(val, (list, tuple)) else None


def has_solution(v) -> bool:
    """Return True only if ``v`` decodes to a non-empty list of solutions."""
    return bool(_parse_solutions(v))


def extract_first_solution(val):
    """Return the first solution encoded in ``val``, or None if there is none."""
    parsed = _parse_solutions(val)
    return parsed[0] if parsed else None


# If CSV already exists, do not overwrite; skip writing
if os.path.exists(output_path):
    print(f"CSV already exists, skipping write: {os.path.abspath(output_path)}")
else:
    # Select parquet files by extension rather than by position: the download
    # list also holds .arrow files, and a skipped download would shift a fixed
    # slice onto a file pd.read_parquet cannot read.
    parquet_downloaded_paths = [p for p in downloaded_paths if str(p).endswith(".parquet")]
    if not parquet_downloaded_paths:
        raise RuntimeError("No .parquet files found in downloaded_paths; nothing to write.")

    # Build the CSV under a temporary name and rename it only once every file
    # has been processed. An interrupted run therefore never leaves a partial
    # file at output_path that the existence check above would accept as done.
    tmp_path = output_path + ".tmp"
    if os.path.exists(tmp_path):
        os.remove(tmp_path)  # leftover from an earlier interrupted run

    header_written = False
    try:
        for path in parquet_downloaded_paths:
            df = pd.read_parquet(path)
            df = df.drop([
                "starter_code", "name", "source", "skill_types", "url",
                "Expected Auxiliary Space", "input_output", "time_limit", "raw_tags", "date",
                "picture_num", "memory_limit", "Expected Time Complexity"
            ], axis=1, errors="ignore")

            # Decode each row once; keep rows with at least one solution and
            # replace the column with that first solution.
            parsed = [_parse_solutions(v) for v in df["solutions"]]
            keep = [bool(p) for p in parsed]
            df = df.loc[keep].reset_index(drop=True)
            df["solutions"] = [p[0] for p in parsed if p]

            # Append to the temporary CSV; only the first chunk writes the header
            df.to_csv(tmp_path, mode="a", header=not header_written, index=False)
            header_written = True

        os.replace(tmp_path, output_path)
    finally:
        # No-op after a successful rename; removes the partial file on failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(f"CSV written to: {os.path.abspath(output_path)}")


CSV written to: /mnt/c/users/sanja/coded/minor/Notebooks/taco_cleaned.csv


In [55]:
# Read the first entry of `downloaded_paths` as parquet and preview its first five rows.
df = pd.read_parquet(path=downloaded_paths[0])
df.head(n=5)

Unnamed: 0,question,solutions,starter_code,input_output,difficulty,raw_tags,name,source,tags,skill_types,url,Expected Auxiliary Space,time_limit,date,picture_num,memory_limit,Expected Time Complexity
0,The city park of IT City contains n east to we...,"[""n = int(input())\ncn5 = n * (n - 1) // 2 * (...",,"{""inputs"": [""5\n"", ""6\n"", ""7\n"", ""15\n"", ""17\n...",MEDIUM,"['combinatorics', 'math']",,codeforces,"['Combinatorics', 'Mathematics']",[],https://codeforces.com/problemset/problem/630/H,,,2019-12-31,,,
1,Zookeeper is buying a carton of fruit to feed ...,"[""hist = [0] * 1000005\n\ndef solve(n, s):\n\t...",,"{""inputs"": [""4\n0110\n"", ""7\n1101001\n"", ""12\n...",VERY_HARD,"['data structures', 'binary search', 'dp', 'tw...",,codeforces,"['Sorting', 'Amortized analysis', 'Dynamic pro...","['Dynamic programming', 'Data structures', 'So...",https://codeforces.com/problemset/problem/1428/F,,2 seconds,2020-10-17,0.0,256 megabytes,
2,Sasha and Kolya decided to get drunk with Coke...,"[""from collections import deque\nMAX_A = 1000\...",,"{""inputs"": [""852 10\n668 1000 1000 1000 1000 1...",HARD,"['shortest paths', 'dfs and similar', 'graphs']",,codeforces,"['Graph algorithms', 'Graph traversal', 'Short...",[],https://codeforces.com/problemset/problem/789/E,,1.0 seconds,,,256.0 megabytes,
3,"Read problem statements in [Hindi], [Bengali],...","[""p = 10 ** 9 + 7\n\ndef power(a, n):\n\tres =...",,"{""inputs"": [""2\n2\n2 2\n4\n1 2 2 6"", ""3\n3\n1 ...",MEDIUM_HARD,"['Mathematics', 'Modular Arithmetic', 'Combina...",,codechef,"['Combinatorics', 'Mathematics']",[],https://www.codechef.com/problems/CRDGAME2,,1 seconds,2020-07-07,0.0,50000 bytes,
4,"I started this as a joke among friends, tellin...","[""from math import *\nDIGS = '0123456789ABCDEF...","def converter(n, decimals=0, base=pi):\n\t","{""fn_name"": ""converter"", ""inputs"": [[13], [10]...",MEDIUM_HARD,"['Mathematics', 'Algorithms', 'Fundamentals']",,codewars,"['Fundamentals', 'Mathematics']",[],https://www.codewars.com/kata/5509609d1dbf20a3...,,,,,,
