In [3]:
from datasets import load_dataset
from huggingface_hub import hf_hub_download
import json
import os


from huggingface_hub import list_repo_tree
# Define the result lists before the try block so that later cells can always
# rely on `downloaded_paths` existing, even if listing the repository fails.
downloaded_paths = []
failed_paths = []

try:
    files = list_repo_tree("BAAI/TACO", repo_type="dataset", recursive=True)
    print("\nAvailable files:")
    # Keep only the data shards (both Arrow and Parquet variants).
    file_list = [file.path for file in files if file.path.endswith(('.arrow', '.parquet'))]
    for file_path in file_list[:20]:  # Show first 20
        print(f"  {file_path}")

    # Download every Arrow and Parquet file found above
    print(f"\nDownloading {len(file_list)} files...")
    for file_path in file_list:
        # A single failed shard is reported and recorded but does not abort
        # the remaining downloads.
        try:
            local_path = hf_hub_download(
                repo_id="BAAI/TACO",
                filename=file_path,
                repo_type="dataset"
            )
            downloaded_paths.append(local_path)
            print(f"✓ Downloaded: {file_path}")
            print(f"  Location: {local_path}")
        except Exception as e:
            failed_paths.append(file_path)
            print(f"✗ Failed to download {file_path}: {e}")

    # Show summary of download locations
    print(f"\n{'='*80}")
    print("DOWNLOAD SUMMARY:")
    print(f"{'='*80}")
    print(f"Total files downloaded: {len(downloaded_paths)}")
    if failed_paths:
        # Surface partial failures; downstream cells only see the successes.
        print(f"Total files failed: {len(failed_paths)}")
    if downloaded_paths:
        print(f"Base location: {os.path.dirname(downloaded_paths[0])}")

except Exception as e:
    # Best-effort cell: report the problem instead of raising. Whatever was
    # downloaded before the failure remains available in `downloaded_paths`.
    print(f"Error in alternative approach: {e}")

  from .autonotebook import tqdm as notebook_tqdm



Available files:
  ALL/test-00000-of-00001.parquet
  ALL/train-00000-of-00009.parquet
  ALL/train-00001-of-00009.parquet
  ALL/train-00002-of-00009.parquet
  ALL/train-00003-of-00009.parquet
  ALL/train-00004-of-00009.parquet
  ALL/train-00005-of-00009.parquet
  ALL/train-00006-of-00009.parquet
  ALL/train-00007-of-00009.parquet
  ALL/train-00008-of-00009.parquet
  test/data-00000-of-00001.arrow
  train/data-00000-of-00009.arrow
  train/data-00001-of-00009.arrow
  train/data-00002-of-00009.arrow
  train/data-00003-of-00009.arrow
  train/data-00004-of-00009.arrow
  train/data-00005-of-00009.arrow
  train/data-00006-of-00009.arrow
  train/data-00007-of-00009.arrow
  train/data-00008-of-00009.arrow

Downloading 20 files...
✓ Downloaded: ALL/test-00000-of-00001.parquet
  Location: /home/dipst/.cache/huggingface/hub/datasets--BAAI--TACO/snapshots/d593ed0a2becbbc952230bb89be09189bf1056dc/ALL/test-00000-of-00001.parquet
✓ Downloaded: ALL/train-00000-of-00009.parquet
  Location: /home/dipst/.

In [6]:
import pandas as pd
import ast
import os

output_path = "taco_cleaned.csv"

# If the cleaned CSV is already on disk, do not overwrite it; skip the rebuild.
# Delete the file to force regeneration.
if os.path.exists(output_path):
    print(f"CSV already exists, skipping write: {os.path.abspath(output_path)}")
else:
    # Pick the parquet shards by extension rather than by list position:
    # `downloaded_paths` also holds .arrow files, and a failed download or a
    # different listing order would shift which entries the first ten are.
    parquet_downloaded_paths = [
        p for p in downloaded_paths if p.endswith(".parquet")
    ]
    header_written = False

    def has_solution(v) -> bool:
        """Return True only when `v` holds at least one solution.

        `v` may be a list, or a string containing a Python-literal list/tuple
        (parsed with `ast.literal_eval`). Everything else counts as "no
        solution": None, NaN, other non-string values, empty or unparseable
        strings, and literals that are not a list/tuple. Such rows are
        filtered out instead of being kept and later reduced to an empty
        solution by `extract_first_solution`.
        """
        if isinstance(v, list):
            return len(v) > 0
        if not isinstance(v, str):
            # None, NaN and other non-string scalars cannot be parsed.
            return False
        try:
            parsed = ast.literal_eval(v)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # These are the errors literal_eval raises for malformed input.
            return False
        return isinstance(parsed, (list, tuple)) and len(parsed) > 0

    def extract_first_solution(val):
        """Return the first entry of `val`, or None when there is none.

        A list is used as-is; any other value is parsed with
        `ast.literal_eval` first. Parse failures, empty results and values
        that cannot be indexed all yield None.
        """
        candidates = val
        if not isinstance(candidates, list):
            try:
                candidates = ast.literal_eval(val)
            except Exception:
                return None
        try:
            if not candidates:
                return None
            return candidates[0]
        except Exception:
            return None

    # Stream the shards into a temporary file and move it into place only
    # after every shard has been processed. Appending straight to
    # `output_path` would leave a partial CSV behind if a shard failed, and
    # the existence check above would then treat it as complete on re-run.
    tmp_output_path = output_path + ".tmp"
    dropped_columns = [
        "starter_code", "name", "source", "skill_types", "url",
        "Expected Auxiliary Space", "input_output", "time_limit", "raw_tags", "date",
        "picture_num", "memory_limit", "Expected Time Complexity"
    ]

    for path in parquet_downloaded_paths:
        df = pd.read_parquet(path)
        # errors="ignore": not every listed column has to be present.
        df = df.drop(columns=dropped_columns, errors="ignore")

        # Filter rows with non-empty solutions and normalize to first entry
        df = df[df["solutions"].apply(has_solution)].reset_index(drop=True)
        df["solutions"] = df["solutions"].apply(extract_first_solution)

        # The first chunk truncates any stale temp file and writes the header;
        # later chunks append without a header.
        df.to_csv(
            tmp_output_path,
            mode="a" if header_written else "w",
            header=not header_written,
            index=False,
        )
        header_written = True

    if header_written:
        # os.replace overwrites the destination in a single rename step.
        os.replace(tmp_output_path, output_path)
        print(f"CSV written to: {os.path.abspath(output_path)}")
    else:
        # No shard was processed, so there is no file to report.
        print("No parquet files were processed; CSV not written.")


CSV written to: /mnt/c/users/dipst/OneDrive/Desktop/task_specific_adaptation/Notebooks/taco_cleaned.csv


In [9]:
# Reload the cleaned dataset through `output_path` (set in the cell that
# builds the CSV) instead of a machine-specific absolute path.
df = pd.read_csv(output_path)

In [13]:
# Keep only the `question` and `solutions` columns.
df = df.loc[:, ["question", "solutions"]]

In [15]:
# Show the frame's (rows, columns) shape after the column selection above.
df.shape

(20493, 2)