In [0]:
import ast
import os
import csv
import json
from pathlib import Path
import xml.etree.ElementTree as ET

In [0]:
def detect_header(file_path, delimiter=','):
    """
    Detect if CSV file has a header row.

    The first 1024 characters of the file are given to
    ``csv.Sniffer.has_header``. If the sniffer cannot analyse the sample,
    a simple fallback compares how many cells look numeric in the first
    and second rows.

    Args:
        file_path (str): Path to the CSV file (read as UTF-8).
        delimiter (str): Delimiter used by the fallback heuristic only;
            the sniffer infers its own dialect from the sample.

    Returns:
        bool: True if header is detected, False otherwise.
    """
    def numeric_cells(row):
        # A cell counts as numeric when only digits remain after dropping
        # '.' and '-' (so "3.14", "-7" and "2020-01-01" all qualify).
        return sum(
            1 for cell in row
            if cell.replace('.', '').replace('-', '').isdigit()
        )

    with open(file_path, 'r', encoding='utf-8') as file:
        sample = file.read(1024)

        try:
            return csv.Sniffer().has_header(sample)
        except (csv.Error, StopIteration):
            # csv.Error: the sniffer could not determine a dialect (e.g.
            # empty or single-column input). StopIteration: the sample held
            # no readable row. Anything else (KeyboardInterrupt, real bugs)
            # is allowed to propagate instead of being silently swallowed.
            pass

        # Fallback: compare the first two rows using the caller's delimiter.
        file.seek(0)
        reader = csv.reader(file, delimiter=delimiter)
        first_row = next(reader, None)
        second_row = next(reader, None)

        if not first_row or not second_row:
            return False

        # Header rows tend to be text while data rows contain more numbers.
        return numeric_cells(first_row) < numeric_cells(second_row)

In [0]:
def count_rows(file_path, has_header=False, delimiter=','):
    """
    Count total rows in CSV file.

    Args:
        file_path (str): Path to the CSV file (read as UTF-8).
        has_header (bool): When True, the first row is not counted.
        delimiter (str): CSV delimiter.

    Returns:
        int: Number of rows parsed by ``csv.reader`` (0 for an empty file,
        even when ``has_header`` is True).
    """
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation requires for files passed to csv.reader.
    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=delimiter)
        if has_header:
            # Default of None avoids StopIteration on an empty file.
            next(reader, None)
        return sum(1 for _ in reader)

In [0]:
def split_csv(input_file, percentages, output_dir=None, has_header=None, delimiter=','):
    """
    Split CSV file into chunks based on percentages.

    The input is read twice through a single file handle: once to count the
    data rows, once to stream them into the chunk files. Chunk ``i`` is
    written to ``<stem>_chunk_<i><suffix>`` in ``output_dir``. A chunk whose
    computed size is zero produces no file, and its number is skipped.

    Args:
        input_file (str): Path to input CSV file
        percentages (list): List of 4 non-negative percentages that sum to 100
        output_dir (str): Output directory (default: same as input file)
        has_header (bool): True/False/None (None = auto-detect)
        delimiter (str): CSV delimiter

    Raises:
        ValueError: If there are not exactly 4 percentages, they do not sum
            to 100 (within 0.01), or any of them is negative.
        FileNotFoundError: If ``input_file`` does not exist.
    """

    # Validate percentages
    if len(percentages) != 4:
        raise ValueError("Must provide exactly 4 percentages")

    if abs(sum(percentages) - 100) > 0.01:
        raise ValueError(f"Percentages must sum to 100, got {sum(percentages)}")

    if any(pct < 0 for pct in percentages):
        raise ValueError("Percentages must be non-negative")

    # Setup paths
    input_path = Path(input_file)
    if not input_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_file}")

    if output_dir is None:
        output_dir = input_path.parent
    else:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Auto-detect header if not specified
    if has_header is None:
        has_header = detect_header(input_file, delimiter)
        print(f"Auto-detected header: {'Yes' if has_header else 'No'}")

    base_name = input_path.stem
    extension = input_path.suffix
    created = []  # (filename, data rows written) for each chunk file produced

    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        # First pass: grab the header (if any) and count the data rows.
        reader = csv.reader(infile, delimiter=delimiter)
        header_row = next(reader, None) if has_header else None
        total_rows = sum(1 for _ in reader)
        print(f"Total data rows: {total_rows}")

        # Calculate chunk sizes: the first chunks are rounded down (and
        # capped at what is left); the last chunk takes every remaining row.
        chunk_sizes = []
        for pct in percentages[:-1]:
            remaining = total_rows - sum(chunk_sizes)
            chunk_sizes.append(min(int(total_rows * pct / 100), remaining))
        chunk_sizes.append(total_rows - sum(chunk_sizes))
        print(f"Chunk sizes: {chunk_sizes}")

        # Second pass: rewind and stream the rows into the chunk files.
        infile.seek(0)
        reader = csv.reader(infile, delimiter=delimiter)
        if has_header:
            next(reader, None)

        for chunk_number, chunk_size in enumerate(chunk_sizes, start=1):
            # A zero-sized chunk must not consume a row that belongs to a
            # later chunk, so it is skipped without creating a file.
            if chunk_size <= 0:
                continue

            chunk_filename = f"{base_name}_chunk_{chunk_number}{extension}"
            chunk_path = output_dir / chunk_filename
            print(f"Creating chunk {chunk_number}: {chunk_filename}")

            # `with` guarantees the chunk file is closed even if writing fails.
            with open(chunk_path, 'w', newline='', encoding='utf-8') as outfile:
                writer = csv.writer(outfile, delimiter=delimiter)

                # Write header to each chunk if original had header
                if header_row:
                    writer.writerow(header_row)

                rows_written = 0
                for row in reader:
                    writer.writerow(row)
                    rows_written += 1
                    if rows_written >= chunk_size:
                        break

            created.append((chunk_filename, rows_written))

    print(f"\nSplit complete! Created {len(created)} chunks in {output_dir}")

    # Print summary (a chunk only exists when total_rows > 0)
    for chunk_filename, chunk_rows in created:
        actual_pct = (chunk_rows / total_rows) * 100
        print(f"  {chunk_filename}: {chunk_rows} rows ({actual_pct:.1f}%)")


In [0]:
def csv_to_json(csv_file, json_file):
    """
    Convert a CSV file with a header row into a JSON array of objects.

    Each data row becomes one JSON object keyed by the header names; all
    values are kept as strings, exactly as ``csv.DictReader`` yields them.
    Errors are reported on stdout rather than raised.

    Args:
        csv_file (str): Path to the input CSV file (read as UTF-8).
        json_file (str): Path of the JSON file to write (UTF-8, indent=2).

    Returns:
        None. Nothing is written when the CSV contains no data rows.
    """
    try:
        # Explicit UTF-8 keeps this in step with the files split_csv writes,
        # instead of depending on the platform's default encoding.
        with open(csv_file, 'r', newline='', encoding='utf-8') as f:
            data = list(csv.DictReader(f))

        if not data:
            print("No data in the file")
            return

        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        print(f"Successfully converted {csv_file} to {json_file}")

    except FileNotFoundError as e:
        # Either path can be the missing one (e.g. a non-existent output
        # directory), so report the path the OS actually complained about.
        missing = e.filename if e.filename is not None else csv_file
        print(f"Error: File {missing} not found")
    except Exception as e:
        print(f"Error: {e}")

In [0]:
def _xml_tag_name(column_name):
    """Turn a CSV column name into a string usable as an XML element name."""
    # Keep letters, digits, '_', '-' and '.'; everything else (spaces,
    # punctuation, ...) becomes '_'. This approximates the XML Name rules.
    cleaned = ''.join(
        ch if ch.isalnum() or ch in '_-.' else '_'
        for ch in column_name.strip()
    )
    # An element name may not be empty or start with a digit, '-' or '.'.
    if not cleaned or not (cleaned[0].isalpha() or cleaned[0] == '_'):
        cleaned = '_' + cleaned
    return cleaned


def csv_to_xml(csv_file, xml_file):
    """
    Convert a CSV file with a header row into an XML document.

    The output has a ``<data>`` root with one ``<record>`` per data row and
    one child element per column. Column names are sanitised into usable
    element names (e.g. "first name" -> "first_name", "2nd" -> "_2nd").
    Cells beyond the header width are written as ``<extra>`` elements.
    Errors are reported on stdout rather than raised.

    Args:
        csv_file (str): Path to the input CSV file (read as UTF-8).
        xml_file (str): Path of the XML file to write (UTF-8, indented,
            with an XML declaration).

    Returns:
        None. Nothing is written when the CSV contains no data rows.
    """
    try:
        # Explicit UTF-8 keeps this in step with the files split_csv writes,
        # instead of depending on the platform's default encoding.
        with open(csv_file, 'r', newline='', encoding='utf-8') as f:
            data = list(csv.DictReader(f))

        if not data:
            print("No data in the file")
            return

        root = ET.Element("data")

        for row in data:
            record = ET.SubElement(root, "record")
            for key, value in row.items():
                if key is None:
                    # DictReader collects surplus cells in a list under the
                    # key None; emit them individually so serialization
                    # does not fail on a None tag / list text.
                    for extra in value:
                        ET.SubElement(record, "extra").text = extra
                    continue
                ET.SubElement(record, _xml_tag_name(key)).text = value

        ET.indent(root, space="  ")
        tree = ET.ElementTree(root)
        tree.write(xml_file, encoding='utf-8', xml_declaration=True)

        print(f"Successfully converted {csv_file} to {xml_file}")

    except FileNotFoundError as e:
        # Either path can be the missing one (e.g. a non-existent output
        # directory), so report the path the OS actually complained about.
        missing = e.filename if e.filename is not None else csv_file
        print(f"Error: File {missing} not found")
    except Exception as e:
        print(f"Error: {e}")