In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go

# Set plot styles
sns.set_theme(style="whitegrid")  # seaborn's own theming call, used here instead of plt.style.use('seaborn')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# For displaying all columns in pandas dataframes
# (None removes the column limit, so wide frames are shown in full).
pd.set_option('display.max_columns', None)


In [1]:
# Define functions to load CSV data from datasets/ folder and its subfolders
import csv

def detect_delimiter(file_path, encoding='latin-1', bytes_to_read=4096, errors=None):
    """
    Detect the delimiter used in a CSV file by inspecting its header line.

    The candidate (',', ';', tab, '|') that occurs most often in the header,
    outside of double-quoted fields, is chosen. Ties are resolved in that
    candidate order.

    Parameters:
    -----------
    file_path : str
        Path to the CSV file
    encoding : str, default='latin-1'
        Encoding to use for reading the file
    bytes_to_read : int, default=4096
        Maximum number of characters of the first line to inspect.
        A value of None, 0 or a negative number inspects the whole line.
    errors : str or None, default=None
        Decoding error handler forwarded to open() (e.g. 'replace').
        None keeps open()'s default strict behaviour.

    Returns:
    --------
    str
        Detected delimiter character. ',' is returned when no candidate is
        present in the header or when the file cannot be read.
    """
    candidates = [',', ';', '\t', '|']
    # readline() treats a negative size as "no limit".
    limit = bytes_to_read if bytes_to_read and bytes_to_read > 0 else -1

    try:
        with open(file_path, 'r', encoding=encoding, errors=errors) as csvfile:
            # Stops at the first newline or after `limit` characters,
            # whichever comes first.
            header = csvfile.readline(limit)
    except (OSError, UnicodeError, LookupError) as e:
        # Missing/unreadable file, undecodable header, or unknown encoding name.
        print(f"Error detecting delimiter in {file_path}: {e}")
        return ','  # Default to comma on error

    # Count each candidate, skipping characters inside double quotes so that a
    # quoted column name such as "last, first" does not vote for ','.
    counts = dict.fromkeys(candidates, 0)
    in_quotes = False
    for char in header:
        if char == '"':
            in_quotes = not in_quotes
        elif not in_quotes and char in counts:
            counts[char] += 1

    # max() returns the first maximal element, so ties follow `candidates` order.
    best = max(candidates, key=counts.get)
    return best if counts[best] > 0 else ','  # Default to comma if nothing found

def load_csv_data(file_path, encoding='latin-1', **kwargs):
    """
    Load a CSV file into a pandas DataFrame with encoding handling and delimiter detection.

    Strategy:
      1. Strict parse with the requested encoding, then 'cp1252', then
         'utf-8' with undecodable bytes replaced. The next encoding is only
         tried after a UnicodeDecodeError.
      2. If a strict parse fails with a parsing error, retry once while
         skipping malformed lines.

    Only I/O and parsing/decoding errors are handled. Programming errors
    (for example a NameError because pandas was not imported before calling
    this function) propagate instead of being reported as a failed load.

    Parameters:
    -----------
    file_path : str
        Path to the CSV file
    encoding : str, default='latin-1'
        Encoding to use for reading the CSV file
    **kwargs :
        Additional arguments to pass to pd.read_csv(). An explicit `sep` or
        `delimiter` disables auto-detection and is used as given.

    Returns:
    --------
    pandas.DataFrame or None
        DataFrame containing the CSV data, or None when the file could not
        be read or parsed.
    """
    # An explicit sep/delimiter from the caller wins over auto-detection;
    # forwarding it next to our own `delimiter=` would make read_csv raise.
    explicit_sep = kwargs.pop('sep', None)
    explicit_delimiter = kwargs.pop('delimiter', None)
    forced_delimiter = explicit_delimiter if explicit_delimiter is not None else explicit_sep

    # Caller options override the QUOTE_MINIMAL default instead of colliding with it.
    base_options = {'quoting': csv.QUOTE_MINIMAL, **kwargs}

    # (encoding for read_csv, encoding for delimiter detection, extra read_csv options)
    attempts = (
        (encoding, encoding, {}),
        # cp1252 is common in Brazil/Portugal
        ('cp1252', 'cp1252', {}),
        # Last resort. The candidate delimiters are ASCII and latin-1 can
        # decode any byte, so detection is done with latin-1 and cannot fail
        # on decoding. `encoding_errors` needs pandas >= 1.3.
        ('utf-8', 'latin-1', {'encoding_errors': 'replace'}),
    )

    read_encoding = encoding
    options = base_options
    for read_encoding, sniff_encoding, extra in attempts:
        options = {**base_options, **extra}
        if extra:
            print(f"Warning: Encoding issues with {file_path}, using utf-8 with replacement characters")

        if forced_delimiter is not None:
            delimiter = forced_delimiter
        else:
            delimiter = detect_delimiter(file_path, encoding=sniff_encoding)
            print(f"Detected delimiter: '{delimiter}' for {file_path}")

        try:
            df = pd.read_csv(file_path, encoding=read_encoding, delimiter=delimiter, **options)
        except UnicodeDecodeError:
            # Must precede ValueError (its base class): move on to the next encoding.
            continue
        except OSError as e:
            # Missing or unreadable file: retrying with other options cannot help.
            print(f"Error loading {file_path}: {e}")
            return None
        except ValueError as e:
            # pandas' ParserError and EmptyDataError derive from ValueError.
            print(f"Error loading {file_path} with detected delimiter: {e}")
            break

        print(f"Successfully loaded {file_path} with {read_encoding} encoding and '{delimiter}' delimiter")
        return df

    # Lenient retry: skip malformed lines. Unless the caller forced a
    # delimiter, ';' is used, as in the original fallback.
    fallback_delimiter = forced_delimiter if forced_delimiter is not None else ';'
    try:
        try:
            # For newer pandas versions
            df = pd.read_csv(file_path, encoding=read_encoding, delimiter=fallback_delimiter,
                             **{**options, 'on_bad_lines': 'skip'})
        except TypeError:
            # For older pandas versions, which do not know `on_bad_lines`
            df = pd.read_csv(file_path, encoding=read_encoding, delimiter=fallback_delimiter,
                             **{**options, 'error_bad_lines': False, 'warn_bad_lines': True})
    except (OSError, ValueError) as e:
        print(f"All attempts to load {file_path} failed: {e}")
        return None

    print(f"Successfully loaded {file_path} with errors skipped (using '{fallback_delimiter}' delimiter)")
    return df

def list_available_datasets(directory="/home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets", recursive=True):
    """
    List all CSV files available in the specified directory and its subfolders.

    Only names ending in the lowercase suffix '.csv' are matched. The result
    is sorted so the listing does not depend on filesystem enumeration order.

    Parameters:
    -----------
    directory : str, default="/home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets"
        Directory to search for CSV files
    recursive : bool, default=True
        Whether to search recursively in subfolders

    Returns:
    --------
    list
        Sorted list of CSV file paths. Empty when `directory` does not exist,
        is not a directory, or contains no CSV files.
    """
    import os

    if not os.path.exists(directory):
        print(f"Directory {directory} does not exist.")
        return []
    if not os.path.isdir(directory):
        # Without this guard os.listdir() raises NotADirectoryError for a file path.
        print(f"{directory} is not a directory.")
        return []

    if recursive:
        # Walk through all subdirectories
        found = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(directory)
            for name in names
            if name.endswith('.csv')
        ]
    else:
        # Only look in the top directory; the isfile check drops folders named '*.csv'
        found = [
            os.path.join(directory, name)
            for name in os.listdir(directory)
            if name.endswith('.csv') and os.path.isfile(os.path.join(directory, name))
        ]

    csv_files = sorted(found)
    scope = " and its subfolders" if recursive else ""

    if not csv_files:
        print(f"No CSV files found in {directory}" + scope)
    else:
        print(f"Found {len(csv_files)} CSV files in {directory}" + scope + ":")
        for file in csv_files:
            print(f"  - {file}")

    return csv_files

def load_all_datasets(directory="/home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets", recursive=True, encoding='latin-1', **kwargs):
    """
    Load all CSV files from a directory and its subfolders into a dictionary of DataFrames.

    The returned dictionary mirrors the folder hierarchy below `directory`:
    each folder becomes a nested dict and each CSV file becomes an entry
    keyed by its file name without the extension. For example
    'sub/file.csv' is stored at result['sub']['file'].

    Parameters:
    -----------
    directory : str, default="/home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets"
        Directory containing CSV files
    recursive : bool, default=True
        Whether to search recursively in subfolders
    encoding : str, default='latin-1'
        Encoding to use for reading CSV files
    **kwargs :
        Additional arguments to pass to pd.read_csv()

    Returns:
    --------
    dict
        Nested dictionary of whatever load_csv_data() returns for each file
        (a DataFrame, or None when that file could not be loaded).
    """
    import os

    csv_files = list_available_datasets(directory, recursive)
    dataframes = {}

    for file_path in csv_files:
        # Folder components of the path relative to the base directory
        rel_path = os.path.relpath(file_path, directory)
        *folders, file_name = rel_path.split(os.sep)

        # Descend through (and create on demand) one nested dict per folder
        current_level = dataframes
        for folder in folders:
            current_level = current_level.setdefault(folder, {})

        # Strip only the trailing extension; str.replace('.csv', '') would also
        # remove '.csv' from the middle of names such as 'export.csv_v2.csv'.
        stem, _extension = os.path.splitext(file_name)
        current_level[stem] = load_csv_data(file_path, encoding=encoding, **kwargs)

    return dataframes

def get_dataset(path, base_dir="/home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets", encoding='latin-1', **kwargs):
    """
    Load one CSV file addressed by its path relative to the base directory.

    The '.csv' suffix is appended when the joined path does not already end
    with it, so both 'subfolder/file' and 'subfolder/file.csv' are accepted.

    Parameters:
    -----------
    path : str
        Relative path to the CSV file from base_dir (e.g., 'subfolder/file.csv')
    base_dir : str, default="/home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets"
        Base directory containing datasets
    encoding : str, default='latin-1'
        Encoding to use for reading the CSV file
    **kwargs :
        Additional arguments to pass to pd.read_csv()

    Returns:
    --------
    pandas.DataFrame
        Whatever load_csv_data() returns for the resolved path
    """
    import os

    csv_suffix = '.csv'
    candidate = os.path.join(base_dir, path)
    # Append the extension only when the caller left it off.
    resolved = candidate if candidate.endswith(csv_suffix) else candidate + csv_suffix

    return load_csv_data(resolved, encoding=encoding, **kwargs)


In [2]:
# Example usage:
# available_files = list_available_datasets()
# Load every CSV into a nested dict keyed by folder name, then by file stem.
# Later cells read `data_dict`, so it has to be assigned here for a fresh
# Restart & Run All to work.
data_dict = load_all_datasets()
# The datatran file lives in its own subfolder (datasets/datatran2024/datatran2024.csv).
df = data_dict["datatran2024"]["datatran2024"]
# Or load a specific file without loading everything:
# df = get_dataset("datatran2024/datatran2024.csv")

Found 12 CSV files in /home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets and its subfolders:
  - /home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets/acidentes2024_todas_causas_tipos/acidentes2024_todas_causas_tipos.csv
  - /home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets/acidentes2024/acidentes2024.csv
  - /home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets/datatran2024/datatran2024.csv
  - /home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets/infrações_2024/ajustados_2024/infrações2024_01.csv
  - /home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets/infrações_2024/ajustados_2024/infrações2024_02.csv
  - /home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets/infrações_2024/ajustados_2024/infrações2024_03.csv
  - /home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets/infrações_2024/ajustados_2024/infrações2024_04.csv
  - /home/hpfeffer/development/poc-rhoai/synth-data-sdv/datasets/infrações_2024/ajustados_2024/infrações20

In [8]:
# Display the nested {folder: {file stem: DataFrame-or-None}} mapping built by
# load_all_datasets(); a None leaf means load_csv_data() could not load that file.
data_dict

{'acidentes2024_todas_causas_tipos': {'acidentes2024_todas_causas_tipos': None},
 'acidentes2024': {'acidentes2024': None},
 'datatran2024': {'datatran2024': None},
 'infrações_2024': {'ajustados_2024': {'infrações2024_01': None,
   'infrações2024_02': None,
   'infrações2024_03': None,
   'infrações2024_04': None,
   'infrações2024_05': None,
   'infrações2024_06': None,
   'infrações2024_07': None,
   'infrações2024_08': None,
   'infrações2024_09': None}}}

In [10]:
# NOTE: this selects the dict for the 'datatran2024' folder, not a DataFrame.
# load_all_datasets() nests by folder, so the frame itself sits one level
# deeper under the file stem.
data_dict['datatran2024']

SyntaxError: invalid syntax (2269211276.py, line 1)