# Raw to CSV

Process the raw CSV files from the I-V Curve Tracer to get 1 CSV file with only the relevant columns.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import time
from datetime import datetime 

In [3]:
base_path = Path(r"D:\Research\Data") # Base path for raw data files
output_csv = Path("unfiltered_dataset.csv") # Output CSV file path
log_file = Path("processed_files_2.txt") # Log file to keep track of processed files
error_log = Path("error_log.txt") # Log file for errors
missing_csvs = Path("missing_csv.txt") # Log missing csv filenames
print_every = 2000 # Print progress every 2000 files

### Identified columns on CSV

In [3]:
base_row1_column_names = [
    'date', 'time_start', 'voltage_0', 'current_0', 'power_0', 'wavelength_0', 'spectralirr_0',
    'modtemp_c', 'modtemp_l', 'cell_v', 'irr_horiz_start', 'irr_incl20_start', 'airtemp',
    'humidity_rel', 'pressure_rel', 'air_density', 'wind_speed_kmh',
    'wind_dir', 'humidity_abs', 'pressure_abs', 'wind_speed_ms', 'irr_east_start',
    'irr_west_start', 'irr_floor_ref_start'
]

full_row1_column_names = base_row1_column_names + ['irr_diffuse_start', 'irr_incl15_start']

base_row2_column_names = [
    'module_name', 'time_end', 'voltage_1', 'current_1',
    'power_1', 'wavelength_1', 'spectralirr_1', 'cell_v_end',
    'irr_horiz_end', 'irr_incl20_end', 'irr_east_end', 'irr_west_end',
    'irr_floor_ref_end']

full_row2_column_names = base_row2_column_names + ['irr_diffuse_end', 'irr_incl15_end']

### Error log

This will log errors into a file with the corresponding exception.

In [9]:
def log_error(filename, message, exception=None):
    if exception:
        full_msg = f"{filename} — {message}: {repr(exception)}"
    else:
        full_msg = f"{filename} — {message}"

    # Print to console
    print("❌", full_msg)

    # Append to log file
    with open(error_log, "a", encoding="utf-8") as err_f:
        err_f.write(full_msg + "\n")

# Processing functions

In [5]:
def process_file(file_path):
    filename = file_path.name

    # --- Robust Read Logic from V3 Script ---
    opts = dict(sep=';', header=None, na_values='--', engine='python')
    try:
        # Try default UTF-8 first
        df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='strict', **opts)
    except UnicodeDecodeError:
        # Fallback to latin1 if UTF-8 fails (handles the \xe3 byte)
        try:
            df = pd.read_csv(file_path, encoding='latin1', encoding_errors='replace', **opts)
        except Exception as e:
            log_error(filename, "Error reading CSV even with fallback", e)
            return None
    except Exception as e:
        log_error(filename, "Error reading CSV", e)
        return None

    # Check number of columns
    num_cols = df.shape[1]
    if num_cols == 26:
        current_row1_names = full_row1_column_names
        current_row2_names = full_row2_column_names
    elif num_cols == 24:
        current_row1_names = base_row1_column_names
        current_row2_names = base_row2_column_names
    else:
        log_error(filename, f"Unexpected column count: {num_cols}", None)
        return None
    
    # Parse Row 1
    try:
        row1_list = df.iloc[0, :].dropna().tolist()
        # Check if dropna produced the correct length
        if len(row1_list) == len(current_row1_names):
            df_row1 = pd.DataFrame([row1_list], columns=current_row1_names)
        else:
            # Fallback for Row 1
            df_row1 = pd.DataFrame([df.iloc[0, :len(current_row1_names)].tolist()], columns=current_row1_names)
        row1 = df_row1.iloc[0]
    except Exception as e:
        log_error(filename, "Error parsing Row 1 metadata", e)
        return None

    # Parse Row 2
    try:
        row2_list = df.iloc[1, :].dropna().tolist()
        # Check if dropna produced the correct length (13 or 15)
        if len(row2_list) == len(current_row2_names):
            df_row2 = pd.DataFrame([row2_list], columns=current_row2_names)
        else:
            # Fallback for Row 2: Explicitly pull indices to handle the gaps
            row2_idxs = [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 21, 22, 23]
            if num_cols == 26:
                row2_idxs += [24, 25] # Add Diffuse and GTI 15 for 26-col files
            
            df_row2 = pd.DataFrame([df.iloc[1, row2_idxs].tolist()], columns=current_row2_names)
        row2 = df_row2.iloc[0]
    except Exception as e:
        log_error(filename, "Error parsing Row 2 metadata", e)
        return None

    try:
        # Generate timestamp from filename
        name = filename.replace('.csv', '')
        # Split by underscore
        parts = name.split('_')
        # Extract date/time parts (skip module name)
        day = int(parts[1])
        month = int(parts[2])
        year = int(parts[3])
        hour = int(parts[4])
        minute = int(parts[5])
        second_str = parts[6].split(' ')[0]
        second = int(second_str)
        
        timestamp = datetime(year, month, day, hour, minute, second)
        
        module_name = row2["module_name"]
    except Exception as e:
        log_error(filename, "Missing metadata fields", e)
        return None
    
    # Compute Spectral Integral
    try:
        wavelength = df.iloc[:2048, 5].copy() # Columns 5 is wavelength (nm) 
        spectral_irradiance = df.iloc[:2048, 6].copy() # Column 6 is spectral irradiance (W/m2/nm)
        computed_irradiance = np.trapezoid(spectral_irradiance, x=wavelength)
    except Exception as e:
        computed_irradiance = np.nan
        # log_error(filename, "Error computing spectral integral", e)
        # return None

    # Get Irradiances
    try:
        # Start irradiances (from row1)
        irr_horiz_start = row1["irr_horiz_start"]
        irr_incl20_start = row1["irr_incl20_start"]
        irr_incl15_start = row1.get("irr_incl15_start", np.nan)
        irr_east_start = row1["irr_east_start"]
        irr_west_start = row1["irr_west_start"]
        irr_floor_ref_start = row1["irr_floor_ref_start"]
        
        # End irradiances (from row2)
        irr_horiz_end = row2["irr_horiz_end"]
        irr_incl20_end = row2["irr_incl20_end"]
        irr_incl15_end = row2.get("irr_incl15_end", np.nan)
        irr_east_end = row2["irr_east_end"]
        irr_west_end = row2["irr_west_end"]
        irr_floor_ref_end = row2["irr_floor_ref_end"]
    except Exception as e:
        log_error(filename, "Missing irradiance data", e)
        return None

    # Get temperature
    try:
        modtemp_c = row1["modtemp_c"]
        modtemp_l = row1["modtemp_l"]
    except Exception as e:
        log_error(filename, "Error reading module temperatures", e)
        return None

    # I-V & P-V Curve validation
    try: 
        measurements = df.iloc[:, [2, 3, 4]].copy()
        measurements.columns = ['voltage', 'current', 'power']
        # Discard the first values (start from voltage close to 0)
        min_idx = measurements['voltage'].idxmin()
        measurements = measurements.iloc[min_idx:].reset_index(drop=True)

    except Exception as e:
        log_error(filename, "Error processing I-V curve", e)
        return None

    try:
        V = measurements["voltage"]
        I = measurements["current"]
        P = measurements["power"]

        Voc = V.max()
        Pmpp = P.max()
        mpp_idx = P.idxmax()
        Vmpp = V[mpp_idx]
        Impp = I[mpp_idx]
        Imin = I.min()
        
    except Exception as e:
        log_error(filename, "Error computing electrical parameters", e)
        return None

    # Delete tail
    try:
        d = np.mean(np.abs(np.diff(I[:200])))
        diffs_20 = np.abs(np.diff(I[:20]))
        # Compare diffs20 to d
        idxs_cola = [i for i, diff in enumerate(diffs_20) if diff > d]
        if idxs_cola:
            corte_inicial = idxs_cola[-1] + 1
            measurements = measurements.iloc[corte_inicial:].reset_index(drop=True)
    except Exception as e:
        log_error(filename, "Error cleaning up cola noise", e)

    # Find Isc using linear regression on the initial segment (V < Voc/4)
    try:
        # Define the threshold for the initial "flat" segment of the curve
        # Using the Voc previously computed in your function
        v_limit = Voc / 4 
        
        # Select points below the threshold
        mask_ini = (measurements["voltage"] < v_limit)
        
        # Ensure we have enough points (at least 3) to perform a stable regression
        if mask_ini.sum() >= 3:
            vx = measurements.loc[mask_ini, "voltage"].values
            iy = measurements.loc[mask_ini, "current"].values
            
            # Perform Linear Regression: I = a*V + b
            # A becomes a matrix of [Voltage, 1]
            A = np.vstack([vx, np.ones_like(vx)]).T
            a, b = np.linalg.lstsq(A, iy, rcond=None)[0]
            
            # The intercept 'b' is the estimated current at V=0 (Isc)
            Isc = float(b)
        else:
            # Fallback for very sparse curves: take the current closest to V=0
            ordered_meas = measurements.sort_values(by="voltage")
            Isc = float(ordered_meas["current"].iloc[0])
            
    except Exception as e:
        log_error(filename, "Error estimating Isc via linear regression", e)
        return None
    
    # Build final dataframe
    try:
        final_df = pd.DataFrame({
            # Metadata
            'filename': [filename],
            'module_name': [module_name],
            'timestamp': [timestamp],

            # MPP
            'Vmpp': [Vmpp],
            'Impp': [Impp],
            'Pmpp': [Pmpp],

            # Electrical characteristics
            'Voc': [Voc],
            'Imin': [Imin],
            'Isc': [Isc],

            # Irradiance
            'G_spec_int': [computed_irradiance],
            'G_tilt20_start': [irr_incl20_start],
            'G_tilt15_start': [irr_incl15_start],
            'G_horiz_start': [irr_horiz_start],
            'G_east_start': [irr_east_start],
            'G_west_start': [irr_west_start],
            'G_refl_start': [irr_floor_ref_start],

            'G_horiz_end': [irr_horiz_end],
            'G_tilt20_end': [irr_incl20_end],
            'G_tilt15_end': [irr_incl15_end],
            'G_east_end': [irr_east_end],
            'G_west_end': [irr_west_end],
            'G_refl_end': [irr_floor_ref_end],

            # Environmental characteristics
            'module_temperature_center': [modtemp_c],
            'module_temperature_lateral': [modtemp_l],
            'air_temperature': [row1['airtemp']],
            'relative_humidity': [row1['humidity_rel']],
            'air_density': [row1['air_density']],
            'abs_pressure': [row1['pressure_abs']],
            'wind_speed_ms': [row1['wind_speed_ms']],
            'wind_direction': [row1['wind_dir']]
        })
    except Exception as e:
        log_error(filename, "Error creating final dataframe", e)
        return None

    return final_df

In [4]:
# Get all CSV files from 2023-2025
all_files = []
skip_prefixes = ('MS711', 'WS500', 'PV')

for year in range(2022, 2026):  # 2022-2025
    year_files = list(base_path.glob(f"{year}/**/*.csv"))
    # Filter out unwanted prefixes
    year_files = [f for f in year_files if not f.name.startswith(skip_prefixes)]
    all_files.extend(year_files)
    print(f"Found {len(year_files)} files in {year}")

print(f"\nTotal: {len(all_files)} CSV files from 2022-2025 (excluding MS711, WS500, PV)")

# Load already processed files
processed_files = set()
if log_file.exists():
    with open(log_file, 'r') as f:
        processed_files = set(line.strip() for line in f)
    print(f"Already processed: {len(processed_files)} files")

# Filter to only new files
files_to_process = [f for f in all_files if f.name not in processed_files]
total_files = len(files_to_process)

print(f"Files to process: {total_files}")

# Determine if we need header
write_header = not output_csv.exists() or output_csv.stat().st_size == 0

Found 419197 files in 2022
Found 425151 files in 2023
Found 440615 files in 2024
Found 424753 files in 2025

Total: 1709716 CSV files from 2022-2025 (excluding MS711, WS500, PV)
Files to process: 1709716


# Process Loop

In [None]:
processed_count = 0
skipped_due_to_error = 0
start_time = time.time()

with open(output_csv, 'a', newline='', encoding='utf-8') as output_f:
    for i, file_path in enumerate(files_to_process):  # ✅ Use files_to_process
        filename = file_path.name  # ✅ Just filename for consistency
        
        try:
            result_df = process_file(file_path)
        except Exception as e:
            print(f"Unhandled error on file: {filename}")
            skipped_due_to_error += 1
            continue

        if result_df is None:
            skipped_due_to_error += 1
            continue

        result_df.to_csv(output_f, header=write_header, index=False)
        write_header = False

        with open(log_file, 'a') as log_f:
            log_f.write(filename + '\n')  

        processed_count += 1

        # Progress print
        if processed_count % print_every == 0 or processed_count == 1:
            elapsed = time.time() - start_time
            avg_time = elapsed / processed_count
            remaining = avg_time * (total_files - processed_count)
            print(
                f"[{processed_count}/{total_files}] processed "
                f"({processed_count/total_files * 100:.1f}%) "
                f"— Skipped: {skipped_due_to_error} "
                f"— ETA: {remaining / 60:.1f} min"
            )


# Final summary
print(f"\n✅ Extraction complete. Processed: {processed_count}, Skipped due to error: {skipped_due_to_error}")

❌ CDF1150A1_26_04_2022_13_11_29.csv — Unexpected column count: 18
❌ CDF1150A1_26_04_2022_13_16_29.csv — Unexpected column count: 18
❌ DUSTCS6K270P_26_04_2022_13_10_01.csv — Unexpected column count: 18
❌ DUSTCS6K270P_26_04_2022_13_15_01.csv — Unexpected column count: 18
❌ GSA060_26_04_2022_13_11_07.csv — Unexpected column count: 18
❌ GSA060_26_04_2022_13_16_07.csv — Unexpected column count: 18
❌ LG345N1C_26_04_2022_13_10_45.csv — Unexpected column count: 18
❌ LG345N1C_26_04_2022_13_15_45.csv — Unexpected column count: 18
❌ LG370Q1C_26_04_2022_13_10_23.csv — Unexpected column count: 18
❌ LG370Q1C_26_04_2022_13_15_23.csv — Unexpected column count: 18
❌ _26_04_2022_12_21_51.csv — Unexpected column count: 17
❌ _26_04_2022_12_22_13.csv — Unexpected column count: 17
❌ _26_04_2022_13_11_51.csv — Unexpected column count: 17
❌ _26_04_2022_13_12_13.csv — Unexpected column count: 17
❌ _26_04_2022_13_13_05.csv — Unexpected column count: 17
❌ _10_06_2022___.csv — Unexpected column count: 7
[1/732] p

# Adding more values

In [11]:
def extract_iv(file_path):
    try:
        # Optimization: ONLY read Voltage and Current columns
        df_raw = pd.read_csv(file_path, sep=';', header=None, na_values='--', engine='python', encoding='latin1')
        num_cols = df_raw.shape[1]

        if num_cols not in [24, 26]:
            return None
        
        df_raw = df_raw.iloc[:, [2, 3]]  # Voltage and Current columns
        df_raw.columns = ['v', 'i']
        
        # Initial cleanup
        min_idx = df_raw['v'].idxmin()
        df_clean = df_raw.iloc[min_idx:].reset_index(drop=True)
        v, i = df_clean['v'], df_clean['i']
        
        if len(v) < 10: return None
        
        # 1. Basic Metrics
        V_ini, I_ini = v.iloc[0], i.iloc[0]
        Voc_obs = v.max()
        P = v * i
        Pmpp = P.max()
        mpp_idx = P.idxmax()
        Vmpp, Impp = v[mpp_idx], i[mpp_idx]
        
        # 2. Isc & Rsh (Initial Segment: V < Voc/4)
        mask_ini = v < (Voc_obs / 4)
        Isc, R2_Isc, Rsh = np.nan, np.nan, np.nan
        if mask_ini.sum() >= 3:
            vx, iy = v[mask_ini], i[mask_ini]
            A = np.vstack([vx, np.ones_like(vx)]).T
            m, b = np.linalg.lstsq(A, iy, rcond=None)[0]
            Isc = float(b)
            Rsh = 1 / abs(m) if abs(m) > 1e-9 else np.nan
            # R2 Calculation
            yhat = m*vx + b
            ss_res = np.sum((iy - yhat)**2)
            ss_tot = np.sum((iy - iy.mean())**2)
            R2_Isc = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0

        # 3. Voc & Rs (Final Segment: V > 0.9 * Voc)
        mask_fin = v > (Voc_obs * 0.9)
        Voc_extrap, R2_Voc, Rs = np.nan, np.nan, np.nan
        if mask_fin.sum() >= 3:
            vx, iy = v[mask_fin], i[mask_fin]
            A = np.vstack([vx, np.ones_like(vx)]).T
            m, b = np.linalg.lstsq(A, iy, rcond=None)[0]
            # Voc is where I=0 -> 0 = mV + b -> V = -b/m
            Voc_extrap = -b / m if abs(m) > 1e-9 else Voc_obs
            Rs = 1 / abs(m) if abs(m) > 1e-9 else np.nan
            # R2 Calculation
            yhat = m*vx + b
            ss_res = np.sum((iy - yhat)**2)
            ss_tot = np.sum((iy - iy.mean())**2)
            R2_Voc = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0

        # 4. Fill Factor
        # Using extrapolated values if available for better physics
        Isc_val = Isc if Isc > 0 else i.iloc[0]
        Voc_val = Voc_extrap if Voc_extrap > 0 else Voc_obs
        FF = Pmpp / (Isc_val * Voc_val) if (Isc_val * Voc_val) > 0 else np.nan

        return {
            'filename': file_path.name,
            'Isc': Isc, 'R2_Isc': R2_Isc, 'Rsh': Rsh,
            'Voc': Voc_extrap, 'R2_Voc': R2_Voc, 'Rs': Rs,
            'FF': FF, 'V_ini': V_ini, 'I_ini': I_ini
        }
    except:
        return None

In [12]:
import csv
from tqdm import tqdm

output_patch_csv = "iv_dna_patch.csv"
fieldnames = ['filename', 'Isc', 'R2_Isc', 'Rsh', 'Voc', 'R2_Voc', 'Rs', 'FF', 'V_ini', 'I_ini']

# Check if we need to write the header
file_exists = Path(output_patch_csv).exists()

with open(output_patch_csv, mode='a', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    if not file_exists:
        writer.writeheader()

    # Using tqdm gives you a live ETA and "files per second" metric
    for file_path in tqdm(files_to_process, desc="Extracting IV-DNA"):
        res = extract_iv(file_path)
        if res:
            writer.writerow(res)
            # No need to append to a list; the data is now safe on the HDD

Extracting IV-DNA: 100%|██████████| 1709716/1709716 [7:12:10<00:00, 65.94it/s]   


In [21]:
# 1. Convert the new DNA results to a DataFrame
df_dna_patch = pd.read_csv('iv_dna_patch.csv')

# 2. Load the original master dataset
df_master = pd.read_csv('unfiltered_dataset.csv')

# 3. Drop the old/incorrect columns from the master 
# so they can be replaced by the new regression-based ones
cols_to_replace = ['Isc', 'Voc']
df_master = df_master.drop(columns=[c for c in cols_to_replace if c in df_master.columns])

# 4. Merge
# Using 'left' join ensures we keep all rows, 
# even those where the IV-DNA extraction might have failed.
df_final = pd.merge(df_master, df_dna_patch, on='filename', how='left')

# 5. Save the new curated version
df_final.to_csv('curated_dataset_v2.csv', index=False)

  df_master = pd.read_csv('unfiltered_dataset.csv')


In [23]:
df_final.head().T

Unnamed: 0,0,1,2,3,4
filename,CDF1150A1_01_01_2024_06_38_06.csv,CDF1150A1_01_01_2024_06_43_06.csv,CDF1150A1_01_01_2024_06_48_06.csv,CDF1150A1_01_01_2024_06_53_06.csv,CDF1150A1_01_01_2024_06_58_06.csv
module_name,CDF1150A1,CDF1150A1,CDF1150A1,CDF1150A1,CDF1150A1
timestamp,2024-01-01 06:38:06,2024-01-01 06:43:06,2024-01-01 06:48:06,2024-01-01 06:53:06,2024-01-01 06:58:06
Vmpp,37.232705,38.90823,39.671031,42.983558,43.609355
Impp,0.030294,0.035708,0.042457,0.051073,0.061117
Pmpp,1.127916,1.389348,1.684322,2.195303,2.66526
Imin,0.000415,0.000069,-0.000025,-0.000029,-0.001578
G_spec_int,19.115304,22.084789,25.572165,31.336795,36.211122
G_tilt20_start,31.1,35.09,39.07,45.24,50.83
G_tilt15_start,20.21,23.54,27.16,32.96,38.18


# APE and diffuse irradiance

In [9]:
# Constants
H_PLANCK = 6.626e-34  # J*s
C_LIGHT = 2.998e8     # m/s
Q_ELEM = 1.602e-19    # Coulombs
HC_OVER_Q = (H_PLANCK * C_LIGHT) / Q_ELEM  * 1e9 # in units of eV*nm

def get_APE_and_diff_irr(file_path):
    
    try:
        df_raw = pd.read_csv(file_path, sep=';', header=None, na_values='--', engine='python', encoding='latin1', on_bad_lines='skip')
    except Exception as e:
        return None
    num_cols = df_raw.shape[1]

    if num_cols not in [24, 26]:
        return None
    
    # Extract diffuse irradiance
    irr_diffuse_start = df_raw.iloc[0, 24] if num_cols == 26 else np.nan 
    irr_diffuse_end = df_raw.iloc[1, 24] if num_cols == 26 else np.nan
    
    # APE 
    
    # Compute Spectral Integral
    try:
        wavelength = df_raw.iloc[:2048, 5].copy() # Columns 5 is wavelength (nm) 
        spectral_irradiance = df_raw.iloc[:2048, 6].copy() # Column 6 is spectral irradiance (W/m2/nm)
        integral_E = np.trapezoid(spectral_irradiance, x=wavelength)

        weighted_irradiance = spectral_irradiance * wavelength
        integral_E_lambda = np.trapezoid(weighted_irradiance, x=wavelength)

        ape_val = HC_OVER_Q * (integral_E / integral_E_lambda)

    except Exception as e:
        integral_E = np.nan
        ape_val = np.nan

    return {
        'filename': file_path.name,
        'APE': ape_val,
        'G_diffuse_start': irr_diffuse_start,
        'G_diffuse_end': irr_diffuse_end
    }


In [10]:
processed_files = set()
if Path(output_csv).exists():
    try:
        # Read only the 'filename' column to save memory
        df_done = pd.read_csv(output_csv, usecols=['filename'])
        processed_files = set(df_done['filename'])
        print(f"Found {len(processed_files)} files already processed.")
    except Exception:
        # If file is empty or corrupt, we just start from 0 (or you can check manually)
        pass

# --- STEP 2: Filter the main list ---
# This keeps only files whose names are NOT in the processed set.
# Using Path(f).name ensures this works whether your list contains strings or Path objects.
files_remaining = [f for f in files_to_process if Path(f).name not in processed_files]

print(f"Starting run for the remaining {len(files_remaining)} files...")

Found 855139 files already processed.
Starting run for the remaining 854577 files...


In [11]:
files_to_process = files_remaining

In [12]:
import csv
from tqdm import tqdm
from pathlib import Path

# Output file name
output_csv = "ape_diffuse_results.csv"

# These must match exactly the keys in the dictionary returned by your function
fieldnames = ['filename', 'APE', 'G_diffuse_start', 'G_diffuse_end']

# Check if we need to write the header
file_exists = Path(output_csv).exists()

with open(output_csv, mode='a', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    
    # Write header only if the file is new
    if not file_exists:
        writer.writeheader()

    # Iterate through files with a progress bar
    for file_path in tqdm(files_to_process, desc="Computing APE & Diffuse"):
        res = get_APE_and_diff_irr(file_path)
        
        if res:
            writer.writerow(res)

Computing APE & Diffuse: 100%|██████████| 854577/854577 [3:26:43<00:00, 68.90it/s]   


In [2]:
df_ape = pd.read_csv('ape_diffuse_results.csv')
df_master = pd.read_csv('unfiltered_dataset_v2.csv')
df_final = pd.merge(df_master, df_ape, on='filename', how='left')

# 5. Save the new curated version
df_final.to_csv('curated_dataset_v3.csv', index=False)

  df_master = pd.read_csv('unfiltered_dataset_v2.csv')
