# Raw to CSV

**Goal**: Process the raw CSV files from the I-V Curve Tracer to get 1 CSV file with only the relevant columns.

**Output**: unfiltered_dataset.csv

### Libraries

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import time
from datetime import datetime 

In [None]:
# Root folder holding the raw measurement files.
base_path = Path("..") / "Raw_Data"
# Destination of the combined dataset.
output_csv = Path("unfiltered_dataset.csv")

## Locate all CSV files from 2022 to 2025

In [None]:
# Collect every raw CSV file stored under the 2022-2025 year folders,
# leaving out files whose name starts with one of the skipped prefixes.
all_files = []
skip_prefixes = ('MS711', 'WS500', 'PV')

for year in range(2022, 2026):  # 2022-2025 inclusive
    # Sorting makes the processing order (and therefore the row order of the
    # output CSV) reproducible; glob yields entries in filesystem order.
    year_files = sorted(
        f for f in base_path.glob(f"{year}/**/*.csv")
        if not f.name.startswith(skip_prefixes)
    )
    all_files.extend(year_files)
    print(f"Found {len(year_files)} files in {year}")

total_files = len(all_files)
print(f"\nTotal: {total_files} CSV files from 2022-2025 (excluding MS711, WS500, PV)")

print(f"Files to process: {total_files}")

# The header row is only written when the output file is missing or empty,
# because the processing loop appends to it.
write_header = not output_csv.exists() or output_csv.stat().st_size == 0

## Column extraction

Identified structure of raw CSV files. Some files have additional columns corresponding to diffuse irradiance.

In [None]:
# Labels for the values found on the first line of a raw file (24 entries).
base_row1_column_names = [
    # Acquisition start and first I-V / spectrum sample
    'date', 'time_start',
    'voltage_0', 'current_0', 'power_0',
    'wavelength_0', 'spectralirr_0',
    # Module readings
    'modtemp_c', 'modtemp_l', 'cell_v',
    # Irradiance at the start of the sweep
    'irr_horiz_start', 'irr_incl20_start',
    # Weather readings
    'airtemp', 'humidity_rel', 'pressure_rel', 'air_density',
    'wind_speed_kmh', 'wind_dir', 'humidity_abs', 'pressure_abs',
    'wind_speed_ms',
    # Remaining irradiance sensors at the start of the sweep
    'irr_east_start', 'irr_west_start', 'irr_floor_ref_start',
]

# Files with two extra columns append these labels to the first line.
full_row1_column_names = [
    *base_row1_column_names,
    'irr_diffuse_start',
    'irr_incl15_start',
]

# Labels for the values found on the second line of a raw file (13 entries).
base_row2_column_names = [
    'module_name', 'time_end',
    'voltage_1', 'current_1', 'power_1',
    'wavelength_1', 'spectralirr_1',
    'cell_v_end',
    'irr_horiz_end', 'irr_incl20_end',
    'irr_east_end', 'irr_west_end', 'irr_floor_ref_end',
]

# Files with two extra columns append these labels to the second line.
full_row2_column_names = [
    *base_row2_column_names,
    'irr_diffuse_end',
    'irr_incl15_end',
]

# Process file function

This function extracts all the needed data from the raw CSV files. It also computes other variables for the dataset based on the I-V curve and spectroradiometer measurements.

In [None]:
def process_file(file_path):
    """Turn one raw I-V curve tracer CSV file into a single dataset row.

    The file is read as a ';'-separated table without header ('--' is treated
    as missing). Scalar readings are taken from the first two lines, the
    timestamp is parsed from the file name, and the I-V and spectrum columns
    are reduced to summary quantities (MPP, Voc, Isc, Rs, Rsh, FF, integrated
    spectral irradiance and APE).

    Args:
        file_path: ``pathlib.Path`` of the raw CSV file. The file name is
            expected to look like ``<prefix>_<day>_<month>_<year>_<hour>_
            <minute>_<second>[ ...].csv``.

    Returns:
        A one-row ``pandas.DataFrame`` on success, or ``None`` when the file
        cannot be read, does not have 24 or 26 columns, has an unparsable
        name, or lacks enough valid I-V points. Failures are deliberately
        silent so that a batch run can simply skip bad files.
    """
    filename = file_path.name

    ## Read CSV
    opts = dict(sep=';', header=None, na_values='--', engine='python')
    try:
        df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='strict', **opts)
    except UnicodeDecodeError:
        # Fallback to latin1 if UTF-8 fails (handles the \xe3 byte)
        try:
            df = pd.read_csv(file_path, encoding='latin1', encoding_errors='replace', **opts)
        except Exception:
            return None
    except Exception:
        return None

    # Check number of columns (26 or 24 expected)
    num_cols = df.shape[1]
    if num_cols == 26:
        current_row1_names = full_row1_column_names
        current_row2_names = full_row2_column_names
    elif num_cols == 24:
        current_row1_names = base_row1_column_names
        current_row2_names = base_row2_column_names
    else:
        return None

    # Parse Row 1
    try:
        row1_list = df.iloc[0, :].dropna().tolist()
        if len(row1_list) == len(current_row1_names):
            df_row1 = pd.DataFrame([row1_list], columns=current_row1_names)
        else:
            # Fallback: some values are missing, so map the columns by position
            df_row1 = pd.DataFrame(
                [df.iloc[0, :len(current_row1_names)].tolist()],
                columns=current_row1_names,
            )
        row1 = df_row1.iloc[0]
    except Exception:
        return None

    # Parse Row 2
    try:
        row2_list = df.iloc[1, :].dropna().tolist()
        if len(row2_list) == len(current_row2_names):
            df_row2 = pd.DataFrame([row2_list], columns=current_row2_names)
        else:
            # Fallback for Row 2: Explicitly pull indices to handle the gaps
            row2_idxs = [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 21, 22, 23]
            if num_cols == 26:
                row2_idxs += [24, 25]  # Add Diffuse and GTI 15 for 26-col files
            df_row2 = pd.DataFrame([df.iloc[1, row2_idxs].tolist()], columns=current_row2_names)
        row2 = df_row2.iloc[0]
    except Exception:
        return None

    ## Generate timestamp from filename
    try:
        # Path.stem strips the extension whatever its letter case
        parts = file_path.stem.split('_')
        # Extract date/time parts
        day = int(parts[1])
        month = int(parts[2])
        year = int(parts[3])
        hour = int(parts[4])
        minute = int(parts[5])
        # Anything after a space in the last field is ignored
        second = int(parts[6].split(' ')[0])

        timestamp = datetime(year, month, day, hour, minute, second)

        module_name = row2["module_name"]
    except Exception:
        return None

    ## Spectral Integral (spectroradiometer measurements)

    # Constants
    H_PLANCK = 6.626e-34  # J*s
    C_LIGHT = 2.998e8     # m/s
    Q_ELEM = 1.602e-19    # Coulombs
    HC_OVER_Q = (H_PLANCK * C_LIGHT) / Q_ELEM * 1e9  # in units of eV*nm

    try:
        # np.trapezoid only exists from NumPy 2.0; older versions call it
        # np.trapz. Without this fallback the AttributeError would be
        # swallowed below and both spectral outputs would silently be NaN.
        trapezoid = getattr(np, "trapezoid", None) or np.trapz

        wavelength = df.iloc[:2048, 5].copy()  # Column 5 is wavelength (nm)
        spectral_irradiance = df.iloc[:2048, 6].copy()  # Column 6 is spectral irradiance (W/m2/nm)
        integral_E = trapezoid(spectral_irradiance, x=wavelength)  # Total spectral irradiance (W/m2)

        weighted_irradiance = spectral_irradiance * wavelength
        integral_E_lambda = trapezoid(weighted_irradiance, x=wavelength)

        ape_val = HC_OVER_Q * (integral_E / integral_E_lambda)  # Average Photon Energy (APE)

    except Exception:
        # Spectral data is optional: keep the row, mark the values as missing
        integral_E = np.nan
        ape_val = np.nan

    ## Irradiance measurements
    try:
        # Start irradiances (from row1)
        irr_horiz_start = row1["irr_horiz_start"]
        irr_incl20_start = row1["irr_incl20_start"]
        irr_incl15_start = row1.get("irr_incl15_start", np.nan)
        irr_east_start = row1["irr_east_start"]
        irr_west_start = row1["irr_west_start"]
        irr_floor_ref_start = row1["irr_floor_ref_start"]
        irr_diffuse_start = df.iloc[0, 24] if num_cols == 26 else np.nan

        # End irradiances (from row2)
        irr_horiz_end = row2["irr_horiz_end"]
        irr_incl20_end = row2["irr_incl20_end"]
        irr_incl15_end = row2.get("irr_incl15_end", np.nan)
        irr_east_end = row2["irr_east_end"]
        irr_west_end = row2["irr_west_end"]
        irr_floor_ref_end = row2["irr_floor_ref_end"]
        irr_diffuse_end = df.iloc[1, 24] if num_cols == 26 else np.nan

    except Exception:
        return None

    ## Module temperature measurements
    try:
        modtemp_c = row1["modtemp_c"]
        modtemp_l = row1["modtemp_l"]
    except Exception:
        return None

    ## Electrical measurements
    try:
        # Get voltage, current, power columns
        measurements = df.iloc[:, [2, 3, 4]].copy()
        measurements.columns = ['v', 'i', 'p']
        # Discard the first values (start from voltage close to 0)
        min_idx = measurements['v'].idxmin()
        # Drop rows without a voltage or current reading so that padding rows
        # are not counted as data points (assumes missing samples are NaN)
        measurements = (
            measurements.iloc[min_idx:]
            .dropna(subset=['v', 'i'])
            .reset_index(drop=True)
        )

        v, i = measurements['v'], measurements['i']
        if len(v) < 10:
            return None  # Not enough data points

    except Exception:
        return None

    try:
        # Extract key electrical parameters
        V_ini, I_ini = v.iloc[0], i.iloc[0]  # Initial current and voltage
        Voc_obs = v.max()  # Voc
        P = v * i  # Power
        Pmpp = P.max()  # Maximum Power
        mpp_idx = P.idxmax()  # Index of MPP
        Vmpp, Impp = v.loc[mpp_idx], i.loc[mpp_idx]  # Values at MPP

        # Isc & Rsh (Initial Segment: V < Voc/6)
        mask_ini = v < (Voc_obs / 6)  # Using Voc/6 to ensure we capture the linear region near Isc
        Isc, error_Isc, Rsh = np.nan, np.nan, np.nan
        if mask_ini.sum() >= 3:
            # Linear regression to find Isc (intercept) and Rsh (slope)
            vx, iy = v[mask_ini], i[mask_ini]
            A = np.vstack([vx, np.ones_like(vx)]).T
            m, b = np.linalg.lstsq(A, iy, rcond=None)[0]

            Isc = float(b)
            Rsh = 1 / abs(m) if abs(m) > 1e-9 else np.nan
            # Normalized RMSE for validation
            yhat = m * vx + b
            rmse = np.sqrt(np.mean((iy - yhat)**2))
            # RMSE as a fraction of Isc; 1.0 flags a non-positive intercept
            error_Isc = (rmse / Isc) if Isc > 0 else 1.0

        # Voc & Rs (Final Segment: V > 0.9 * Voc)
        mask_fin = v > (Voc_obs * 0.9)
        Voc_extrap, R2_Voc, Rs = np.nan, np.nan, np.nan
        if mask_fin.sum() >= 3:
            # Linear regression
            vx, iy = v[mask_fin], i[mask_fin]
            A = np.vstack([vx, np.ones_like(vx)]).T
            m, b = np.linalg.lstsq(A, iy, rcond=None)[0]
            # Voc is where I=0 -> 0 = mV + b -> V = -b/m
            Voc_extrap = -b / m if abs(m) > 1e-9 else Voc_obs
            Rs = 1 / abs(m) if abs(m) > 1e-9 else np.nan
            # R2 Calculation
            yhat = m * vx + b
            ss_res = np.sum((iy - yhat)**2)
            ss_tot = np.sum((iy - iy.mean())**2)
            R2_Voc = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0

        # Fill Factor
        # Using extrapolated values if available (NaN compares False, so the
        # observed values are used whenever a regression was not possible)
        Isc_val = Isc if Isc > 0 else i.iloc[0]
        Voc_val = Voc_extrap if Voc_extrap > 0 else Voc_obs
        FF = Pmpp / (Isc_val * Voc_val) if (Isc_val * Voc_val) > 0 else np.nan

    except Exception:
        return None

    ## Build final dataframe
    try:
        final_df = pd.DataFrame({
            # Metadata
            'filename': [filename],
            'module_name': [module_name],
            'timestamp': [timestamp],

            # MPP
            'Vmpp': [Vmpp],
            'Impp': [Impp],
            'Pmpp': [Pmpp],

            # Electrical characteristics
            'Voc': [Voc_obs],
            'Isc': [Isc],
            'R2_Voc': [R2_Voc],
            'NRMSE_Isc': [error_Isc],
            'V_ini': [V_ini],
            'I_ini': [I_ini],
            'FF': [FF],
            'Rs': [Rs],
            'Rsh': [Rsh],

            # Irradiance
            'G_spec_int': [integral_E],
            'G_tilt20_start': [irr_incl20_start],
            'G_tilt15_start': [irr_incl15_start],
            'G_horiz_start': [irr_horiz_start],
            'G_east_start': [irr_east_start],
            'G_west_start': [irr_west_start],
            'G_refl_start': [irr_floor_ref_start],
            'G_diffuse_start': [irr_diffuse_start],

            'G_horiz_end': [irr_horiz_end],
            'G_tilt20_end': [irr_incl20_end],
            'G_tilt15_end': [irr_incl15_end],
            'G_east_end': [irr_east_end],
            'G_west_end': [irr_west_end],
            'G_refl_end': [irr_floor_ref_end],
            'G_diffuse_end': [irr_diffuse_end],

            # Spectral characteristics
            'APE': [ape_val],

            # Environmental characteristics
            'module_temperature_center': [modtemp_c],
            'module_temperature_lateral': [modtemp_l],
            'air_temperature': [row1['airtemp']],
            'relative_humidity': [row1['humidity_rel']],
            'air_density': [row1['air_density']],
            'abs_pressure': [row1['pressure_abs']],
            'wind_speed_ms': [row1['wind_speed_ms']],
            'wind_direction': [row1['wind_dir']]
        })
    except Exception:
        return None

    return final_df

# Process Loop

In [None]:
# Run process_file over every discovered file and append each resulting row
# to the output CSV, keeping track of how many files were written or skipped.
processed_count = 0
skipped_due_to_error = 0
start_time = time.time()

with open(output_csv, 'a', newline='', encoding='utf-8') as output_f:
    for file_path in all_files:
        try:
            result_df = process_file(file_path)
        except Exception as e:
            # process_file handles its expected failures itself, so anything
            # that reaches this point is unexpected: report it and move on.
            skipped_due_to_error += 1
            print(f"Skipped {file_path.name}: {e}")
            continue

        if result_df is None:
            # File was rejected (unreadable, wrong layout, too few points...)
            skipped_due_to_error += 1
            continue

        result_df.to_csv(output_f, header=write_header, index=False)
        write_header = False  # Update header flag after first write
        processed_count += 1

# Final summary
elapsed_s = time.time() - start_time
print(f"Processed {processed_count} files, skipped {skipped_due_to_error} in {elapsed_s:.1f} s")
print(f"\n✅ Extraction complete. Dataset saved to {output_csv}")