As described in the preregistration of methods (@REF), the data has been collected using LabStreamLayer, allowing for recording of precisely synchronized streams. These streams include:
- MicStream (16kHz)
- WebcamFrameStream (ca. 60 Hz)
- MarkerStream (sent from custom buttonbox)
- BalanceBoardStream (500 Hz)

LSL outputs a single .xdf file that contains all the data streams. This script extracts the streams and creates raw files for further processing.

This script therefore saves each stream as a csv file (one for the whole session, and one for each trial), and additionally creates audio and video files for each trial.

Additionally, in this script, we:
- correct faulty delimitations of trials
- align 16kHz audio and additional 48kHz audio that has been recorded outside of LSL, and cut this aligned audio to trial sized files
- combine video and audio


In [3]:
#| code-fold: true
#| code-summary: Code to prepare the environment

# import packages
import os
import pyxdf
import glob
import pandas as pd
import numpy as np
import wave, struct, random
from scipy.io import wavfile
import noisereduce as nr
import cv2
import ffmpeg

# Set folders
curfolder = os.getcwd()

# All processed output lives under <cwd>\data\ ; create it on first run.
# exist_ok=True replaces the separate "does it exist?" check.
datafolder = curfolder+'\\data\\'
os.makedirs(datafolder, exist_ok=True)

# NOTE: datafolder already ends with a separator, so the sub-folder paths
# below contain a doubled backslash. The strings are kept exactly as they
# were because later cells build file names by concatenating onto them.

# Error logs
errorlogs = datafolder+'\\error_logs\\'
os.makedirs(errorlogs, exist_ok=True)

# Raw recordings, one sub-folder per session
experiment_to_process = curfolder + '\\raw\\'

# Whole-session streams (CsvDataTS_raw)
outputfolder = datafolder+'\\Data_processed\\CsvDataTS_raw\\' # outputfolder raw
os.makedirs(outputfolder, exist_ok=True)

# Trial-sized streams (Data_trials)
trialfolder = datafolder+'\\Data_processed\\Data_trials\\' # outputfolder trialbased
os.makedirs(trialfolder, exist_ok=True)

# Map all the folders in the target folder
datafolders = glob.glob(experiment_to_process+'*\\')

# Extract the folder IDs (the last path component before the trailing separator)
datafolders_id = [x.split('\\')[-2] for x in datafolders]

print(datafolders_id)

# Identify all xdf files and all the associated triallist info
xdffiles = []
trialdatas = []

for i in datafolders_id:
    # Every xdf file found in the session folder
    file = glob.glob(experiment_to_process+i+'\\*.xdf')
    # The trial list is expected at <session>\<session>_results.csv
    trialfile = experiment_to_process+i+'\\'+i+'_results'+'.csv'
    trialdatas.append(trialfile)
    xdffiles.extend(file)

# These are the xdf files we need to process
print(xdffiles[0:10])
print(trialdatas[0:10])

['53_2']
['f:\\desync_xdf\\raw\\53_2\\53_2_session.xdf']
['f:\\desync_xdf\\raw\\53_2\\53_2_results.csv']


# Extracting streams from XDF file

Using `pyxdf` package, we can extract the streams from the XDF file.

First, we extract streams for the whole session. Then, we cut these streams into trial-sized chunks, based on the marker stream that indicates the start of each trial. Moreover, we connect the trial with information from the PsychoPy log file, which includes metadata about the trial (i.e., concept, correction number, etc.)

(Note that this code takes some time to execute.)


In [4]:
#| code-fold: true
#| code-summary: Code with functions
#| eval: false

# Audio write function
def to_audio(fileloc, timeseries, samplerate = 16000, channels = 1):
    """Write a timeseries to a 16-bit PCM wav file.

    Parameters
    ----------
    fileloc : path of the wav file to create (overwritten if present).
    timeseries : iterable of samples; only the first element of every
        sample (``sample[0]``) is written, truncated to an integer.
    samplerate : frame rate stored in the wav header.
    channels : channel count stored in the wav header. Note that this does
        not change which values are written (always ``sample[0]``).

    Raises
    ------
    struct.error
        If a value does not fit in a signed 16-bit integer.
    """
    # Pack everything before touching the file: a bad sample then fails
    # early instead of leaving a half-written wav with an open handle.
    frames = b''.join(struct.pack('<h', int(sample[0])) for sample in timeseries)

    # The context manager closes the file (and finalises the header) even
    # if writing fails.
    with wave.open(fileloc, 'w') as obj:
        obj.setnchannels(channels) # mono by default
        obj.setsampwidth(2)        # 2 bytes = 16 bit, matches the '<h' packing
        obj.setframerate(float(samplerate))
        obj.writeframes(frames)

# Function to retrieve closest value
def find_closest_value_and_retrieve(df, target_number, target_column, retrieve_column):
    """Return `retrieve_column` from the row whose `target_column` is nearest to `target_number`.

    Distance is the absolute difference; when several rows are equally
    close, the first one wins (that is what ``idxmin`` returns).
    """
    # Index label of the row with the smallest absolute distance
    nearest_row = (df[target_column] - target_number).abs().idxmin()

    # Look up the requested column in that row
    return df.loc[nearest_row, retrieve_column]

def write_video(vidloc, fourcc, originalfps, frameWidth, frameHeight, capture, frames):
    """Copy the given frames of an opened video into a new video file.

    Parameters
    ----------
    vidloc : path of the video file to write.
    fourcc : codec code passed to ``cv2.VideoWriter``.
    originalfps : frame rate of the output video.
    frameWidth, frameHeight : output frame size (cast to int).
    capture : opened ``cv2.VideoCapture``; it is released before returning,
        so the caller must not reuse it afterwards.
    frames : iterable of frame indices to seek to and copy, in order.
    """
    out = cv2.VideoWriter(vidloc, fourcc, fps = originalfps, frameSize = (int(frameWidth), int(frameHeight)))
    print('Looping over frames')

    try:
        # Seek to every requested frame and copy it to the output
        for fra in frames:
            capture.set(cv2.CAP_PROP_POS_FRAMES, fra)
            ret, frame = capture.read()
            if ret:
                out.write(frame)
            else:
                print('a frame was dropped: ' + str(fra))
    finally:
        # Always free the reader and finalise the output file, even when
        # reading or writing raised part-way through
        capture.release()
        out.release()

# Alternative if frame query does not work
def write_video2(vidloc, fourcc, originalfps, frameWidth, frameHeight, capture, frame_range):
    """Copy frames of an opened video into a new file by reading it sequentially.

    Unlike ``write_video`` this never seeks: the video is read frame by
    frame from the current position until reading fails, and every frame
    whose running count is in ``frame_range`` is written.

    Parameters
    ----------
    vidloc : path of the video file to write.
    fourcc : codec code passed to ``cv2.VideoWriter``.
    originalfps : frame rate of the output video.
    frameWidth, frameHeight : output frame size (cast to int).
    capture : opened ``cv2.VideoCapture``; it is released before returning.
    frame_range : container of frame counts to keep (tested with ``in``).
    """
    out = cv2.VideoWriter(vidloc, fourcc, fps=originalfps, 
                            frameSize=(int(frameWidth), int(frameHeight)))

    try:
        frame_count = 0
        while True:
            ret, frame = capture.read()
            if not ret:
                # End of the video (or a read error): nothing more to copy
                break

            if frame_count in frame_range:
                out.write(frame)

            frame_count += 1
    finally:
        # Always free the reader and finalise the output file
        capture.release()
        out.release()

In [6]:
#| code-fold: false
#| code-summary: Code to extract streams from XDF files
#| eval: false

# Trials that could not be cut; written to the error log after every stream
errorlist = []

for dat in datafolders_id:

    print('Loading in data from participant: ' + dat)
    trialdata = pd.read_csv(experiment_to_process+dat+'\\'+dat+'_results.csv', sep=",")
    #print(trialdata)
    
    # Get the xdf file(s) of this session
    files = glob.glob(experiment_to_process+dat+'\\*.xdf')
    
    # A 'reinitiated' recording in the folder changes which block_start value
    # is used for the time conversion further down, so remember it here
    Except = any('reinitiated' in file for file in files)

    # Only the main recording is processed here: ignore files that have
    # 'reinitiated' or 'onlytpose' in the name
    files = [x for x in files if 'reinitiated' not in x and 'onlytpose' not in x]
    if not files:
        # Previously this ended in an IndexError on files[0]
        print(f"No xdf file to process for participant {dat}, skipping...")
        continue

    streams, header = pyxdf.load_xdf(files[0], 
                                     synchronize_clocks=True,
                                     dejitter_timestamps=True,     
                                     handle_clock_resets=False)   

    # We need to handle MarkerStream first, because all other streams are
    # cut into trials based on its markers
    marker_stream = None
    for stream in streams:
        if stream['info']['name'][0] == 'MyMarkerStream':
            marker_stream = stream
            break

    if marker_stream is None:
        print(f"No MyMarkerStream found for participant {dat}, skipping...")
        continue

    # Save MyMarkerStream as CSV (first column timestamps, then the marker text)
    print("Processing MyMarkerStream...")
    timevec = marker_stream['time_stamps']
    timeseries = marker_stream['time_series']
    matrix_aux = np.vstack([np.transpose(timevec), np.transpose(timeseries)])
    matrix = np.transpose(matrix_aux)
    df_marker = pd.DataFrame(matrix)
    marker_csv_path = outputfolder + dat + '_MyMarkerStream_nominal_srate0.csv'
    df_marker.to_csv(marker_csv_path, index=False)

    # Load the MyMarkerStream back from the csv that was just written
    markers = pd.read_csv(marker_csv_path)

    # If last row of markers is not Next_word or Experiment_end, check if it's Trial_start.
    # .iloc[-1, 1] addresses the cell by position; the chained form
    # .iloc[-1][1] treats the integer as a label and triggers a pandas warning.
    last_marker = markers.iloc[-1, 1]
    if last_marker != 'Next_word' and last_marker != 'Experiment_end':
        if 'Trial_start' in last_marker:
            print('Last row is buggy marker, remove...')
            markers = markers.iloc[:-1]

    # We go through each stream and save it as a csv, 
    # If it's Mic stream, also as a .wav file
    for stream in streams:
        #print(streams)

        if stream['info']['name'][0] == 'MyMarkerStream':
            continue  # Skip as it has already been processed
        
        timeseriestype = stream['info']['name'][0]
        samplerate = round(float(stream['info']['nominal_srate'][0]))
        channelcount = stream['info']['channel_count'][0]
        print('working on stream: ' + timeseriestype + '  with a channel count of ' + str(channelcount) +'\n and a sampling rate of ' + str(samplerate))

        # Whole-session csv: column 0 holds the timestamps, the rest the channels
        timevec = stream['time_stamps']
        timeseries = stream['time_series']
        matrix_aux = np.vstack([np.transpose(timevec),np.transpose(timeseries)])
        matrix = np.transpose(matrix_aux)
        df_lab = pd.DataFrame(matrix)
        df_lab.to_csv(outputfolder+dat+'_'+timeseriestype+'_nominal_srate'+str(samplerate)+'.csv',index=False)


        # For audio data also create a wav file
        if timeseriestype == 'Mic':
            # If folder Audio doesn't exist, create it
            os.makedirs(outputfolder+'Audio\\', exist_ok=True)
            wavloc = outputfolder+'Audio/'+dat+'_'+timeseriestype+'_nominal_srate'+str(samplerate)+'.wav'
            to_audio(wavloc, timeseries)
            # Load data
            rate, data = wavfile.read(wavloc)
            # Perform noise reduction
            reduced_noise = nr.reduce_noise(y=data, sr=rate, n_std_thresh_stationary=1.5,stationary=True)
            wavloc2 = outputfolder+'Audio/'+dat+'_'+timeseriestype+'_nominal_srate'+str(samplerate)+'_denoised.wav'
            wavfile.write(wavloc2, rate, reduced_noise)

        print('done with processing a complete time series and audio data')
        print('we will now start making trial snipped data') 
        
        # Cut all timeseries to trial level based on the markers
        # (the marker stream itself never gets here, it is skipped above)
        if timeseriestype != 'MyMarkerStream':
            
            beginlist = []      # trial / practice trial starts
            endlist = []        # trial / practice trial ends
            timestamps = []     # 'Experiment_start' markers
            timestamps_2 = []   # 'New block starts' markers
            tpose_starts = []
            tpose_ends = []

            # Iterate over markers and save times of trial starts and ends
            for _, marker_row in markers.iterrows():
                # Accessing positional elements explicitly with .iloc
                stamp = marker_row.iloc[0]
                label = marker_row.iloc[1]
                if 'Trial_start' in label or 'Practice trial starts' in label:
                    beginlist.append(stamp)
                if 'Trial_end' in label or 'Practice trial ends' in label:
                    endlist.append(stamp)
                if 'Experiment_start' in label:
                    timestamps.append(stamp)
                if 'New block starts' in label:
                    timestamps_2.append(stamp)
                if 'Tpose starts' in label:
                    tpose_starts.append(stamp)
                if 'Tpose ends' in label:
                    tpose_ends.append(stamp)
                    
            # Converting coefficient for lsl to psychopy time
            exp_start_pp = float(trialdata['exp_start'][0])
            if timestamps:
                # Preferred anchor: the experiment start, known on both clocks
                lsl_to_pp = timestamps[0] - exp_start_pp
            else:
                # Fall back to the first recorded block start
                if len(timestamps_2) == 3:
                    # If there is three timestamps in timestamps2, then all is fine and we can use the first one
                    block_start_pp = float(trialdata['block_start'][0])
                else:
                    # Otherwise pick one of the unique block starts from trialdata
                    unique_values = trialdata['block_start'].unique()
                    if not Except:
                        # Two block markers -> second unique value, else the third.
                        # (The original tested '== 3' twice, so the two-marker
                        # case could never be reached.)
                        block_idx = 1 if len(timestamps_2) == 2 else 2
                    elif dat != '36_2':
                        block_idx = 0
                    else:
                        # Session 36_2 is special-cased to the second unique value
                        block_idx = 1
                    block_start_pp = unique_values[block_idx]

                lsl_to_pp = timestamps_2[0] - block_start_pp

        
            # Now we can proceed to cutting   
            for i in range(len(beginlist)):
                # Prepare the range of the trial
                begin = beginlist[i]
                print(begin)
                end = endlist[i]
                indices = (df_lab.loc[:,0] >= begin) & (df_lab.loc[:,0] <= end)
                print(indices)
                subset = df_lab.loc[indices, :]
                print(subset)
                # Convert the trial start to psychopy time
                beginst_pp = begin - lsl_to_pp
                # Now find in trialdata the closest time to the beginst_pp to gather info
                # Whether it is practice or trial
                practice = find_closest_value_and_retrieve(trialdata, beginst_pp, 'trial_start', 'practice')
                if practice == 'practice':
                    trialtype = 'pr'
                else:
                    trialtype = 'trial'
                
                # Which participant it is
                cycle = find_closest_value_and_retrieve(trialdata, beginst_pp, 'trial_start', 'cycle')
                if cycle == 0:
                    participant = 'p0'
                else:
                    participant = 'p1'

                # What concept it is
                word = find_closest_value_and_retrieve(trialdata, beginst_pp, 'trial_start', 'word')
                # Modality
                modality = find_closest_value_and_retrieve(trialdata, beginst_pp, 'trial_start', 'modality')
                # Correction, if applicable
                correction_info = find_closest_value_and_retrieve(trialdata, beginst_pp, 'trial_start', 'correction')
                if correction_info == 0:
                    correction = '_c0'
                elif correction_info == 1:
                    correction = '_c1'
                elif correction_info == 2:
                    correction = '_c2'
                else:
                    correction = ''
                
                # Continue saving
                if len(subset) < 2:
                    errorlist.append(dat + " for "+ timeseriestype + " for trial " + str(i) + 'NO DATA WITHIN RANGE...')
                else:
                    # (A subset of exactly two rows used to be neither saved nor logged.)
                    # Common file name stem shared by the csv and both wav files
                    trial_stem = dat+'_'+trialtype+'_'+ str(i) +'_'+timeseriestype+'_nominal_srate'+str(samplerate)+'_'+participant+'_'+word+'_'+modality+correction
                    # Save subset to csv
                    subset.to_csv(trialfolder+trial_stem+'.csv', index=False)
                    if timeseriestype == 'Mic':
                        # If folder Audio doesn't exist, create it
                        os.makedirs(trialfolder+'Audio\\', exist_ok=True)
                        wavloc = trialfolder+'Audio/'+trial_stem+'.wav'
                        to_audio(wavloc, timeseries[indices])
                        # Also apply denoising: reuse the whole-session denoised
                        # signal and cut out the same samples
                        reduced_noiseclip = reduced_noise[indices]
                        wavloc2 = trialfolder+'Audio/'+trial_stem+'_denoised.wav'
                        wavfile.write(wavloc2, rate, reduced_noiseclip)
            
            # Get information about the tpose for camera
            if timeseriestype == 'MyWebcamFrameStream':
                for i in range(len(tpose_starts)):
                    begin = tpose_starts[i]
                    end = tpose_ends[i]
                    # Note: strict bounds here, unlike the trial cutting above
                    indices = (df_lab.loc[:,0] > begin) & (df_lab.loc[:,0] < end)
                    subset = df_lab.loc[indices, :]
                    # Save subset to csv
                    subset.to_csv(trialfolder+dat+'_'+'tpose_'+ str(i) +'_'+timeseriestype+'_nominal_srate'+str(samplerate)+'.csv', index=False)
        
        # After every stream we'll save the error log
        errors = pd.DataFrame(errorlist, columns=['file_error'])
        # Get todays date (year and month only)
        today = pd.Timestamp("today").strftime("%Y_%m")
        file_path = errorlogs+'error_log_cuttingtrails' + today + '.csv'
        errors.to_csv(file_path, index=False) 
        
print('Were done: proceed to snipping videos to triallevel')

Loading in data from participant: 53_2
Processing MyMarkerStream...
working on stream: MyWebcamFrameStream  with a channel count of 1
 and a sampling rate of 500


  if markers.iloc[-1][1] != 'Next_word' and markers.iloc[-1][1] != 'Experiment_end':
  if 'Trial_start' in markers.iloc[-1][1]:


done with processing a complete time series and audio data
we will now start making trial snipped data
8899.48028193189
0          False
1          False
2          False
3          False
4          False
           ...  
1039480    False
1039481    False
1039482    False
1039483    False
1039484    False
Name: 0, Length: 1039485, dtype: bool
                 0       1
16470  8899.480713  4657.0
16471  8899.482773  4657.0
16472  8899.484833  4657.0
16473  8899.486894  4657.0
16474  8899.488954  4657.0
...            ...     ...
18965  8904.620496  4944.0
18966  8904.622556  4944.0
18967  8904.624616  4945.0
18968  8904.626676  4945.0
18969  8904.628736  4945.0

[2500 rows x 2 columns]
8914.456248546887
0          False
1          False
2          False
3          False
4          False
           ...  
1039480    False
1039481    False
1039482    False
1039483    False
1039484    False
Name: 0, Length: 1039485, dtype: bool
                 0       1
23740  8914.457154  5393.0
23741  89

Now we have raw data for each trial in csv and wav, but we still need to write videos for each trial. We will use the trial-cut files to look up the range of frames belonging to each trial and cut that range out of the original raw video file.


In [11]:
#| code-fold: true
#| code-summary: Code to prepare folders

# Folder holding the trial-level timeseries csv files
tsfolder = f"{datafolder}\\Data_processed\\Data_trials\\"
# Folder holding the long raw video of each session
videofolder = f"{curfolder}\\raw\\"

In [12]:
#| code-fold: false
#| code-summary: Code to cut video intro trial-sized snippets
#| eval: false

# Loop through the csv's in tsfolder that have the string 'MyWebcamFrameStream' in their name
for file in os.listdir(tsfolder):

    # Only this session is processed here
    if '53_2' not in file:
        continue

    if 'MyWebcamFrameStream' not in file:
        continue

    print('Now processing file '+ file)
    # If it is a tpose file, we skip it for now
    if 'tpose' in file:
        continue

    # The name looks like this 0_1_trial_0_MyWebcamFrameStream_nominal_srate500_p0_bitter_geluiden.csv
    # Files of a reinitiated recording carry an extra '_rein_' part, which
    # shifts every later field by one position.
    reinitiated = '_rein_' in file
    shift = 1 if reinitiated else 0
    parts = file.split('_')

    dyadIndex = parts[0]   # this is dyad number
    partIndex = parts[1]   # this is part of the session
    sessionIndex = dyadIndex + '_' + partIndex # this is the session index
    trialIndex = parts[3 + shift] # this is trial number
    participant = parts[7 + shift] # this is participant 0/1
    word = parts[8 + shift] # this is the concept
    modality = parts[9 + shift].split('.')[0] # this is the modality

    # Assess the correction. The extraction step appends it as '_c0'/'_c1'/'_c2',
    # so match including the underscore instead of any 'c0' substring.
    if '_c0' in file:
        correction = '_c0'
    elif '_c1' in file:
        correction = '_c1'
    elif '_c2' in file:
        correction = '_c2'
    else:
        correction = ''

    # Assess the trial type. Match '_pr_' (as the audio/video cell does):
    # a bare 'pr' also matches any concept word that contains those letters.
    if '_pr_' in file:
        trialtype = 'pr'
    else:
        trialtype = 'trial'

    trialdata = pd.read_csv(tsfolder+file)
    #print(trialdata)

    # This is the location where the video will be saved
    infix = '_rein_' if reinitiated else '_'
    vidloc = trialfolder+sessionIndex+infix+trialtype+'_'+ str(trialIndex) +'_'+participant+'_'+word+'_'+modality+correction+'_video_raw'+'.avi'

    # Check if it exists, if yes, skip
    # if os.path.exists(vidloc):
    #     print('Video already exists, skipping...')
    #     continue

    # This is the long video the trial is cut from
    longname = '_reinitiated-video.avi' if reinitiated else '-video.avi'
    videolong = videofolder+sessionIndex+'\\'+sessionIndex+longname
    #print(videolong)

    # Column '0' holds the timestamps, column '1' the frame numbers
    begin_time = trialdata['0'].min() # begin time of the trial
    end_time = trialdata['0'].max() # end time of the trial
    # Get the begin and end frame
    begin_frame = trialdata['1'].min().astype(int)
    end_frame = trialdata['1'].max().astype(int)
    totframes = end_frame-begin_frame # total number of frames in the trial
    frames = range(begin_frame, end_frame) # get all the frames in trial
    #print(frames)

    # Load in the long video
    print('Loading the original video')
    capture = cv2.VideoCapture(videolong) 
    if not capture.isOpened():
        # Without this check an unusable output file would be written
        print('Could not open video ' + videolong + ', skipping...')
        capture.release()
        continue

    # what is the original fps of the video (frames per elapsed time in the trial)
    originalfps = round((totframes/(end_time-begin_time)),3)
    #print('original fps: '+str(originalfps))
    
    # Metadata
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    frameWidth = capture.get(cv2.CAP_PROP_FRAME_WIDTH)
    frameHeight = capture.get(cv2.CAP_PROP_FRAME_HEIGHT)

    # Start writing video (write_video also releases the capture)
    print('Starting to write the video')
    write_video(vidloc, fourcc, originalfps, frameWidth, frameHeight, capture, frames)
    print('Video is done')

print('All done!')

Now processing file 53_2_pr_0_MyWebcamFrameStream_nominal_srate500_p0_vliegtuig_gebaren_c0.csv
Loading the original video
Starting to write the video
Looping over frames
Video is done
Now processing file 53_2_pr_1_MyWebcamFrameStream_nominal_srate500_p0_vliegtuig_gebaren_c1.csv
Loading the original video
Starting to write the video
Looping over frames
Video is done
Now processing file 53_2_pr_2_MyWebcamFrameStream_nominal_srate500_p0_vliegtuig_gebaren_c2.csv
Loading the original video
Starting to write the video
Looping over frames
Video is done
Now processing file 53_2_pr_3_MyWebcamFrameStream_nominal_srate500_p0_glimlach_gebaren_c0.csv
Loading the original video
Starting to write the video
Looping over frames
Video is done
Now processing file 53_2_pr_4_MyWebcamFrameStream_nominal_srate500_p0_glimlach_gebaren_c1.csv
Loading the original video
Starting to write the video
Looping over frames
Video is done
Now processing file 53_2_pr_5_MyWebcamFrameStream_nominal_srate500_p0_glimlach_geb

# Concatenating audio and video

We do not necessarily need to have video and audio combined, but we do want to check whether they are synchronized (they should be, but it's always good to do a sanity check).

We will combine the audio and video for each trial using `ffmpeg` package, and save it as a new video file.


In [None]:
#| code-fold: false
#| code-summary: Code to prepare environment

wavloc = trialfolder+'Audio\\'
wavfiles = glob.glob(wavloc+'*denoised.wav')
audiovideo = datafolder+'Data_processed\\AudioVideo\\'

# if the folder doesn't exist, create it
os.makedirs(audiovideo, exist_ok=True)
    
videofolder = datafolder+'Data_processed\\Data_trials\\'

# Initiate error log: audio files that could not be combined with a video
errorlog2 = []

# loop over Audio files
for file in wavfiles:

    print('Now processing file '+file)

    # File name without folder and extension
    filename = file.split('\\')[-1].split('.')[0]
    parts = filename.split('_')

    dyadIndex = parts[0]   # this is dyad number
    partIndex = parts[1]   # this is part of the session
    sessionIndex = dyadIndex + '_' + partIndex # this is the session index
    print(sessionIndex)

    # Only this session is processed here
    if sessionIndex != '53_2':
        continue

    # Files of a reinitiated recording carry an extra '_rein_' part, which
    # shifts every later field by one position
    reinitiated = '_rein_' in filename
    shift = 1 if reinitiated else 0
    trialIndex = parts[3 + shift] # this is trial number
    participant = parts[7 + shift] # this is participant 0/1
    word = parts[8 + shift] # this is the word
    modality = parts[9 + shift] # this is the modality

    # handle correction
    # (checked on the file name only, so folder names cannot match by accident)
    if '_c0' in filename:
        correction = '_c0'
    elif '_c1' in filename:
        correction = '_c1'
    elif '_c2' in filename:
        correction = '_c2'
    else:
        correction = ''
    # trial type
    if '_pr_' in filename:
        trialtype = 'pr'
    else:
        trialtype = 'trial'

    # if it's corrected file or not
    if 'corrected' in filename:
        add = '_corrected'
    else:
        add = ''

    infix = '_rein_' if reinitiated else '_'
    output_path = os.path.abspath(os.path.join(audiovideo, f"{sessionIndex}{infix}{trialtype}_{trialIndex}_{participant}_{word}_{modality}{correction}_final.avi"))

    # if os.path.exists(output_path):
    #     print('Video already exists, skipping...')
    #     continue

    #load in the audio
    print('Loading the audio')
    # glob already returned the full path of the audio file
    audio_path = file
    # video with matching trialIndex and sessionIndex
    video_path = os.path.join(videofolder + f"{sessionIndex}{infix}{trialtype}_{trialIndex}_{participant}_{word}_{modality}{add}{correction}_video_raw.avi")

    # Without both inputs ffmpeg can only fail, so log the file and move on
    inputs_missing = False
    if not os.path.exists(audio_path):
        print(f"Audio file not found: {audio_path}")
        inputs_missing = True
    if not os.path.exists(video_path):
        print(f"Video file not found: {video_path}")
        inputs_missing = True
    if inputs_missing:
        errorlog2.append(file)
        continue

    # input the audio with ffmpeg
    input_audio = ffmpeg.input(audio_path)
    print(input_audio)
    print('Loading the video')
    input_video = ffmpeg.input(video_path)
    print(input_video)
    
    #combine the audio and video
    print('Combining audio and video')

    try:
        ffmpeg.concat(input_video, input_audio, v=1, a=1).output(
            output_path,
            vcodec='libx264',
            acodec='aac',
            video_bitrate='2M',         
            
            ).run(overwrite_output=True)
    except (ffmpeg.Error, OSError) as err:
        # ffmpeg.Error: ffmpeg ran but reported a failure;
        # OSError: the ffmpeg executable could not be started
        print('Error in combining audio and video')
        print(err)
        errorlog2.append(file)
        
# Write the error log
errorlog = pd.DataFrame(errorlog2)
errorlog.to_csv(audiovideo + 'errorlog2_audio_video.csv', index = False)
print('All done!')

Now processing file f:\desync_xdf\data\\Data_processed\Data_trials\Audio\53_2_pr_0_Mic_nominal_srate16000_p0_vliegtuig_gebaren_c0_denoised.wav
53_2
Loading the audio
input(filename='f:\\desync_xdf\\data\\\\Data_processed\\Data_trials\\Audio\\53_2_pr_0_Mic_nominal_srate16000_p0_vliegtuig_gebaren_c0_denoised.wav')[None] <b811b8b9acc2>
Loading the video
input(filename='f:\\desync_xdf\\data\\Data_processed\\Data_trials\\53_2_pr_0_p0_vliegtuig_gebaren_c0_video_raw.avi')[None] <99745973a6a1>
Combining audio and video
Now processing file f:\desync_xdf\data\\Data_processed\Data_trials\Audio\53_2_pr_1_Mic_nominal_srate16000_p0_vliegtuig_gebaren_c1_denoised.wav
53_2
Loading the audio
input(filename='f:\\desync_xdf\\data\\\\Data_processed\\Data_trials\\Audio\\53_2_pr_1_Mic_nominal_srate16000_p0_vliegtuig_gebaren_c1_denoised.wav')[None] <79a3e28a6795>
Loading the video
input(filename='f:\\desync_xdf\\data\\Data_processed\\Data_trials\\53_2_pr_1_p0_vliegtuig_gebaren_c1_video_raw.avi')[None] <8ee576