# Dataframes Creation

This notebook does the following: 
- Reads all the dvrk_pedals.bag, and dvrk_kinematic.bag files.
- Preprocess them (plese refer to our paper for more details).
- Merges them in a single Dataframe and do a small analysis of it.

## Files preparation
To use this notebook please create a folder called bag_folder and copy inside of it each of the dvrk_pedals.bag, and dvrk_kinematic.bag. It will then be necessary to rename them. Do it as in following example:

- Original folder: G_2.
- Original .bag files names: dvrk_pedals.bag, dvrk_kinematics.bag.
- Renamed .bag files: dvrk_pedals_G2.bag, dvrk_kinematics_G2.bag.
- Then rename and copy inside the bag_folder the rest of the .bag files for all the procedures.

Finally create another folder called df_surgeons.



In [1]:
import pandas as pd
import bagpy
from bagpy import bagreader
import numpy as np
import os

In [2]:
def sync_df(data,df1,df2):
    
    true_indices = df1.index[df1[data] == True]
    df2[data] = 0
    
    true_time=[]
    true_data_index=[]
    eps=0.006

    for i in true_indices:
        time=df1['Time'][i]
        true_time.append(time)
        true_data_index.append(df2.index[abs(df2['Time']-time) < eps])
    
    idx=[]
    
    for i in true_data_index: 
        try:   
                df2.at[i[0],data]=1
                idx.append(i[0])
        except:
                pass

    df2.loc[idx, [data]]=1
    return df2

In [3]:
def processing(bag_kinematic,bag_pedals,df_name):

    print("processing file: ", bag_kinematic)
    b = bagreader(bag_kinematic)
    print(b.topic_table)
    
    csvfiles = []
    
    for t in b.topics:
        data = b.message_by_topic(t)
        csvfiles.append(data)
        
    print(csvfiles[0])
    
    #DATA LOADING
    data = pd.read_csv(csvfiles[12])
    data_PSM2 = pd.read_csv(csvfiles[16])
    data_ECM=pd.read_csv(csvfiles[1])
    data_MTML=pd.read_csv(csvfiles[5])
    data_MTMR=pd.read_csv(csvfiles[9])

    
    print("renaming columns")
    #Renaming some columns for more clarity
    print("psm1")
    data.rename(columns={"transform.translation.x": "PSM1_transform.translation.x"}, inplace=True)
    data.rename(columns={"transform.translation.y": "PSM1_transform.translation.y"}, inplace=True)
    data.rename(columns={"transform.translation.z": "PSM1_transform.translation.z"}, inplace=True)
    data.rename(columns={"transform.rotation.x": "PSM1_transform.rotation.x"}, inplace=True)
    data.rename(columns={"transform.rotation.y": "PSM1_transform.rotation.y"}, inplace=True)
    data.rename(columns={"transform.rotation.z": "PSM1_transform.rotation.z"}, inplace=True)
    data.rename(columns={"transform.rotation.w": "PSM1_transform.rotation.w"}, inplace=True)
    
    data=data.drop(columns=['header.seq','header.stamp.secs','header.stamp.nsecs','header.frame_id','child_frame_id'])

    

    
    print("psm2")
    data_PSM2.rename(columns={"transform.translation.x": "PSM2_transform.translation.x"}, inplace=True)
    data_PSM2.rename(columns={"transform.translation.y": "PSM2_transform.translation.y"}, inplace=True)
    data_PSM2.rename(columns={"transform.translation.z": "PSM2_transform.translation.z"}, inplace=True)
    data_PSM2.rename(columns={"transform.rotation.x": "PSM2_transform.rotation.x"}, inplace=True)
    data_PSM2.rename(columns={"transform.rotation.y": "PSM2_transform.rotation.y"}, inplace=True)
    data_PSM2.rename(columns={"transform.rotation.z": "PSM2_transform.rotation.z"}, inplace=True)
    data_PSM2.rename(columns={"transform.rotation.w": "PSM2_transform.rotation.w"}, inplace=True)
    
    data_PSM2=data_PSM2.drop(columns=['Time','header.seq','header.stamp.secs','header.stamp.nsecs','header.frame_id','child_frame_id'])

    print("mtml")
    data_MTML.rename(columns={"transform.translation.x": "MTML_transform.translation.x"}, inplace=True)
    data_MTML.rename(columns={"transform.translation.y": "MTML_transform.translation.y"}, inplace=True)
    data_MTML.rename(columns={"transform.translation.z": "MTML_transform.translation.z"}, inplace=True)
    data_MTML.rename(columns={"transform.rotation.x": "MTML_transform.rotation.x"}, inplace=True)
    data_MTML.rename(columns={"transform.rotation.y": "MTML_transform.rotation.y"}, inplace=True)
    data_MTML.rename(columns={"transform.rotation.z": "MTML_transform.rotation.z"}, inplace=True)
    data_MTML.rename(columns={"transform.rotation.w": "MTML_transform.rotation.w"}, inplace=True)
    
    data_MTML=data_MTML.drop(columns=['Time','header.seq','header.stamp.secs','header.stamp.nsecs','header.frame_id','child_frame_id'])

    print("mtmr")
    data_MTMR.rename(columns={"transform.translation.x": "MTMR_transform.translation.x"}, inplace=True)
    data_MTMR.rename(columns={"transform.translation.y": "MTMR_transform.translation.y"}, inplace=True)
    data_MTMR.rename(columns={"transform.translation.z": "MTMR_transform.translation.z"}, inplace=True)
    data_MTMR.rename(columns={"transform.rotation.x": "MTMR_transform.rotation.x"}, inplace=True)
    data_MTMR.rename(columns={"transform.rotation.y": "MTMR_transform.rotation.y"}, inplace=True)
    data_MTMR.rename(columns={"transform.rotation.z": "MTMR_transform.rotation.z"}, inplace=True)
    data_MTMR.rename(columns={"transform.rotation.w": "MTMR_transform.rotation.w"}, inplace=True)
    
    data_MTMR=data_MTMR.drop(columns=['Time','header.seq','header.stamp.secs','header.stamp.nsecs','header.frame_id','child_frame_id'])

    print("ecm")
    data_ECM.rename(columns={"transform.translation.x": "ECM_transform.translation.x"}, inplace=True)
    data_ECM.rename(columns={"transform.translation.y": "ECM_transform.translation.y"}, inplace=True)
    data_ECM.rename(columns={"transform.translation.z": "ECM_transform.translation.z"}, inplace=True)
    data_ECM.rename(columns={"transform.rotation.x": "ECM_transform.rotation.x"}, inplace=True)
    data_ECM.rename(columns={"transform.rotation.y": "ECM_transform.rotation.y"}, inplace=True)
    data_ECM.rename(columns={"transform.rotation.z": "ECM_transform.rotation.z"}, inplace=True)
    data_ECM.rename(columns={"transform.rotation.w": "ECM_transform.rotation.w"}, inplace=True)
    
    data_ECM=data_ECM.drop(columns=['Time','header.seq','header.stamp.secs','header.stamp.nsecs','header.frame_id','child_frame_id'])
    

    frames = [data, data_PSM2,data_ECM,data_MTML,data_MTMR]
    data_kinematic=pd.concat(frames,axis=1)
    

    
    # exctracting and sync. the pedals .bag folder
    print("processing pedals: ",bag_pedals)
    try:
        bag_pedals = bagreader(bag_pedals)
        
        print(bag_pedals.topic_table)
        
        topic1 = '/pedals/camera'
        topic2 = '/pedals/clutch'
        topic4_1 = '/pedals/read/monopolar'
        topic4_2 = '/pedals/monopolar'
    
        csvfiles = []
    
        data_camera = bag_pedals.message_by_topic(topic1)
        data_pedal = bag_pedals.message_by_topic(topic2)
        data_monopolar = bag_pedals.message_by_topic(topic4_1)
    
        if data_monopolar is None:
            # If data_monopolar is not found, try the alternative topic
            data_monopolar = bag_pedals.message_by_topic(topic4_2)
        
        dataframe_camera = pd.read_csv(data_camera)
        dataframe_pedal= pd.read_csv(data_pedal)
        dataframe_monopolar=pd.read_csv(data_monopolar)
        #dataframe_bipolar=pd.read_csv(data_bipolar)
        
        dataframe_camera.rename(columns={"data": "data_camera"}, inplace=True)
        dataframe_pedal.rename(columns={"data": "data_clutch"}, inplace=True)
        dataframe_pedal=dataframe_pedal.drop(columns=['Time','header.seq','header.stamp.secs','header.stamp.nsecs','header.frame_id'])
        dataframe_monopolar.rename(columns={"data": "data_monopolar"}, inplace=True)
        dataframe_monopolar=dataframe_monopolar.drop(columns=['Time','header.seq','header.stamp.secs','header.stamp.nsecs','header.frame_id'])
        
        
        frames = [dataframe_camera, dataframe_pedal,dataframe_monopolar]#,dataframe_bipolar]
        pedals_dataframe=pd.concat(frames,axis=1)
        
        data=sync_df('data_clutch',pedals_dataframe,data_kinematic)
        data=sync_df('data_camera',pedals_dataframe,data_kinematic)
        data=sync_df('data_monopolar',pedals_dataframe,data_kinematic)
        
        first_value = data['Time'].iloc[0]
        data['Time'] = data['Time'] - first_value 
    except: 
        pass
    
    # VISUALIZING ALL COLUMNS AND SAVING THE FILE TO CSV
    
    for col in data.columns:
        print(col)
    print("dataset created with the name: ", df_name)
    data.to_csv(df_name, index=False)  
    


In [4]:
def extract_last_word_after_underscore(file_name):
    # Split the filename by underscore (_) and get the last part
    parts = file_name.split('_')
    return parts[-1] if len(parts) >= 2 else None

def list_words_after_last_underscore_in_folder(folder_path):
    # Check if the path is a directory
    if os.path.isdir(folder_path):
        # Get a list of all files in the directory
        files = os.listdir(folder_path)

        # Extract the word after the last underscore from each filename
        words_after_last_underscore = [extract_last_word_after_underscore(file) for file in files]

        # Filter out None values, duplicates, and those not ending with ".bag"
        words_after_last_underscore = set(word for word in words_after_last_underscore if word and word.endswith(".bag"))
        return list(words_after_last_underscore)
    else:
        print(f"{folder_path} is not a valid directory.")

# Replace 'your_folder_path' with the path to the folder you want to list
folder_path = ''
words_list = list_words_after_last_underscore_in_folder(folder_path)

if words_list:
    print("List of Valid names found:")
    for word in words_list:
        print(word)
else:
    print("No valid words found after the last underscore ending with '.bag' in the specified folder.")


List of unique words after the last underscore ending with '.bag' in the filenames:
AD2.bag
VL2.bag
MDP1.bag
MDP3.bag
LA2.bag
GP2.bag
LA3.bag
GP1.bag
AM3.bag
AM2.bag
VL3.bag
AD1.bag
FT3.bag
GP3.bag
MDP2.bag
VL1.bag
LA1.bag
AD3.bag
FT2.bag
FT1.bag
AM1.bag


In [5]:
def generate_strings(element):

    if element.endswith(".bag"):
        
        # Generate the three strings, specify the path to the bag folder for string1 and 2, and for string3 specify the folder where you
        # want to save the dataframes
        
        string1 = f".../bag_folder/dvrk_pedals_{element}"
        string2 = f".../bag_folder/dvrk_kinematics_{element}"
        string3 = f".../df_{element}"
        string3=string3.replace(".bag", ".csv")

        return string1, string2, string3
        
    else:
        print(f"Invalid element: {element}")

words_list = list_words_after_last_underscore_in_folder(folder_path)

if words_list:
    for word in words_list:
        strings = generate_strings(word)
        processing(strings[1],strings[0],strings[2])
        print(strings[0])
else:
    print("No valid words found after the last underscore ending with '.bag' in the specified folder.")


processing file:  /home/phd-leonardo-sitl/Desktop/PhdLeo/AutoClutch/bag_folder/dvrk_kinematics_AD2.bag
[INFO]  Data folder /home/phd-leonardo-sitl/Desktop/PhdLeo/AutoClutch/bag_folder/dvrk_kinematics_AD2 already exists. Not creating.
                            Topics                           Types  \
0    /ECM/custom/local/setpoint_cp  geometry_msgs/TransformStamped   
1          /ECM/custom/setpoint_cp  geometry_msgs/TransformStamped   
2                 /ECM/measured_js          sensor_msgs/JointState   
3        /MTML/gripper/measured_js          sensor_msgs/JointState   
4          /MTML/local/measured_cp  geometry_msgs/TransformStamped   
5                /MTML/measured_cp  geometry_msgs/TransformStamped   
6                /MTML/measured_js          sensor_msgs/JointState   
7        /MTMR/gripper/measured_js          sensor_msgs/JointState   
8          /MTMR/local/measured_cp  geometry_msgs/TransformStamped   
9                /MTMR/measured_cp  geometry_msgs/TransformStamped

# Merging and Dataset Analysis

In [None]:
def dataset_analysis(large_dataset):
    print(large_dataset.columns)
    print("First few rows of the dataset:")
    print(large_dataset.head())
    print("Number of rows and columns:", large_dataset.shape)
    print("Memory usage per column:")
    print(large_dataset.memory_usage(deep=True))
    print("Total memory usage of the DataFrame:", large_dataset.memory_usage(deep=True).sum() / (1024 * 1024), "MB")

    print("\nSummary statistics of numerical columns:")
    print(large_dataset.describe())
    print("\nInformation about the dataset:")
    print(large_dataset.info())
    print("\nMissing values in the dataset:")
    print(large_dataset.isnull().sum())
    print("\nUnique values in categorical columns:")

    
    for column in large_dataset.select_dtypes(include='object').columns:
        print(f"{column}: {large_dataset[column].nunique()} unique values")
    
    correlation_matrix = large_dataset.corr()
    print("\nCorrelation matrix:")
    print(correlation_matrix)
    
    
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', fmt=".2f")
    plt.title("Correlation Matrix")
    plt.show()


In [None]:
# Specify the folder path where the datasets are saved
folder_path = ".../df_surgeons"


file_names = glob.glob(folder_path + '/*.csv')

# Print the file names to check if they are being found
print("File Names:", file_names)
dfs = []
for file_name in file_names:
    df = pd.read_csv(file_name, delimiter=',') 
    
    if df.shape[1==39]:
        dfs.append(df)    
        print(df.shape)



# Concatenate
large_dataset = pd.concat(dfs, ignore_index=True)

# Specify the folder path where you want to save the final dataset
large_dataset.to_csv('.../df_surgeons/df_total.csv', index=False)