Created on Tue Aug 27 22:21:22 2024

@author: Santiago D'hers

Use:

- This script will use .H5 files (from DeepLabCut) to prepare the position.csv files to be analyzed

- It filters out low likelihood positions, interpolates and smoothens the data

- The positions are scaled from pixels to cm for better generalization


Requirements:

- A folder with files of extention .H5 (from DeepLabCut) containing:

    - The position of the desired bodyparts and objects on the video

In [1]:
import os
import pandas as pd
import numpy as np

import plotly.graph_objects as go

import shutil
import random

from scipy import signal

In [2]:
# State your path:
path = r'C:\Users\dhers\OneDrive - UBA\workshop'
experiment = r'cdln'

folder = os.path.join(path, experiment)
trials  = ["Hab", "TR1", "TR2", "TS"] # your filenames must contain the group name, these groups will be used to organize the files into folders 

tolerance = 'mean' # State the likelihood limit under which the coordenate will be erased. If 'mean' or 'median, the tolerance will be calculated separately for each bodypart
certainty = 3 # Between 0 and 5, say how many std_dev away from the mean the points should be erased (it is similar to asking "how good is your tracking?")
drop = 0.5 # State the drop in likelihood for the points to be erased from the dataframe

bodypart = 'nose' # State which bodypart you'd like to plot as an example
objects = ['obj_1', 'obj_2'] # Name the stationary objects that may appear on your data

ear_dist = 1.8 # State the distance between the ears
video_fps = 25 # State the frames per second

In [3]:
h5_files = [file for file in os.listdir(folder) if file.endswith('position.h5') and 'TS' in file] 

if not h5_files:
    print("No files found")

else:
    # Choose one file at random to use as example
    example = random.choice(h5_files)
    example_path = os.path.join(folder, example)
    print(f"Plotting coordinates from {example}")

No files found


In [5]:
df = pd.read_hdf(example_path)
scorer = df.columns.levels[0][0]
bodyparts = df.columns.levels[1].to_list()
df = df[scorer]

print(f"Positions obtained by model: {scorer}")
print(f"Points in df: {bodyparts}")

Positions obtained by model: DLC_Resnet50_STORM_2Oct14shuffle2_snapshot_900
Points in df: ['L_ear', 'R_ear', 'body', 'head', 'neck', 'nose', 'obj_1', 'obj_2', 'tail_1', 'tail_2', 'tail_3']


In [6]:
df_raw = pd.DataFrame()

for key in df.keys():
    df_raw[str(key[0]) + "_" + str(key[1])] = df[key]

for point in bodyparts:
    median = df_raw[f'{point}_likelihood'].median()
    mean = df_raw[f'{point}_likelihood'].mean()
    std_dev = df_raw[f'{point}_likelihood'].std()
    print(f'{point} \t median: {median:.2f} \t mean: {mean:.2f} \t std_dev: {std_dev:.2f} \t tolerance: {mean - certainty*std_dev:.2f}')
    

L_ear 	 median: 0.86 	 mean: 0.83 	 std_dev: 0.13 	 tolerance: 0.44
R_ear 	 median: 0.85 	 mean: 0.82 	 std_dev: 0.12 	 tolerance: 0.47
body 	 median: 0.67 	 mean: 0.65 	 std_dev: 0.13 	 tolerance: 0.26
head 	 median: 0.89 	 mean: 0.85 	 std_dev: 0.13 	 tolerance: 0.44
neck 	 median: 0.89 	 mean: 0.86 	 std_dev: 0.12 	 tolerance: 0.51
nose 	 median: 0.91 	 mean: 0.82 	 std_dev: 0.21 	 tolerance: 0.19
obj_1 	 median: 0.05 	 mean: 0.09 	 std_dev: 0.10 	 tolerance: -0.22
obj_2 	 median: 0.05 	 mean: 0.06 	 std_dev: 0.06 	 tolerance: -0.10
tail_1 	 median: 0.92 	 mean: 0.90 	 std_dev: 0.10 	 tolerance: 0.60
tail_2 	 median: 0.64 	 mean: 0.63 	 std_dev: 0.11 	 tolerance: 0.29
tail_3 	 median: 0.95 	 mean: 0.91 	 std_dev: 0.15 	 tolerance: 0.45


In [7]:
def filter_and_smooth_df(data, points, stat_points = [], llhd_lim = 'mean', num_std = 2, drop_below = 0.5):

    df = data.copy()

    # Try different filtering parameters
    med_filt_window = 3
    sigma, n_sigmas = 0.6, 2
    N = int(2 * n_sigmas * sigma + 1)

    # Gaussian kernel
    gauss_kernel = signal.windows.gaussian(N, sigma)
    gauss_kernel = gauss_kernel / sum(gauss_kernel)

    pad_width = (len(gauss_kernel) - 1) // 2

    for point in points:

        median = df[f'{point}_likelihood'].median()
        mean = df[f'{point}_likelihood'].mean()
        std_dev = df[f'{point}_likelihood'].std()
            
        if llhd_lim == 'mean':
            limit = mean - num_std*std_dev
        elif llhd_lim == 'median':
            limit = median - num_std*std_dev
        else:
            limit = llhd_lim

        # Set x and y coordinates to NaN where the likelihood is below the tolerance limit
        df.loc[df[f'{point}_likelihood'] < limit, [f'{point}_x', f'{point}_y']] = np.nan
        
        for axis in ['x','y']:
            column = f'{point}_{axis}'

            # Interpolate using the pchip method
            df[column] = df[column].interpolate(method='pchip', limit_area='inside')
            
            # Forward fill the remaining NaN values
            df[column] = df[column].ffill()
            
            # Apply median filter
            df[column] = signal.medfilt(df[column], kernel_size = med_filt_window)
            
            # Pad the median filtered data to mitigate edge effects
            padded = np.pad(df[column], pad_width, mode='edge')
            
            # Apply convolution
            smooth = signal.convolve(padded, gauss_kernel, mode='valid')
            
            # Trim the padded edges to restore original length
            df[column] = smooth[:len(df[column])]

            for obj in stat_points:
                if obj in column:
                    df[column] = df[column].median()

        # If the likelihood of an object is too low, probably the object is not there. Lets drop those columns
        if median < drop_below:
            df.drop([f'{point}_x', f'{point}_y', f'{point}_likelihood'], axis=1, inplace=True)
        
    return df

In [8]:
df_smooth = filter_and_smooth_df(df_raw, bodyparts, objects, llhd_lim = tolerance, num_std = certainty, drop_below = drop)

In [9]:
# Create figure
fig = go.Figure()

# Add traces for raw data
for column in df_raw.columns:
    if bodypart in column:
        if 'likelihood' not in column:
            fig.add_trace(go.Scatter(x=df_raw.index, y=df_raw[column], mode='markers', name=f'raw {column}', marker=dict(symbol='x', size=6)))
        elif '_y' not in column:
            fig.add_trace(go.Scatter(x=df_raw.index, y=df_raw[column], name=f'{column}', line=dict(color='black', width=3), yaxis='y2',opacity=0.5))

# Add traces for smoothed data
for column in df_smooth.columns:
    if bodypart in column:
        if 'likelihood' not in column:
            fig.add_trace(go.Scatter(x=df_smooth.index, y=df_smooth[column], name=f'new {column}', line=dict(width=3)))

median = df_raw[f'{bodypart}_likelihood'].median()
mean = df_raw[f'{bodypart}_likelihood'].mean()
std_dev = df_raw[f'{bodypart}_likelihood'].std()
    
if tolerance == 'mean':
    limit = mean - certainty*std_dev
elif tolerance == 'median':
    limit = median - certainty*std_dev
else:
    limit = tolerance

# Update layout for secondary y-axis
fig.update_layout(
    xaxis=dict(title='Video frame'),
    yaxis=dict(title=f'{bodypart} position (pixels)'),
    yaxis2=dict(title=f'{bodypart} likelihood', 
                overlaying='y', 
                side='right',
                gridcolor='black'),
    title=f'{bodypart} position & likelihood',
    width=900,
    height=300,
    legend=dict(yanchor="bottom",
                y=1.03,
                xanchor="center",
                x=0.7,
                orientation="h"),
    shapes=[dict(type='line', 
                 x0=df_raw.index.min(), 
                 x1=df_raw.index.max(), 
                 y0=limit, 
                 y1=limit, 
                 line=dict(color='orange', dash='solid'),
                 yref='y2')],
    margin=dict(t=50, b=0, l=0, r=0),  # Adjust margins for better layout
    showlegend=True,
    # font=dict(size=15)
)

# Show plot
fig.show()

In [10]:
# Plot the distance between ears
df_smooth.dropna(inplace=True)

# Calculate the distance between ears
dist = np.sqrt(
    (df_smooth['L_ear_x'] - df_smooth['R_ear_x'])**2 + 
    (df_smooth['L_ear_y'] - df_smooth['R_ear_y'])**2)

dist.dropna(inplace=True)

# Calculate the mean and median
mean_dist = np.mean(dist)
median_dist = np.median(dist)

scale = (1.8 / median_dist)

print(f'median distance is {median_dist}, mean distance is {mean_dist}. scale is {scale*100}')

# Create the plot
fig = go.Figure()

# Add the distance trace
fig.add_trace(go.Scatter(y=dist, mode='lines', name='Distance between ears'))

# Add mean and median lines
fig.add_trace(go.Scatter(y=[mean_dist]*len(dist), mode='lines', name=f'Mean: {mean_dist:.2f}', line=dict(color='red', dash='dash')))
fig.add_trace(go.Scatter(y=[median_dist]*len(dist), mode='lines', name=f'Median: {median_dist:.2f}', line=dict(color='black')))

# Update layout
fig.update_layout(
    title='Distance between ears',
    xaxis_title='Frame',
    yaxis_title='Distance (pixels)',
    width=720,
    height=480,
    legend=dict(
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5,
        orientation="h")
)

# Show the plot
fig.show()

median distance is 31.093475064480344, mean distance is 30.409718788197985. scale is 5.788995910773034


In [19]:
def process_hdf5_file(path, stat_points = [], distance = 1.8, fps = 30, llhd_lim = 'mean', num_std = 2, drop_below = 0.5):
    
    # List all files in the folder
    h5_files = [file for file in os.listdir(path) if file.endswith('.h5')]
    
    for h5_file in h5_files:
        
        h5_file_path = os.path.join(path, h5_file)

        df = pd.read_hdf(h5_file_path)
        scorer = df.columns.levels[0][0]
        bodyparts = df.columns.levels[1].to_list()
        df = df[scorer]

        df_raw = pd.DataFrame()

        for key in df.keys():
            df_raw[str(key[0]) + "_" + str(key[1])] = df[key]

        df_smooth = filter_and_smooth_df(df_raw, bodyparts, stat_points, llhd_lim, num_std, drop_below)

        # Drop the likelihood columns
        df_smooth = df_smooth.drop(columns=df_smooth.filter(like='likelihood').columns)

        # Drop the frames when the mouse is not in the video
        df_smooth.dropna(inplace=True)
        
        # Calculate the mean distance between ears
        dist = np.sqrt(
            (df_smooth['L_ear_x'] - df_smooth['R_ear_x'])**2 + 
            (df_smooth['L_ear_y'] - df_smooth['R_ear_y'])**2)
        
        dist.dropna(inplace=True)
        median_dist = dist.median()

        # As the distance between ears is a constant that can be measured in real life, we can use it to scale different sized videos into the same size.
        scale = (distance / median_dist)
        df_smooth = df_smooth * scale            
        
        # Determine the output file path in the same directory as the input file
        # Split the path and filename
        input_dir, input_filename = os.path.split(h5_file_path)
        
        # Remove the original extension
        filename_without_extension = os.path.splitext(input_filename)[0]
        
        # Add the new extension '.csv'
        output_csv_path = os.path.join(input_dir, filename_without_extension + '.csv')
    
        # Save the processed data as a CSV file
        df_smooth.to_csv(output_csv_path, index=False)
        
        # Calculate the moment when the mouse enters the video
        mouse_enters = (len(df_raw) - len(df_smooth)) / fps

        print(f"{input_filename} has {df_smooth.shape[1]} columns. The mouse took {mouse_enters:.2f} sec to enter. scale is {scale*100:.2f}.")

In [20]:
process_hdf5_file(folder, objects, distance = ear_dist, fps = video_fps, llhd_lim = tolerance, num_std = certainty, drop_below = drop)

example_cdln.h5 has 22 columns. The mouse took 0.00 sec to enter. scale is 5.07.


In [12]:
def filter_and_move_files(folder, subfolders):
    """
    This function moves all files that have a word on its name to a subfolder.
    """
    for subfolder in subfolders:
        # Create a new subfolder
        output_folder = os.path.join(folder, subfolder, "position")
        os.makedirs(output_folder, exist_ok=True)

        # Get a list of all files in the input folder
        files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

        # Iterate through files, move those without the word "position" to the "extra" subfolder
        for file in files:
            if subfolder in file and ".csv" in file and "filtered" not in file:
                file_path = os.path.join(folder, file)
                output_path = os.path.join(output_folder, file)

                # Move the file to the "extra" subfolder
                shutil.move(file_path, output_path)

    print("Files filtered and moved successfully.")

In [13]:
filter_and_move_files(folder, trials)

Files filtered and moved successfully.


In [14]:
def clean_folder(folder):
    """
    cleans all other files in the folder into a subfolder
    """
    subfolder = os.path.join(folder, "h5 files & others")
    os.makedirs(subfolder, exist_ok=True)
        
    # Get a list of all files in the input folder
    other_files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

    # Iterate through files, move those without the word "position" to the "extra" subfolder
    for file in other_files:
        file_path = os.path.join(folder, file)
        output_path = os.path.join(subfolder, file)
        
        # Move the file to the "extra" subfolder
        shutil.move(file_path, output_path)

    print("All .H5 files are stored away")

In [15]:
clean_folder(folder)

All .H5 files are stored away
