Created on Tue Aug 27 22:21:22 2024

@author: Santiago D'hers

Use:

- This script will use .H5 files to prepare the .csv files with the positions to be analyzed

- The positions are scaled from pixels to cm for better generalization


Requirements:

- A folder with files of extension .H5 (from DeepLabCut)

- H5 files must have the position of the desired bodyparts (and objects)

In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.graph_objects as go

import shutil
import random

from scipy import signal
from collections import OrderedDict

In [5]:
# State your path:
path = r'C:\Users\dhers\OneDrive - UBA\workshop'
experiment = r'b2-mix'

# Folder that is searched for the '*position.h5' files and where the outputs are written
folder = os.path.join(path, experiment)

# Group keywords: each one is used both as the filename keyword and as the
# sub-folder name when the resulting CSV files are sorted
groups  = ["TS"]

# Likelihood limit under which a coordinate is erased.
# 'mean' / 'median': limit = mean / median of the likelihood minus num_std standard deviations.
# Any number: that value is used directly as the limit.
llhd_tol = 'mean'
# Between 0 and 5: how many standard deviations below the mean (or median) the limit is placed
# (it is similar to asking "how good is your tracking?")
num_std = 2
# Size of the median filter window (it has to be an odd number)
smooth_window = 3

# Bodypart whose coordinates and likelihood are plotted for the example file
bodypart = 'nose'
# Names of the stationary objects that may appear in your data
stationary_points = ['obj_1', 'obj_2']

# Real-life distance between the ears, used to rescale the positions
# (presumably in cm, the unit the output is scaled to; confirm)
ear_dist = 1.8
# Frames per second of the videos
video_fps = 25

In [6]:
# Collect the position files that belong to any of the configured groups
# (previously the keyword 'TS' was hard-coded here, ignoring `groups`)
h5_files = [
    file for file in os.listdir(folder)
    if file.endswith('position.h5') and any(group in file for group in groups)
]

if not h5_files:
    # NOTE: `example_path` stays undefined in this case, so the cells that read it cannot run
    print("No files found")

else:
    # Choose one file at random to use as example
    example = random.choice(h5_files)
    example_path = os.path.join(folder, example)
    print(f"Plotting coordinates from {example}")

Plotting coordinates from 2024-08_TORM-Tg-6m_TS_C2_B_R_position.h5


In [7]:
df = pd.read_hdf(example_path)

# Level 0 of the column MultiIndex names the scorer, level 1 lists the tracked points
scorer, bodyparts = df.columns.levels[0][0], df.columns.levels[1].to_list()

# Keep only the columns that belong to that scorer
df = df[scorer]

print(
    f"Positions obtained by model: {scorer}",
    f"Points in df: {bodyparts}",
    sep="\n",
)

Positions obtained by model: DLC_resnet50_VaderDec1shuffle1_200000
Points in df: ['L_ear', 'R_ear', 'body', 'head', 'neck', 'nose', 'obj_1', 'obj_2', 'tail_1', 'tail_2', 'tail_3']


In [8]:
# Flatten the two-level column labels into single "<first>_<second>" names
df_raw = pd.DataFrame({f"{key[0]}_{key[1]}": df[key] for key in df.keys()})

In [16]:
def filter_and_smooth_df(data, points, stat_points = [], tolerance = 'mean', stride = 2, median_filt = 3):
    """
    Erase low-likelihood coordinates, fill the gaps and smooth every tracked point.

    Parameters
    ----------
    data : pandas.DataFrame
        Must hold '<point>_x', '<point>_y' and '<point>_likelihood' columns
        for every name in `points`. It is copied, never modified.
    points : iterable of str
        Names of the points to process.
    stat_points : iterable of str
        Names of stationary objects. Any processed coordinate column whose
        name contains one of them is replaced by its own median.
    tolerance : 'mean', 'median' or number
        'mean' / 'median' place the likelihood limit `stride` standard
        deviations below that statistic; any other value is used as the limit.
    stride : number
        Standard deviations subtracted when `tolerance` is 'mean' or 'median'.
    median_filt : int
        Kernel size handed to `scipy.signal.medfilt`.

    Returns
    -------
    pandas.DataFrame
        Copy of `data` with processed x/y columns; likelihood columns are untouched.
    """
    result = data.copy()

    # Gaussian kernel with sigma = 1, spanning 2 sigmas per side, normalised to unit sum
    sigma, n_sigmas = 1, 2
    kernel_len = int(2 * n_sigmas * sigma + 1)
    gauss = signal.windows.gaussian(kernel_len, sigma)
    gauss = gauss / sum(gauss)

    # Samples added on each side so the 'valid' convolution returns the original length
    half_width = (len(gauss) - 1) // 2

    for point in points:
        likelihood = result[f'{point}_likelihood']
        spread = likelihood.std()

        # Resolve the likelihood limit for this point
        if tolerance == 'mean':
            threshold = likelihood.mean() - stride*spread
        elif tolerance == 'median':
            threshold = likelihood.median() - stride*spread
        else:
            threshold = tolerance

        # Blank out both coordinates wherever the likelihood falls below the limit
        result.loc[likelihood < threshold, [f'{point}_x', f'{point}_y']] = np.nan

        for column in (f'{point}_x', f'{point}_y'):
            # Fill only the gaps enclosed by valid values (pchip interpolation)
            filled = result[column].interpolate(method='pchip', limit_area='inside')

            # Median filter first, then Gaussian smoothing on an edge-padded copy
            despiked = signal.medfilt(filled, kernel_size=median_filt)
            padded = np.pad(despiked, half_width, mode='edge')
            smoothed = signal.convolve(padded, gauss, mode='valid')

            # Trim to the original number of rows
            result[column] = smoothed[:len(result)]

            # A stationary object is pinned to the median of its smoothed coordinate
            if any(obj in column for obj in stat_points):
                result[column] = result[column].median()

        # NOTE: dropping the columns of a point whose likelihood is too low
        # (mean - stride*std_dev < 0, i.e. the object is probably absent) is currently disabled.

    return result

In [17]:
# Filter and smooth the example file with the settings chosen above
df_smooth = filter_and_smooth_df(
    data=df_raw,
    points=bodyparts,
    stat_points=stationary_points,
    tolerance=llhd_tol,
    stride=num_std,
    median_filt=smooth_window,
)

In [18]:
# Create figure
fig = go.Figure()

# Add traces for raw data: coordinates as markers, likelihood as a faint line on the secondary axis
for column in df_raw.columns:
    if bodypart in column:
        if 'likelihood' not in column:
            fig.add_trace(go.Scatter(x=df_raw.index, y=df_raw[column], mode='markers', name=f'raw {column}', marker=dict(symbol='circle', size=5)))
        else:
            fig.add_trace(go.Scatter(x=df_raw.index, y=df_raw[column], name=f'{column}', line=dict(color='black'), yaxis='y2',opacity=0.3))

# Add traces for smoothed data
for column in df_smooth.columns:
    if bodypart in column:
        if 'likelihood' not in column:
            fig.add_trace(go.Scatter(x=df_smooth.index, y=df_smooth[column], mode='markers', name=f'new {column}', marker=dict(symbol='x', size=3)))

# Likelihood limit drawn as the red line. It is computed exactly the way
# filter_and_smooth_df computes it, so the plot shows the limit that was really
# applied: no lower clamp, and the 'median' option is supported too.
likelihood = df_raw[f'{bodypart}_likelihood']
std_dev = likelihood.std()
if llhd_tol == 'mean':
    limit = likelihood.mean() - num_std*std_dev
elif llhd_tol == 'median':
    limit = likelihood.median() - num_std*std_dev
else:
    limit = llhd_tol

# Update layout for secondary y-axis
fig.update_layout(
    xaxis=dict(title='Video frame'),
    yaxis=dict(title=f'{bodypart} position (pixels)'),
    yaxis2=dict(title=f'{bodypart} likelihood', 
                overlaying='y', 
                side='right',
                gridcolor='black'),
    title=f'{bodypart} position & likelihood',
    width=1024,
    height=512,
    legend=dict(yanchor="bottom",
                y=1.02,
                xanchor="center",
                x=0.5,
                orientation="h"),
    # Horizontal line at the likelihood limit, referenced to the secondary axis
    shapes=[dict(type='line', 
                 x0=df_raw.index.min(), 
                 x1=df_raw.index.max(), 
                 y0=limit, 
                 y1=limit, 
                 line=dict(color='red', dash='solid'),
                 yref='y2')]
)

# Show plot
fig.show()

In [19]:
# Keep only the frames where every coordinate is available
df_smooth = df_smooth.dropna()

# Calculate the distance between ears (in pixels) on every remaining frame
dist = np.sqrt(
    (df_smooth['L_ear_x'] - df_smooth['R_ear_x'])**2 + 
    (df_smooth['L_ear_y'] - df_smooth['R_ear_y'])**2).dropna()

# Calculate the mean and median
mean_dist = np.mean(dist)
median_dist = np.median(dist)

# Use the configured real ear distance instead of a hard-coded 1.8, so this
# preview reports the same scale that process_hdf5_file will apply
scale = (ear_dist / median_dist)

print(f'median distance is {median_dist}, mean distance is {mean_dist}. scale is {scale*100}')

# Create the plot
fig = go.Figure()

# Add the distance trace
fig.add_trace(go.Scatter(y=dist, mode='lines', name='Distance between ears'))

# Add mean and median lines
fig.add_trace(go.Scatter(y=[mean_dist]*len(dist), mode='lines', name=f'Mean: {mean_dist:.2f}', line=dict(color='red', dash='dash')))
fig.add_trace(go.Scatter(y=[median_dist]*len(dist), mode='lines', name=f'Median: {median_dist:.2f}', line=dict(color='black')))

# Update layout
fig.update_layout(
    title='Distance between ears',
    xaxis_title='Frame',
    yaxis_title='Distance (pixels)',
    width=720,
    height=480,
    legend=dict(
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5,
        orientation="h")
)

# Show the plot
fig.show()

median distance is 37.364324734286996, mean distance is 37.57827402007402. scale is 4.817429494044217


In [20]:
def process_hdf5_file(path, stat_points = [], distance = 1.8, fps = 30, tolerance = 'mean', stride = 2, median_filt = 3):
    """
    Convert every '*_position.h5' file in `path` into a filtered, smoothed and
    rescaled '.csv' file saved next to it.

    Parameters
    ----------
    path : str
        Folder that is searched (non-recursively) for files ending in '_position.h5'.
    stat_points : iterable of str
        Names of stationary objects, forwarded to `filter_and_smooth_df`.
    distance : number
        Real-life distance between the ears; positions are multiplied by
        `distance / median ear distance in pixels`.
    fps : number
        Frames per second, only used to report the time before the first valid frame.
    tolerance, stride, median_filt
        Forwarded unchanged to `filter_and_smooth_df`.

    Returns
    -------
    None. One CSV per processed file is written and a summary line is printed.
    A file whose ear distance cannot be measured is reported and skipped.
    """

    # Sorted so the processing (and printing) order does not depend on the file system
    h5_files = sorted(file for file in os.listdir(path) if file.endswith('_position.h5'))

    for h5_file in h5_files:

        h5_file_path = os.path.join(path, h5_file)

        df = pd.read_hdf(h5_file_path)
        scorer = df.columns.levels[0][0]
        bodyparts = df.columns.levels[1].to_list()
        df = df[scorer]

        # Flatten the two-level column labels into "<first>_<second>" names
        df_raw = pd.DataFrame({f"{key[0]}_{key[1]}": df[key] for key in df.keys()})

        df_smooth = filter_and_smooth_df(df_raw, bodyparts, stat_points, tolerance, stride, median_filt)

        # Drop the likelihood columns
        df_smooth = df_smooth.drop(columns=df_smooth.filter(like='likelihood').columns)

        # Keep only the frames where every coordinate is available
        # (same rows as dropna(); the mask is reused below to count leading frames)
        valid_rows = df_smooth.notna().all(axis=1).to_numpy()
        df_smooth = df_smooth[valid_rows]

        # Calculate the median distance between ears
        dist = np.sqrt(
            (df_smooth['L_ear_x'] - df_smooth['R_ear_x'])**2 + 
            (df_smooth['L_ear_y'] - df_smooth['R_ear_y'])**2)
        median_dist = dist.median()

        # Without a finite, positive ear distance the scale would be NaN or infinite
        # and the CSV would be empty or meaningless: report the file and move on.
        if not np.isfinite(median_dist) or median_dist <= 0:
            print(f"{h5_file} skipped: no valid distance between ears could be measured.")
            continue

        # As the distance between ears is a constant that can be measured in real life, we can use it to scale different sized videos into the same size.
        scale = (distance / median_dist)
        df_smooth = df_smooth * scale

        # Same directory and name as the input file, with the extension replaced by '.csv'
        output_csv_path = os.path.splitext(h5_file_path)[0] + '.csv'

        # Save the processed data as a CSV file
        df_smooth.to_csv(output_csv_path, index=False)

        # Time before the first valid frame. Only the leading dropped frames are
        # counted, so frames dropped at the end of the video no longer inflate it.
        mouse_enters = int(valid_rows.argmax()) / fps

        print(f"{h5_file} has {df_smooth.shape[1]} columns. The mouse took {mouse_enters:.2f} sec to enter. scale is {scale*100:.2f}.")

In [21]:
# Convert every position file of the experiment folder using the settings chosen above
process_hdf5_file(
    path=folder,
    stat_points=stationary_points,
    distance=ear_dist,
    fps=video_fps,
    tolerance=llhd_tol,
    stride=num_std,
    median_filt=smooth_window,
)

2024-04_TORM-Tg-2m_TS_C1_A_L_position.h5 has 22 columns. The mouse took 1.08 sec to enter. scale is 4.94.
2024-04_TORM-Tg-2m_TS_C1_A_R_position.h5 has 22 columns. The mouse took 0.00 sec to enter. scale is 4.75.
2024-04_TORM-Tg-2m_TS_C1_B_L_position.h5 has 22 columns. The mouse took 2.48 sec to enter. scale is 4.98.
2024-04_TORM-Tg-2m_TS_C1_B_R_position.h5 has 22 columns. The mouse took 1.76 sec to enter. scale is 5.28.
2024-04_TORM-Tg-2m_TS_C2_A_L_position.h5 has 22 columns. The mouse took 1.56 sec to enter. scale is 5.08.
2024-04_TORM-Tg-2m_TS_C2_A_R_position.h5 has 22 columns. The mouse took 0.00 sec to enter. scale is 4.98.
2024-04_TORM-Tg-2m_TS_C3_A_L_position.h5 has 22 columns. The mouse took 0.88 sec to enter. scale is 4.93.
2024-04_TORM-Tg-2m_TS_C3_A_R_position.h5 has 22 columns. The mouse took 2.32 sec to enter. scale is 5.02.
2024-04_TORM-Tg-2m_TS_C3_B_L_position.h5 has 22 columns. The mouse took 0.00 sec to enter. scale is 4.81.
2024-04_TORM-Tg-2m_TS_C4_B_L_position.h5 has 2

In [22]:
def filter_and_move_files(input_folder, word, folder_name):
    """
    Move the CSV files whose name contains `word` into
    `<input_folder>/<folder_name>/position`.

    A file is moved when its name contains `word`, ends with '.csv' and does
    not contain 'filtered'. Sub-folders of `input_folder` are not searched.

    Parameters
    ----------
    input_folder : str
        Folder holding the files to sort.
    word : str
        Keyword that must appear in the file name.
    folder_name : str
        Name of the sub-folder (created if needed) that receives the files,
        inside a nested 'position' folder.
    """

    # Create the destination sub-folder
    output_folder = os.path.join(input_folder, folder_name, "position")
    os.makedirs(output_folder, exist_ok=True)

    for file in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file)

        # Only regular files are candidates; directories stay in place
        if not os.path.isfile(file_path):
            continue

        # The extension is matched with endswith: a plain substring test would
        # also move files such as 'x.csv.bak'
        if word in file and file.endswith(".csv") and "filtered" not in file:
            shutil.move(file_path, os.path.join(output_folder, file))

    print("Files filtered and moved successfully.")

In [23]:
# Sort the CSV files of every group into "<folder>/<group>/position"
for group in groups:
    filter_and_move_files(input_folder=folder, word=group, folder_name=group)

Files filtered and moved successfully.


In [24]:
"""
Lets also clean all other files in the folder into a subfolder
"""

# Destination for every file that is still sitting directly in the experiment folder
subfolder = os.path.join(folder, "h5 files & others")
os.makedirs(subfolder, exist_ok=True)
    
# List the regular files left in the folder (sub-folders are not included)
other_files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

# Move every remaining file, whatever its type, into the subfolder
for file in other_files:
    file_path = os.path.join(folder, file)
    output_path = os.path.join(subfolder, file)
    
    # Relocate the file, keeping its name
    shutil.move(file_path, output_path)

print("All .H5 files are stored away")

All .H5 files are stored away
