In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import SPTCata as spt
import sys, os, re
import pandas as pd
import scipy.stats as stats
import kuiper

# 2019-07-24 Quality of trajectories

In this notebook, I want to explore several aspects that are related to the quality of the data of the SPT.

## Long trajectories

I observed that the MSD at the longest time intervals drops to zero. I want to have a look at what's going on there.

In [None]:
# Locations of the trajectories: one root directory containing one
# subdirectory per experimental condition.
# NOTE(review): this is an absolute path on the author's machine; adapt it
# before running the notebook elsewhere.
spt_rootdir = '/home/rcortini/work/CRG/projects/catadata/data'
subdirs = [
    '0_Tannic_Acid_6h',
    '1_Olaparib_R5020',
    '2_DMSO_R5020_Control',
    '3_EtOH_Nohormone_Control',
]

# full path of each condition directory
datadirs = [spt_rootdir + '/' + condition for condition in subdirs]

In [None]:
def load_spt(datadir, quality = None) :
    """
    Load the trajectories stored in one directory.

    The spot table `<datadir>/Spots in tracks statistics.txt` (tab-separated)
    is grouped by its TRACK_ID column, and every group becomes one trajectory.

    Parameters
    ----------
    datadir : str
        Directory containing `Spots in tracks statistics.txt` and, when
        `quality` is given, `Track statistics.csv`.
    quality : float or None
        If not None, every track whose TRACK_MEAN_QUALITY (read from
        `<datadir>/Track statistics.csv`) is strictly lower than this value
        is skipped. If None, all the tracks are returned.

    Returns
    -------
    list of numpy.ndarray
        One array of shape (n_points, 2) per retained track, with columns
        POSITION_X and POSITION_Y. Tracks are ordered by TRACK_ID and the
        points of a track keep the row order of the file.
    """
    # load the SPT data file. The frame is not called `spt`, so that it does
    # not shadow the `spt` module imported at the top of the notebook.
    spots_fname = '%s/Spots in tracks statistics.txt'%(datadir)
    spots = pd.read_csv(spots_fname, sep='\t')

    # get quality of tracks and the identifiers of the excluded tracks
    if quality is not None :
        track_statistics = pd.read_csv('%s/Track statistics.csv'%(datadir))
        low_quality = track_statistics.TRACK_MEAN_QUALITY < quality
        # Store the TRACK_ID *values* in a set: `x in some_series` tests the
        # index labels of the Series, not its values, so testing against the
        # Series itself would exclude the wrong tracks (or none at all).
        tracks_to_exclude = set(track_statistics.loc[low_quality, 'TRACK_ID'].tolist())
    else :
        tracks_to_exclude = set()

    # extract trajectories, one per TRACK_ID
    trajectories = []
    for track_id, track in spots.groupby('TRACK_ID') :

        # skip tracks that did not have sufficiently high average quality
        if track_id in tracks_to_exclude :
            continue

        # stack x and y as the two columns of an (n_points, 2) array
        x = track['POSITION_X']
        y = track['POSITION_Y']
        trajectories.append(np.array([x, y]).T)

    return trajectories

In [None]:
def load_experiments(datadirs, quality=None) :
    """
    Load the trajectories of every cell found below the given directories.

    Each directory of `datadirs` is walked recursively with `os.walk`. Every
    subdirectory whose name starts with the pattern "Stack<N>_Cell<N>" is
    handed to `load_spt`, together with `quality`.

    Returns a dictionary that maps the path of each such subdirectory
    (formatted as "<parent>/<name>") to the list of trajectories returned by
    `load_spt` for it.
    """
    # directory names are tested from their first character (`match`)
    cell_pattern = re.compile('Stack[0-9]+_Cell[0-9]+')

    # init the output data structure
    experiments = {}

    for datadir in datadirs :
        # `children` holds the bare names of the directories inside `parent`
        for parent, children, _files in os.walk(datadir) :
            for child in children :
                if cell_pattern.match(child) is None :
                    continue

                # the name looks like a cell directory: build its path, which
                # is used both for loading and as the key of the output
                cell_dir = '%s/%s'%(parent, child)
                experiments[cell_dir] = load_spt(cell_dir, quality=quality)
    return experiments

In [None]:
# load all the experiments in the batch provided. `quality = 50.0` is the
# TRACK_MEAN_QUALITY threshold that `load_experiments` forwards to `load_spt`.
experiments = load_experiments(datadirs, quality = 50.0)

Now let's do a visual comparison of short and long trajectories.

In [None]:
# a trajectory is "long" if it has more than `min_length` points and "short"
# if it has fewer than `max_length` points
min_length = 100
max_length = 20
long_trajectories = []
short_trajectories = []
for experiment, trajectories in experiments.items() :
    for trajectory in trajectories :
        n_points = trajectory.shape[0]
        is_long = n_points > min_length
        is_short = n_points < max_length
        if is_long :
            long_trajectories.append(trajectory)
        # keep at most 100 short trajectories
        if is_short and len(short_trajectories) < 100 :
            short_trajectories.append(trajectory)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10,4))

# one panel per group of trajectories: long on the left, short on the right
panels = [
    (axes[0], long_trajectories, "Long trajectories"),
    (axes[1], short_trajectories, "Short trajectories"),
]
for ax, group, title in panels :
    for trajectory in group :
        ax.plot(trajectory[:,0], trajectory[:,1])
    ax.set_title(title, fontsize=20)
    ax.set_xlabel(r"X [$\mu$m]")
    ax.set_ylabel(r"Y [$\mu$m]")

# final touches
plt.subplots_adjust(bottom=0.25, top=0.75)
plt.show()

By a quick visual inspection, it seems that there is a bias here: the long trajectories are more spatially compact than the short ones. Let's try to make a quantitative estimate of this bias. I'll measure the radius of gyration of the trajectories. For a trajectory described by the position vectors $\{\vec{r}_i\}_{i=1}^N$, I define it here as the mean distance of the points from the center of mass (rather than the more common root-mean-square distance):

$r_{gyr} = \frac{1}{N}\sum_{i=1}^N |\vec{r}_i - \vec{r}_{cm}|$

where $\vec{r}_{cm}$ is the "center of mass" of the trajectory, defined as

$\vec{r}_{cm} = \frac{1}{N} \sum_{i=1}^N \vec{r}_i$

In [None]:
def radius_of_gyration(trajectory) :
    """
    Return the center of mass of a trajectory and its radius of gyration.

    `trajectory` is an array with one point per row. The center of mass is
    the average of the rows; the radius of gyration is computed as the mean
    Euclidean distance of the points from the center of mass.

    Returns the tuple `(center_of_mass, radius)`.
    """
    n_points = trajectory.shape[0]
    center_of_mass = trajectory.sum(axis=0)/n_points

    # distance of every point from the center of mass
    distances = np.linalg.norm(trajectory - center_of_mass, axis=1)
    return center_of_mass, distances.mean()

Let's test this function and give a visual representation.

In [None]:
# use one of the short trajectories as a test case
traj = short_trajectories[2]
rcm, rgyr = radius_of_gyration(traj)

fig, ax = plt.subplots()
ax.plot(traj[:,0], traj[:,1], 'o--')

# circle centered on the center of mass, with the radius of gyration as radius
circle = plt.Circle(rcm, rgyr, fill=False, linewidth=2)
ax.add_artist(circle)

ax.set_xlabel(r"X [$\mu$m]")
ax.set_ylabel(r"Y [$\mu$m]")
plt.show()

This works. Let's now take a global approach, and calculate the radius of gyration of all the trajectories and study the relationship between trajectory length and radius of gyration.

In [None]:
# one row per trajectory: (number of points, radius of gyration)
lg = []
for experiment in experiments :
    for trajectory in experiments[experiment] :
        rcm, rgyr = radius_of_gyration(trajectory)
        lg.append([trajectory.shape[0], rgyr])
lg = np.array(lg)

In [None]:
def density_colors (x,y) :
    """
    Estimate the local density of a set of 2D points, to color-code a scatter
    plot by the density of the neighbouring points. Adapted from
    http://stackoverflow.com/a/20107592/2312821

    A Gaussian kernel density estimate is fitted to the points (x, y) and
    evaluated at those same points.

    Returns `(z, idx)`: `z` is the density at each point, and `idx` is the
    permutation that sorts `z` in increasing order, so that plotting the
    points in that order draws the densest ones last.
    """
    points = np.vstack([x,y])
    kde = stats.gaussian_kde(points)
    density = kde.evaluate(points)
    order = np.argsort(density)
    return density, order

In [None]:
# calculate the density of the points in the (length, radius of gyration)
# plane: column 0 of `lg` is the trajectory length, column 1 the radius
z, idx = density_colors(lg[:,0], lg[:,1])

In [None]:
fig, ax = plt.subplots()
# Points are drawn in order of increasing density (`idx`), so that the densest
# regions end up on top. 'none' is the documented way of drawing markers
# without an edge; an empty string is not a valid color.
cax = ax.scatter(lg[idx,0], lg[idx,1], c=np.log(z[idx]), s=10, edgecolors='none')
ax.set_xlabel("Length of trajectory")
ax.set_ylabel("Radius of gyration")
cbar = fig.colorbar(cax, ax=ax)
cbar.set_label("log Density")
plt.show()

This graph shows:

1. that the longest trajectories are indeed the most compact
2. that the majority of the trajectories are either short and expanded or long and compact

This means that there is indeed a bias: because of the way the tracks are detected, a single particle cannot be followed for a long time unless it stays trapped around one point in space.

## Biased directions

In the figure above, where I showed long and short trajectories, there was also something peculiar: the short trajectories all seemed to point in the same direction. Let's study this cell by cell.

In [None]:
def phi(R) :
    """
    Return the angle of each 2D vector with respect to the x axis.

    `R` is an array with one vector (x, y) per row. The returned angles are in
    radians, between 0 and 2*pi, counted from the positive x axis. Vectors of
    zero length give NaN, because they are divided by their norm.
    """
    lengths = np.linalg.norm(R, axis=1)

    # arccos of the normalized x component gives the angle in [0, pi] ...
    unsigned_angle = np.arccos(R[:,0]/lengths)

    # ... which has to be reflected to (pi, 2*pi) for the vectors that point
    # below the x axis
    below_axis = R[:,1] < 0
    reflected = unsigned_angle + 2*(np.pi - unsigned_angle)
    return np.where(below_axis, reflected, unsigned_angle)

def trajectory_directions(trajectory) :
    """
    Return the direction of every step of a trajectory.

    The steps are the differences between consecutive points of `trajectory`
    (one point per row). Steps in which neither coordinate changes are
    discarded, because they have no direction; the angle of each remaining
    step is computed with `phi`.
    """
    steps = np.diff(trajectory, axis=0)
    has_moved = np.logical_or(steps[:,0] != 0, steps[:,1] != 0)
    return phi(steps[has_moved])

In [None]:
# Select the trajectory to inspect explicitly (the same short trajectory used
# to test the radius of gyration), instead of relying on whatever value the
# variable `trajectory` was left with by a previous loop.
trajectory = short_trajectories[2]
phis = trajectory_directions(trajectory)

fig, ax = plt.subplots()
ax.plot(trajectory[:,0], trajectory[:,1], 'o--', markersize=8)
ax.set_xlabel("X [microns]", fontsize=24)
ax.set_ylabel("Y [microns]", fontsize=24)

# label every point with its (1-based) position along the trajectory
for i in range(trajectory.shape[0]) :
    ax.text(trajectory[i,0], trajectory[i,1], str(i+1), fontsize=14)

# `trajectory_directions` drops the steps with zero displacement, so `phis`
# can be shorter than the number of steps: recover the index of the steps
# that were kept, so that every angle is printed with the right label and
# `phis` is never indexed out of range
steps = np.diff(trajectory, axis=0)
moving_steps = np.flatnonzero(np.any(steps != 0, axis=1))
for step, angle in zip(moving_steps, phis) :
    print("phi_%d = %.4f radians (%.2f degrees)"%(step, angle, angle*180/np.pi))
plt.show()

Armed with a function that gives the absolute angle $\phi$, measured with respect to the $x$ axis, along which the particle travels at each step, let's now calculate the angles of all the steps of all the trajectories, cell by cell.

In [None]:
# for every cell, pool the step directions of all its trajectories
phi_cells = {}
for experiment, trajectories in experiments.items() :
    cell_angles = [
        angle
        for cell_trajectory in trajectories
        for angle in trajectory_directions(cell_trajectory)
    ]
    phi_cells[experiment] = np.array(cell_angles)

In [None]:
def plot_angles(ax, angles, bins, title) :
    """
    Draw the histogram of `angles` as a bar plot on `ax`.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on (the caller in this notebook passes a polar axes).
    angles : array-like
        Angles to histogram.
    bins : int or sequence
        Passed to `numpy.histogram`.
    title : str
        Currently not drawn: the caller sets its own title. The parameter is
        kept so that existing calls keep working.

    Returns
    -------
    (fig, ax) : the figure that owns `ax`, and `ax` itself.
    """
    # the first thing is to create the histogram of the angles, with the
    # specified bins
    counts, angle_edges = np.histogram(angles, bins = bins)

    # Centers and widths of the bars are derived from the bin edges that were
    # actually used: the edges span the range of the data, which is not
    # necessarily [0, 2*pi].
    bin_widths = np.diff(angle_edges)
    angle_centers = angle_edges[:-1] + bin_widths/2

    # plot the histogram; the bars are centered on `angle_centers`
    ax.bar(angle_centers, counts, width = bin_widths, edgecolor = 'k')

    # finishing touches: the tick labels are hidden
    ax.set_xticklabels([])
    ax.set_yticklabels([])

    # return the figure of `ax`, rather than a global variable called `fig`
    return ax.figure, ax

In [None]:
def uniform(x) :
    """
    Cumulative distribution function of the uniform distribution on [0, 1].

    Returns 0 for x <= 0, x for 0 < x < 1 and 1 for x >= 1, element by
    element. NaN values are mapped to 0. The result is always a float array.
    """
    x = np.asarray(x, dtype=float)
    y = np.zeros_like(x)
    inside = np.logical_and(x>0, x<1)
    y[inside] = x[inside]
    # a cumulative distribution must saturate at 1 at and beyond the upper
    # end of the support (an angle of exactly 2*pi maps to x == 1)
    y[x>=1] = 1
    return y

In [None]:
# Collect the cells whose step directions are not compatible with a uniform
# distribution of angles, and show their angle histogram and trajectories.
bad_cells = []
for experiment, phi_vals in phi_cells.items() :
    # discard undefined angles before testing, so that the test, the reported
    # N and the histogram are all based on the same values
    phi_vals = phi_vals[~np.isnan(phi_vals)]

    # check that we have at least some points
    if len(phi_vals) == 0 :
        continue

    # Kuiper test of the angles, rescaled to [0, 1], against `uniform`
    d, p = kuiper.kuiper(phi_vals/(2*np.pi), uniform)
    if p < 0.1 :
        # Path of the cell relative to the root directory. `str.lstrip` must
        # not be used for this: it removes any leading characters that belong
        # to the given set, not a prefix.
        cell_name = os.path.relpath(experiment, spt_rootdir)

        fig = plt.figure(figsize=(10,5))

        # plot angles
        ax = plt.subplot(121, projection = 'polar')
        plot_angles(ax, phi_vals, 16, experiment)
        ax.set_title('p = %.2f (N = %d)'%(p, len(phi_vals)))

        # plot trajectories
        ax = plt.subplot(122)
        for trajectory in experiments[experiment] :
            ax.plot(trajectory[:,0], trajectory[:,1])
        ax.set_title(cell_name)
        bad_cells.append(cell_name)
        plt.show()

In [None]:
# display the names of the cells that were flagged above (p < 0.1)
bad_cells