In [None]:
# Load IPython's autoreload extension and set it to mode 2, so that imported
# modules are reloaded before each cell is executed (edits to local modules
# are picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [None]:
import os, sys
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt1  # original alias, kept so any code relying on it still works
import numpy as np
import pandas as pd

# for fitting
from scipy.optimize import curve_fit

import SPTCata as spt

# 2019-05-13 Filter data

I want to filter out the tracks that have low quality.

## Preliminaries

### Load data

In [None]:
# get all the directory names where we have 10 Spots tracks
# (treatment label -> directory holding one entry per experiment)
datadirs = {
    'Olaparib_R5020' : '../data/1_Olaparib_R5020',
    'R5020' : '../data/2_DMSO_R5020_Control',
    'EtOH' : '../data/3_EtOH_Nohormone_Control'
}

# cycle through all the directories and load every experiment:
#   experiments[treatment] -> list of spt.SPT objects, one per sub-directory
#   data[treatment]        -> trajectories of all those experiments, pooled
experiments = defaultdict(list)
data = defaultdict(list)
for treatment, datadir in datadirs.items() :
    # os.listdir returns entries in arbitrary order; sort them so that
    # positional access such as experiments['EtOH'][7] is reproducible.
    for subdir in sorted(os.listdir(datadir)) :
        full_dir_name = os.path.join(datadir, subdir)
        # Each experiment lives in its own directory; skip stray files
        # (e.g. .DS_Store) instead of handing them to spt.SPT.
        if not os.path.isdir(full_dir_name) :
            continue
        experiment = spt.SPT(full_dir_name, links=False)
        experiments[treatment].append(experiment)
        trajectories = experiment.trajectory_spots
        data[treatment].extend(trajectories)

### Track quality

Now, let's try to parse one of the files named "Track statistics.csv".

In [None]:
# Use a single EtOH experiment as a test case and read the track statistics
# table stored in that experiment's data directory.
etoh_batch = experiments['EtOH']
experiment = etoh_batch[7]
stats_path = '%s/Track statistics.csv' % experiment.datadir
track_statistics = pd.read_csv(stats_path)

In [None]:
# Sanity check: row i of the statistics table should describe trajectory i of
# the experiment. Compare the number of tracks first -- looping over the rows
# alone would not notice trajectories that have no row in the table -- and
# then compare the spot count of every (row, trajectory) pair.
# (assumes trajectory_spots supports len() and positional indexing -- TODO confirm)
if len(track_statistics) != len(experiment.trajectory_spots) :
    print("Error: %d rows in the statistics table but %d trajectories"%(
        len(track_statistics), len(experiment.trajectory_spots)))
else :
    # enumerate gives the row position, independent of the DataFrame index labels
    for index, n_spots in enumerate(track_statistics['NUMBER_SPOTS']) :
        if len(experiment.trajectory_spots[index]) != n_spots :
            print("Error")

Okay, this seems to be working. Let's try it on a large scale.

In [None]:
# Repeat the consistency check for every experiment of every treatment.
# The data directory of each inconsistent experiment is printed once;
# no output means that all experiments passed.
for experiment_batch in experiments.values() :
    for experiment in experiment_batch :
        track_statistics = pd.read_csv('%s/Track statistics.csv'%(experiment.datadir))
        spots = experiment.trajectory_spots
        # A different number of tracks is already a mismatch; checking it first
        # also catches trajectories that have no row in the statistics table.
        # (assumes trajectory_spots supports len() -- TODO confirm)
        if len(track_statistics) != len(spots) :
            print(experiment.datadir)
            continue
        for index, n_spots in enumerate(track_statistics['NUMBER_SPOTS']) :
            if len(spots[index]) != n_spots :
                print(experiment.datadir)
                break

No output, so the check passes for every experiment.

Now that I'm sure that each row of a "Track statistics.csv" file corresponds to one track of the same experiment, I can look at the distribution of quality values for all the tracks.

In [None]:
# Pool the mean quality of every track, grouped by treatment:
#   quality[treatment] -> list with one TRACK_MEAN_QUALITY value per track
quality = defaultdict(list)
for treatment, experiment_batch in experiments.items() :
    for experiment in experiment_batch :
        track_statistics = pd.read_csv('%s/Track statistics.csv'%(experiment.datadir))
        # Take the whole column at once instead of walking the table row by row.
        quality[treatment].extend(track_statistics['TRACK_MEAN_QUALITY'].tolist())

In [None]:
# Draw one histogram of the mean track quality per treatment.
# `plt` is matplotlib.pyplot, imported under that alias in the import cell.
for treatment, quality_values in quality.items() :
    # One explicit figure/axes pair per treatment, so nothing leaks from one
    # plot into the next through pyplot's implicit "current axes" state.
    fig, ax = plt.subplots()
    ax.hist(quality_values, bins = 100)
    ax.set_title(treatment, fontsize = 18)
    ax.set_xlabel("Quality", fontsize = 16)
    ax.set_ylabel("Count", fontsize = 16)
    plt.show()

For Olaparib and R5020 the quality distributions are similar. However, the EtOH experiment has a slightly different distribution, with fewer high-quality tracks.

I can now decide on a threshold to apply. The way I'll do it is to modify the constructor of the `SPT` class to include an optional `quality` parameter.

In [None]:
# Quality threshold handed to the SPT constructor.
# NOTE(review): presumably tracks whose quality is below this value are
# discarded -- confirm against the SPTCata implementation.
QUALITY_THRESHOLD = 50

# cycle through all the directories and do the analysis, this time with the
# quality filter switched on:
#   experiments_filtered[treatment] -> list of spt.SPT objects
#   data_filtered[treatment]        -> trajectories of those experiments, pooled
experiments_filtered = defaultdict(list)
data_filtered = defaultdict(list)
for treatment, datadir in datadirs.items() :
    # os.listdir returns entries in arbitrary order; sort them so that the
    # experiments are loaded in a reproducible order.
    for subdir in sorted(os.listdir(datadir)) :
        full_dir_name = os.path.join(datadir, subdir)
        # Each experiment lives in its own directory; skip stray files
        # (e.g. .DS_Store) instead of handing them to spt.SPT.
        if not os.path.isdir(full_dir_name) :
            continue
        experiment = spt.SPT(full_dir_name, links=False, quality=QUALITY_THRESHOLD)
        experiments_filtered[treatment].append(experiment)
        trajectories = experiment.trajectory_spots
        data_filtered[treatment].extend(trajectories)