In [1]:
# Mount Google Drive into the Colab filesystem; the data files, the InceptionTime
# checkout and the output directory used further down all live under this mount.
from google.colab import drive

ROOT = '/content/drive'     # mount point for the drive
drive.mount(ROOT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install sktime, which the import cell below depends on.
# NOTE(review): the version is not pinned, yet the imports below reach into the
# private module `sktime.datatypes._panel._convert` and into `sktime.utils.data_io`;
# consider pinning the release this notebook was developed against.
!pip install sktime



In [3]:
from keras import models
from keras import layers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.stats import linregress
from sktime.datatypes._panel._convert import (
    from_multi_index_to_nested,
    from_multi_index_to_3d_numpy,
    from_nested_to_multi_index,
    from_nested_to_3d_numpy,
)
from sktime.utils.data_io import make_multi_index_dataframe
import sys

sys.path.insert(1, "/content/drive/My Drive/Git/InceptionTime")
from classifiers import inception


In [4]:
''' 
read the data 
'''

# Folder on the mounted Drive that holds both CSV exports.
directory = Path("/content/drive/My Drive/Git/n_track_ML/scripts/")
# `data_file` feeds the train/held-out split further down; `test_data_file` is kept
# separate and is only passed to classifier.predict.
data_file = directory / '63455ea_data_chromatin_live.csv'
test_data_file = directory / 'a286935_data_chromatin_live.csv'

# Load the raw tables (one row per tracked frame, see the preview below).
data = pd.read_csv(data_file)
test_data = pd.read_csv(test_data_file)

# initial filtering based on experimental setup
def initial_filtering(data, excluded_comments=("stress_control", "H2B"),
                      guide_patterns=("1398", "1514"), max_time=40):
    """Keep only the rows that match the experimental setup of interest.

    A row is kept when all of the following hold:

    * its ``comment`` is not one of ``excluded_comments``;
    * its ``guide`` contains at least one of ``guide_patterns`` (literal
      substring match; rows with a missing guide are dropped);
    * its ``time`` is strictly below ``max_time``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with at least the columns ``comment``, ``guide`` and ``time``.
    excluded_comments : iterable of str, optional
        ``comment`` values to discard.
    guide_patterns : iterable of str, optional
        Substrings, any of which must occur in ``guide``. An empty iterable
        keeps no rows.
    max_time : number, optional
        Exclusive upper bound on ``time``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with their original index; ``data`` is not modified.
    """
    keep = ~data["comment"].isin(list(excluded_comments))

    # OR together one literal substring test per accepted guide. na=False makes
    # a missing guide count as "no match" instead of leaking NaN into the mask.
    guide_match = pd.Series(False, index=data.index)
    for pattern in guide_patterns:
        guide_match |= data["guide"].str.contains(pattern, regex=False, na=False)

    keep &= guide_match
    keep &= data["time"] < max_time

    return data[keep]

#data = initial_filtering(data)

# Preview the raw table; filtering is not applied here but later, inside pipeline_xy.
data.head()
#data.info()

Unnamed: 0.1,file,particle,script_version_git,date,guide,time,serum_conc_percent,frame,y,x,y_micron,diff_y_micron,x_micron,diff_x_micron,diff_xy_micron,diff_xy_micron**2,area_micron,perimeter_au,perimeter_au_norm,min_dist_pxs,min_dist_micron,sqrt(area/pi),min_dist/(sqrt(area/pi)),comment,comment_long,Unnamed: 0,mass,size
0,20200212marianasCh2_lamina_draq.sld_-_Capture_10,0,c5b01d421acbafbe247a0f6af2f9b108bc64957d,20200212,pl_1398_chr1,0,10.0,0,157.903318,263.50218,13.074395,,21.817982,,,,517.067512,3112,21.335377,67.858866,5.618714,12.829174,0.437964,LaminA,"2020_0212_0317_0325_0325d2_v0.41_filtered, ch1...",0.0,703266.6804,2.327784
1,20200212marianasCh2_lamina_draq.sld_-_Capture_10,0,c5b01d421acbafbe247a0f6af2f9b108bc64957d,20200212,pl_1398_chr1,0,10.0,1,157.460077,263.730134,13.037695,-0.0367,21.836856,0.018875,0.041269,0.001703,517.458295,3113,21.342232,67.370626,5.578288,12.834021,0.434649,LaminA,"2020_0212_0317_0325_0325d2_v0.41_filtered, ch1...",2.0,660743.7182,2.311907
2,20200212marianasCh2_lamina_draq.sld_-_Capture_10,0,c5b01d421acbafbe247a0f6af2f9b108bc64957d,20200212,pl_1398_chr1,0,10.0,2,157.695945,263.345917,13.057225,0.01953,21.805043,-0.031813,0.03733,0.001393,517.814798,3124,21.417647,67.170866,5.561748,12.838441,0.433211,LaminA,"2020_0212_0317_0325_0325d2_v0.41_filtered, ch1...",4.0,624646.8149,2.336246
3,20200212marianasCh2_lamina_draq.sld_-_Capture_10,0,c5b01d421acbafbe247a0f6af2f9b108bc64957d,20200212,pl_1398_chr1,0,10.0,3,158.081874,262.945842,13.08918,0.031955,21.771917,-0.033126,0.046027,0.002118,517.362313,3120,21.390223,67.721766,5.607363,12.832831,0.436954,LaminA,"2020_0212_0317_0325_0325d2_v0.41_filtered, ch1...",6.0,588878.5174,2.337342
4,20200212marianasCh2_lamina_draq.sld_-_Capture_10,0,c5b01d421acbafbe247a0f6af2f9b108bc64957d,20200212,pl_1398_chr1,0,10.0,4,157.598905,263.285584,13.04919,-0.03999,21.800048,0.028131,0.048893,0.002391,517.13607,3129,21.451926,67.132238,5.55855,12.830024,0.433245,LaminA,"2020_0212_0317_0325_0325d2_v0.41_filtered, ch1...",8.0,557282.8026,2.361917


In [5]:
def normalize_xy(data):
    """Shift every track so that its smallest x and y coordinates become zero.

    A track is identified by the (``file``, ``particle``) pair. The per-track
    extremes are attached as the columns ``x_max``, ``y_max``, ``x_min`` and
    ``y_min`` (in that order), followed by ``x_norm`` and ``y_norm``, which hold
    the coordinates minus the track minimum. A new DataFrame is returned.
    """
    track_keys = ['file', 'particle']

    # attach the per-track extreme values, maxima first and then minima
    for stat in ('max', 'min'):
        for axis in ('x', 'y'):
            per_track = data.groupby(track_keys)[axis].agg(stat)
            data = data.join(per_track, on=track_keys, rsuffix='_' + stat)

    # subtract the track minimum from each coordinate
    for axis in ('x', 'y'):
        data[axis + '_norm'] = data[axis] - data[axis + '_min']

    return data

#data = normalize_xy(data)

def create_instance_index(data):
    """Add an ``fp`` column that identifies each track.

    The identifier is ``"<file>__<particle>"`` and is used later on as the
    instance index of the time series.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a string column ``file`` and a column ``particle``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the additional (or replaced) ``fp`` column. The input
        is left untouched, so re-running a cell or passing a filtered slice
        neither changes earlier variables nor triggers SettingWithCopyWarning.
    """
    track_id = data['file'] + '__' + data['particle'].astype(str)
    return data.assign(fp=track_id)

#data = create_instance_index(data)
#data.head()


In [6]:
def data_to_multi_index(data):
    """Index the table by (``fp``, ``frame``) and keep only the model columns.

    Returns a frame with the columns ``x_norm``, ``y_norm``, ``min_dist_pxs``
    and ``serum_conc_percent``, in that order.
    """
    kept_columns = ['x_norm', 'y_norm', 'min_dist_pxs','serum_conc_percent']
    indexed = data.set_index(['fp','frame'])
    return indexed[kept_columns]

#datam = data_to_multi_index(data)
#datam.head()


In [7]:
def nested_max(row, col_name='col'):
    """Return the maximum of the nested cell stored under ``row[col_name]``."""
    cell = row[col_name]
    return cell.max()

def nested_size(row, col_name='col'):
    """Return the number of elements of the nested cell ``row[col_name]``."""
    cell = row[col_name]
    return cell.size

def format_class_col(datan):
    """Replace the nested ``serum_conc_percent`` column by an integer ``class``.

    For every instance the maximum of its nested ``serum_conc_percent`` series
    is divided by 10 and truncated to an integer, so a concentration of 10
    becomes class 1 and anything below 10 (e.g. 0.3) becomes class 0.

    Parameters
    ----------
    datan : pandas.DataFrame
        Nested frame (one row per instance, each cell a series) that contains
        a ``serum_conc_percent`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``class`` appended and ``serum_conc_percent`` removed.
        The input frame is not modified.
    """
    # Work on the column directly instead of a row-wise apply: same values,
    # and it also behaves sensibly for a frame without rows.
    serum_max = datan['serum_conc_percent'].map(lambda series: series.max())
    labels = (serum_max / 10).astype('int')

    labelled = datan.assign(**{'class': labels})
    return labelled.drop(columns=['serum_conc_percent'])

def add_nframes_col(datan):
    """Add an ``nframes`` column holding the length of each nested series.

    The length is taken from the nested ``x_norm`` cell of every instance.

    Parameters
    ----------
    datan : pandas.DataFrame
        Nested frame (one row per instance) that contains an ``x_norm`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``nframes`` column appended. The input frame is
        not modified.
    """
    frame_counts = datan['x_norm'].map(lambda series: series.size)
    return datan.assign(nframes=frame_counts)

#datan = from_multi_index_to_nested(datam, instance_index='fp')
#datan = add_nframes_col(datan)
#datan = format_class_col(datan)


In [8]:
#datan['class'].unique()
#datan.head(2)
#datan.groupby('nframes').count()

In [9]:
def fix_unequal_frame_counts(datan, nframes=30):
    """Drop every series whose frame count differs from ``nframes``.

    Parameters
    ----------
    datan : pandas.DataFrame
        Frame with an ``nframes`` column (see ``add_nframes_col``).
    nframes : int, optional
        Required number of frames per series; defaults to 30.

    Returns
    -------
    pandas.DataFrame
        The rows of ``datan`` whose ``nframes`` equals the required count.
    """
    has_expected_length = datan['nframes'] == nframes
    return datan[has_expected_length]
    
#datan = fix_unequal_frame_counts(datan)
#datan.groupby('nframes').count()

def separate_observations_and_classes(datan):
    """Split a nested frame into an observation array and a class vector.

    Returns ``(X, y)``: ``y`` holds the values of the ``class`` column and
    ``X`` is the result of ``from_nested_to_3d_numpy`` applied to the frame
    without its ``class`` and ``nframes`` columns.
    """
    # class vector first ...
    labels = datan['class'].values

    # ... then the observations, stripped of the bookkeeping columns
    series_only = datan.drop(columns=['class', 'nframes'])
    observations = from_nested_to_3d_numpy(series_only)

    return observations, labels

#X,y = separate_observations_and_classes(datan)


In [10]:
#print(X.shape)
#print(y.shape)
#print(Y.shape)
#print(Y[:15,:])

In [11]:
import keras
import sklearn
from sklearn.model_selection import train_test_split

def prepare_data(X, y, test_size=0.33, random_state=43):
    """Split observations and labels, and one-hot encode the labels.

    Parameters
    ----------
    X : numpy.ndarray
        Observations, one instance per entry along the first axis.
    y : numpy.ndarray
        Integer class label per instance.
    test_size : float, optional
        Fraction of the instances put into the held-out split.
    random_state : int, optional
        Seed handed to ``train_test_split``; the default reproduces the
        previously hard-coded value.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test, y_true, nb_classes,
        y_true_train, enc)`` where ``y_train`` / ``y_test`` are one-hot
        encoded, ``y_true`` / ``y_true_train`` are the original integer labels
        of the held-out / training split, ``nb_classes`` is the number of
        distinct labels and ``enc`` is the fitted ``OneHotEncoder``.
    """
    # Import the encoder explicitly: a bare `import sklearn` does not guarantee
    # that the `sklearn.preprocessing` submodule has been loaded.
    from sklearn.preprocessing import OneHotEncoder

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    all_labels = np.concatenate((y_train, y_test), axis=0)
    nb_classes = len(np.unique(all_labels))

    # keep the original integer labels; y_train / y_test become one-hot below
    y_true = y_test.astype(np.int64)
    y_true_train = y_train.astype(np.int64)

    # transform the labels from integers to one-hot vectors
    enc = OneHotEncoder()
    enc.fit(all_labels.reshape(-1, 1))
    y_train = enc.transform(y_train.reshape(-1, 1)).toarray()
    y_test = enc.transform(y_test.reshape(-1, 1)).toarray()

    if len(x_train.shape) == 2:  # if univariate
        # add a trailing axis so the data is multivariate with one dimension
        x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
        x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))

    return x_train, y_train, x_test, y_test, y_true, nb_classes, y_true_train, enc

# Disabled helper code, kept as a bare string literal so that it is never executed.
# Because the string is the last expression of the cell, the notebook echoes it
# back as the cell output.
'''
def fit_classifier():
    input_shape = x_train.shape[1:]

    classifier = create_classifier(classifier_name, input_shape, nb_classes,
                                   output_directory)

    classifier.fit(x_train, y_train, x_test, y_test, y_true)


def create_classifier(classifier_name, input_shape, nb_classes, output_directory,
                      verbose=False, build=True):
    if classifier_name == 'nne':
        from classifiers import nne
        return nne.Classifier_NNE(output_directory, input_shape,
                                  nb_classes, verbose)
    if classifier_name == 'inception':
        from classifiers import inception
        return inception.Classifier_INCEPTION(output_directory, input_shape, nb_classes, verbose,
                                              build=build)
'''





"\ndef fit_classifier():\n    input_shape = x_train.shape[1:]\n\n    classifier = create_classifier(classifier_name, input_shape, nb_classes,\n                                   output_directory)\n\n    classifier.fit(x_train, y_train, x_test, y_test, y_true)\n\n\ndef create_classifier(classifier_name, input_shape, nb_classes, output_directory,\n                      verbose=False, build=True):\n    if classifier_name == 'nne':\n        from classifiers import nne\n        return nne.Classifier_NNE(output_directory, input_shape,\n                                  nb_classes, verbose)\n    if classifier_name == 'inception':\n        from classifiers import inception\n        return inception.Classifier_INCEPTION(output_directory, input_shape, nb_classes, verbose,\n                                              build=build)\n"

In [12]:
def pipeline_xy(data):
    """Turn a raw tracking table into observations ``X`` and labels ``y``.

    Runs, in order: experimental filtering, per-track coordinate shifting,
    creation of the ``fp`` instance id, conversion to a (fp, frame)
    multi-index, nesting per instance, class derivation, frame counting,
    removal of series with an unexpected length and the final split into the
    observation array and the class vector.
    """
    # long-format (one row per frame) preparation
    frames = create_instance_index(normalize_xy(initial_filtering(data)))
    indexed = data_to_multi_index(frames)

    # one row per track from here on
    nested = from_multi_index_to_nested(indexed, instance_index='fp')
    nested = format_class_col(nested)
    nested = add_nframes_col(nested)
    nested = fix_unequal_frame_counts(nested)

    observations, labels = separate_observations_and_classes(nested)
    return observations, labels




In [13]:
# Read both CSVs again (they were already loaded in the first data cell) and run
# each through the full preprocessing pipeline.
data = pd.read_csv(data_file)
X, y = pipeline_xy(data)

# The second file is processed the same way and kept apart for classifier.predict.
test_data = pd.read_csv(test_data_file)
test_X, test_y = pipeline_xy(test_data)


In [14]:
#self_test_data = pd.read_csv(data_file)
#self_test_X, self_test_y = pipeline_xy(self_test_data)


In [15]:
# Hold out 15% of X/y; the held-out part is handed to classifier.fit as x_test/y_test.
test_size = 0.15
x_train, y_train, x_test, y_test, y_true, nb_classes, y_true_train, enc = prepare_data(X,y,test_size=test_size)

# Directory passed to the classifier; history.csv is read back from it further down.
output_directory = ROOT + "/My Drive/Work/InceptionTime/test5/"
#output_directory = ROOT + "/My Drive/Work/InceptionTime/test2"
# Shape of a single instance, i.e. X without its leading axis.
input_shape = X.shape[1:]
nb_epochs = 250
verbose = False

# Classifier_INCEPTION comes from the local InceptionTime checkout added to sys.path above.
classifier = inception.Classifier_INCEPTION(output_directory, input_shape, nb_classes, \
                                            nb_epochs=nb_epochs, verbose=verbose)



In [16]:
# Fit on the training split; the held-out split and its integer labels (y_true) are
# passed along as well.
# NOTE(review): plot_test_acc is defined by the local InceptionTime checkout —
# presumably it plots held-out accuracy during training; confirm there.
classifier.fit(x_train, y_train, x_test, y_test, y_true, plot_test_acc=True)

# Marker printed once fitting has returned.
print('\t\t\t\tDONE')


findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


				DONE


In [17]:
# Evaluate on the separate test file and return the metrics as a table.
# NOTE(review): the three None arguments fill positional parameters of the
# project's predict() whose meaning is not visible in this notebook — confirm there.
classifier.predict(test_X, test_y, None, None, None, return_df_metrics=True)

Unnamed: 0,precision,accuracy,recall,duration
0,0.85753,0.858131,0.858252,0.0


In [18]:
#classifier.predict(self_test_X, self_test_y, None, None, None, return_df_metrics=True)

In [19]:
# Load the training history from the classifier's output directory and preview it.
history = pd.read_csv(output_directory + "history.csv")
history.head()

Unnamed: 0,loss,accuracy,val_loss,val_accuracy,lr
0,0.90393,0.481633,0.670119,0.568182,0.001
1,0.685488,0.567347,0.688447,0.568182,0.001
2,0.639732,0.583673,0.664651,0.545455,0.001
3,0.603579,0.669388,0.654073,0.545455,0.001
4,0.610163,0.62449,0.683683,0.522727,0.001


In [21]:
# Number of rows (non-null `file` entries) per serum concentration in the table
# read from data_file.
data.groupby(by=['serum_conc_percent'])['file'].count()

serum_conc_percent
0.3     29237
10.0    21764
Name: file, dtype: int64

In [22]:
# Shape check of the test observations produced by pipeline_xy.
test_X.shape

(289, 3, 30)