# Train a NN for tracing lineage


In [2]:
import bread
import os
import pandas as pd
import numpy as np
path_to_data = os.path.abspath('data')
path_to_colonies = os.path.abspath('data/colonies')


In [8]:
# load the 5 colony ground truth
colonies_gt = pd.DataFrame()
for i in [1, 2, 3, 4, 5]:
    temp_colony = pd.read_csv(os.path.join(
        path_to_colonies, 'colony00{}_lineage.csv'.format(i)))
    temp_colony['colony'] = i
    colonies_gt = pd.concat([colonies_gt, temp_colony])
colonies_gt = colonies_gt.reset_index(drop=True)
colonies_gt.rename(columns={'# parent_id': 'parent_GT'}, inplace=True)
colonies_gt


Unnamed: 0,parent_GT,bud_id,time_index,colony
0,-1,1,0,1
1,-1,2,0,1
2,2,3,4,1
3,1,4,7,1
4,4,5,27,1
...,...,...,...,...
499,22,34,164,5
500,13,35,166,5
501,18,36,169,5
502,3,37,175,5


In [11]:
colonies_features = pd.DataFrame()

for i in [1, 2, 3, 4, 5]:
    temp_colony = pd.read_csv(os.path.join(
        path_to_colonies, 'colony00{}_candidate_features.csv'.format(i)))
    colonies_features = pd.concat([colonies_features, temp_colony])
colonies_features = colonies_features.reset_index(drop=True)
colonies_features


Unnamed: 0,bud_id,candid_id,time_id,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,colony
0,3.0,2.0,4.0,1.000000,2.828427,2.021374,3.756543,3.857483,0.466493,0.049453,0.084671,0.334073,1.144917,1
1,4.0,1.0,7.0,2.236068,2.236068,2.273459,5.357769,6.059181,0.110091,0.026505,1.431141,0.026439,1.525806,1
2,5.0,4.0,27.0,3.000000,3.000000,1.188829,2.604521,2.924109,0.276827,0.084856,1.288088,-0.062777,1.150381,1
3,6.0,1.0,27.0,2.000000,2.236068,1.579244,3.194189,2.956979,1.438397,0.077682,1.175315,0.141315,1.533269,1
4,7.0,1.0,28.0,4.123106,4.472136,1.198652,3.331497,3.895959,0.712322,0.046682,1.465445,-0.233344,0.914119,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,37.0,18.0,175.0,2.000000,2.236068,1.591169,3.207947,2.791266,1.563045,0.050753,1.133295,0.106932,1.514439,5
1047,37.0,26.0,175.0,2.000000,2.000000,2.487022,4.316390,3.097309,1.559877,0.554893,1.323078,-0.039808,1.250352,5
1048,38.0,9.0,178.0,3.000000,3.000000,0.773591,1.949580,2.181251,0.940659,0.366947,0.839169,0.147930,1.135029,5
1049,38.0,14.0,178.0,1.414214,2.828427,1.916193,3.590413,2.653363,1.181054,0.484141,1.471914,-0.068782,1.334349,5


In [12]:
def get_matrix_features(features_all, lineage_gt):
    # Generate np array of feature sets for each bud
    df1 = lineage_gt.copy()
    # remove the rows with parent_GT = -1 (no parent) and the rows with candid_GT = -2 (disappearing buds)
    df1 = df1.loc[df1.parent_GT != -1]
    df1 = df1.loc[df1.parent_GT != -2]
    df2 = features_all.copy()

    features_list = []
    parent_index_list = []
    candidate_list = []
    for bud, colony in df1[['bud_id', 'colony']].values:
        bud_data = df2.loc[(df2['bud_id'] == bud) & (df2['colony'] == colony)]
        candidates = bud_data['candid_id'].to_numpy()
        if candidates.shape[0] < 4:
            candidates = np.pad(
                candidates, ((0, 4 - candidates.shape[0])), mode='constant', constant_values=-3)
        features = bud_data[['feature1', 'feature2', 'feature3', 'feature4', 'feature5',
                             'feature6', 'feature7', 'feature8', 'feature9', 'feature10']].to_numpy()
        if features.shape[0] < 4:
            features = np.pad(features, ((
                0, 4 - features.shape[0]), (0, 0)), mode='constant', constant_values=-1)
        if features.shape[0] > 4:
            sorted_indices = np.argsort(features[:, 0])
            print('more than 4 candidates', bud, colony, candidates, sorted_indices, int(df1.loc[(df1['bud_id'] == bud) & (
                df1['colony'] == colony), 'parent_GT']))
            # slice the top 4 rows
            k = 4
            features = features[sorted_indices[:k]]
            candidates = candidates[sorted_indices[:k]]

        parent = int(df1.loc[(df1['bud_id'] == bud) & (
            df1['colony'] == colony), 'parent_GT'])
        # print(bud, colony, parent)
        # print(candidates)
        if(parent not in candidates):
            print('parent not in candidates', bud, colony, candidates, parent)
            # remove this from the df
            df1.drop(df1.loc[(df1['bud_id'] == bud) & (df1.colony == colony)].index,
                     inplace=True)
            continue
        else:
            parent_index = np.where(candidates == parent)[0][0]

        parent_index_list.append(parent_index)
        features_list.append(features)
        candidate_list.append(candidates)
    df1['features'] = features_list
    df1['candidates'] = candidate_list
    df1['parent_index_in_candidates'] = parent_index_list
    return df1


In [13]:
colonies_matrix_features = get_matrix_features(colonies_features, colonies_gt)


  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (


more than 4 candidates 69 3 [12. 19. 21. 42. 67.] [4 0 1 2 3] 12
more than 4 candidates 87 3 [18. 37. 39. 59. 71.] [1 2 0 4 3] 37
parent not in candidates 40 4 [-3. -3. -3. -3.] 24
parent not in candidates 68 4 [ 4.  8. 64. -3.] 39
parent not in candidates 105 4 [-3. -3. -3. -3.] 67


  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (
  parent = int(df1.loc[(df1['bud_id'] == bud) & (


In [14]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score


class BudDataset(Dataset):
    def __init__(self, data, augment=True):
        X = data['features'].to_numpy()
        labels = data['parent_index_in_candidates'].to_numpy()
        if(augment):
            X, labels = generate_all_permutations(X, labels)        
        X = flatten_3d_array(X)
        self.data = torch.tensor(X, dtype=torch.float32)
        self.labels = torch.zeros(len(labels), 5)  # initialize labels as zeros
        for i, label in enumerate(labels):
            if label != -1:
                # set the position of the correct parent to 1
                self.labels[i][label] = 1.0

    def __getitem__(self, index):
        data = self.data[index]
        return self.data[index], self.labels[index]

    def __len__(self):
        return len(self.labels)

# define the neural network architecture


class LineageNN(nn.Module):
    def __init__(self):
        super(LineageNN, self).__init__()
        self.fc1 = nn.Linear(40, 32)
        # self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 5)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        # x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def train_nn(train_df, eval_df, epoch_n=100,p=10, save_path='bst_nn.pth' ):
    # initialize neural network
    net = LineageNN()

    # define your loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.001)

    # create training dataset and data loader

    train_bud_dataset = BudDataset(train_df)
    train_bud_dataloader = DataLoader(train_bud_dataset, batch_size=16, shuffle=True)
    eval_bud_dataset = BudDataset(eval_df)
    eval_bud_dataloader = DataLoader(eval_bud_dataset, batch_size=16, shuffle=True)

    # train your neural network
    patient = 0
    for epoch in range(epoch_n):
        running_loss = 0.0
        best_accuracy = 0.0
        #training loop
        predicted_all = []
        labels_all = []
        for i, data in enumerate(train_bud_dataloader, 0):
            inputs, labels = data

            optimizer.zero_grad()

            # forward pass
            outputs = net(inputs)

            # calculate the loss
            loss = criterion(outputs, labels)
            _, labels = torch.max(labels.data, 1)
            _, predicted = torch.max(outputs.data, 1)
            predicted_all.extend(predicted)
            labels_all.extend(labels)    

            # backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        train_accuracy = accuracy_score(labels_all, predicted_all)
        # eval loop
        predicted_all = []
        labels_all = []
        for i, data in enumerate(eval_bud_dataloader, 0):
            inputs, labels = data
            net.eval()
            with torch.no_grad():
                outputs = net(inputs)
            _, labels = torch.max(labels.data, 1)
            _, predicted = torch.max(outputs.data, 1)
            predicted_all.extend(predicted)
            labels_all.extend(labels)       
        eval_accuracy = accuracy_score(labels_all, predicted_all)
        if(accuracy > best_accuracy):
            best_accuracy = accuracy
            torch.save(net.state_dict(), save_path)
            patient = 0
        else:
            patient+=1
        if(patient > p):
            print('early stopping at ', epoch)
            break
            # print('Epoch' , i, ' loss: ', running_loss / len(bud_dataloader))
        wandb_log = {'epoch': epoch, 'patience': patient , 'eval_accuracy': eval_accuracy, 'train_accuracy': train_accuracy, 'best_accuracy': best_accuracy}
    print('patient', patient)
    print('best accuracy', best_accuracy)
    return net

def test_nn(model, test_df):
    bud_dataset = BudDataset(test_df, augment=False)
    bud_dataloader = DataLoader(bud_dataset, batch_size=len(test_df), shuffle=False)
    for i, data in enumerate(bud_dataloader, 0):
        inputs, labels = data
        with torch.no_grad():
            outputs = model(inputs)
        _, labels = torch.max(labels.data, 1)
        _, predicted = torch.max(outputs.data, 1)
        print('predicted', predicted)
        print('labels', labels)
        accuracy = accuracy_score(predicted, labels)
        print('accuracy', accuracy )
        test_df['predicted'] = predicted
        return test_df, accuracy
            

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
import wandb
# import random

# # start a new wandb run to track this script
# wandb.init(
#     # set the wandb project where this run will be logged
#     project="lineage_tracing",
#     # track hyperparameters and run metadata
#     config={}
# )


ImportError: cannot import name 'KO_NAMES' from 'charset_normalizer.constant' (/home/farzaneh/anaconda3/envs/lineage_tracing/lib/python3.9/site-packages/charset_normalizer/constant.py)

In [13]:
train_colonies = colonies_matrix_features.loc[colonies_matrix_features['colony'] != 5]
test_colonies = colonies_matrix_features.loc[colonies_matrix_features['colony'] == 5]

model = train_nn(train_colonies, test_colonies, epoch_n=100, p=10)
results, accuracy = test_nn(model, test_colonies)


best accuracy 0.9016203703703703
predicted tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 1, 2, 0, 0, 0, 2, 0, 2])
labels tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 1, 2, 0, 0, 0, 2, 0, 2])
accuracy 0.9166666666666666


Unnamed: 0,parent_GT,bud_id,time_index,colony,features,candidates,parent_index_in_candidates
468,1,3,3,5,"[[1.0, 3.0, 2.1696624996956366, 4.386476727896...","[1.0, -3.0, -3.0, -3.0]",0
469,1,4,23,5,"[[3.0, 3.0, 0.8286378635107876, 2.068663973773...","[1.0, -3.0, -3.0, -3.0]",0
470,2,5,30,5,"[[1.0, 1.0, 1.6929497467978565, 3.561150806734...","[2.0, -3.0, -3.0, -3.0]",0
471,3,6,34,5,"[[2.0, 2.8284271247461903, 0.7581110688939684,...","[3.0, -3.0, -3.0, -3.0]",0
472,1,7,39,5,"[[2.23606797749979, 2.23606797749979, 2.376459...","[1.0, -3.0, -3.0, -3.0]",0
473,4,8,48,5,"[[1.4142135623730951, 2.23606797749979, 2.2155...","[4.0, -3.0, -3.0, -3.0]",0
474,3,9,63,5,"[[7.615773105863909, 7.810249675906654, 0.3230...","[1.0, 3.0, -3.0, -3.0]",1
475,4,10,74,5,"[[2.0, 2.0, 1.147140614871467, 2.3197811054523...","[4.0, 6.0, -3.0, -3.0]",0
476,8,11,88,5,"[[2.0, 2.23606797749979, 0.3650805478236136, 0...","[8.0, -3.0, -3.0, -3.0]",0
477,3,12,94,5,"[[2.23606797749979, 2.23606797749979, 1.241805...","[3.0, -3.0, -3.0, -3.0]",0


In [14]:
results, accuracy = test_nn(model, test_colonies)


predicted tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 1, 2, 0, 0, 0, 2, 0, 2])
labels tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 1, 2, 0, 0, 0, 2, 0, 2])
accuracy 0.9166666666666666


## Add FOV0 buds to colonies and train on all


In [15]:
import pandas as pd
colony0_segmentation_path = '/home/farzaneh/Documents/Bread/bread/src/bread/tests/data/V2022_09_19_HTB2_mCh_MYO1-GFP_50_ms/FOV0_segmentation_T0_to_T146_trimmed.h5'
colony0_lineage_GT_path = '/home/farzaneh/Documents/Bread/bread/src/bread/tests/data/V2022_09_19_HTB2_mCh_MYO1-GFP_50_ms/FOV0_lineage_T0_to_T146.csv'

colony0_lineage_gt = pd.read_csv(colony0_lineage_GT_path)
colony0_lineage_gt.rename(columns={'parent_id': 'parent_GT'}, inplace=True)


In [None]:
args = {"nn_threshold": 8.0, "flexible_nn_threshold": True,
        "num_frames_refractory": 0, "num_frames": 4, "bud_distance_max": 10.0}
colony0_features = extract_features(colony0_segmentation_path, args)
colony0_features['colony'] = [0 for i in range(len(colony0_features))]
colony0_lineage_gt = pd.read_csv(colony0_lineage_GT_path).rename(
    columns={'parent_id': 'parent_GT'})
colony0_lineage_gt['colony'] = [0 for i in range(len(colony0_lineage_gt))]
colony0_matrix_features = get_matrix_features(
    colony0_features, colony0_lineage_gt)


In [17]:
all_matrix_features = colony0_matrix_features.append(
    colonies_matrix_features).reset_index(drop=True)
all_matrix_features


Unnamed: 0,parent_GT,bud_id,time_index,colony,features,candidates,parent_index_in_candidates
0,1,5,18,0,"[[2.23606797749979, 2.8284271247461903, 2.3462...","[1.0, -3.0, -3.0, -3.0]",0
1,3,6,18,0,"[[1.0, 3.0, 3.1110787609318864, 6.903387316722...","[3.0, -3.0, -3.0, -3.0]",0
2,2,7,20,0,"[[1.0, 3.0, 0.9423480319763353, 2.690179222401...","[2.0, 3.0, 4.0, -3.0]",0
3,4,8,22,0,"[[1.0, 2.0, 2.1075425855503496, 4.077661350281...","[4.0, -3.0, -3.0, -3.0]",0
4,1,9,32,0,"[[2.0, 2.0, 1.758930190470974, 3.7266767166400...","[1.0, 5.0, -3.0, -3.0]",0
...,...,...,...,...,...,...,...
828,22,34,164,5,"[[2.0, 3.0, 1.180091771838302, 2.4426740817652...","[22.0, 25.0, -3.0, -3.0]",0
829,13,35,166,5,"[[2.0, 3.0, 0.8067932399673025, 1.901524565203...","[13.0, -3.0, -3.0, -3.0]",0
830,18,36,169,5,"[[6.324555320336759, 12.083045973594572, 0.209...","[2.0, 9.0, 18.0, -3.0]",2
831,3,37,175,5,"[[2.23606797749979, 2.23606797749979, 1.508481...","[3.0, 18.0, 26.0, -3.0]",0


In [None]:
# train and test on the combined data
test_df = all_matrix_features.sample(frac=0.2, random_state=4)
rest_df = all_matrix_features.drop(test_df.index)
eval_df = rest_df.sample(frac=0.1, random_state=4)
train_df = rest_df.drop(eval_df.index)

model = train_nn(train_df, eval_df, epoch_n=500, p=4,
                 save_path='bst_nn_all_colonies.pth')


In [30]:

model = LineageNN()
model.load_state_dict(torch.load('bst_nn_all_colonies.pth'))
model.eval()
results, accuracy = test_nn(model, test_df)


predicted tensor([2, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 2, 1, 3, 1, 1, 1, 0, 0, 0, 0, 0,
        2, 0, 0, 2, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 1, 0, 0, 2, 0, 1, 0, 1, 0, 0, 2,
        2, 0, 2, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2,
        3, 0, 1, 1, 0, 2, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 2,
        2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 1, 1, 0, 0,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, 1, 0, 1, 0, 0, 1])
labels tensor([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 2, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
        2, 0, 0, 2, 0, 0, 1, 0, 1, 2, 0, 2, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
        1, 0, 3, 0, 1, 0, 1, 1, 1, 2, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2,
        3, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2,
        2, 1, 0, 0, 0, 0,

In [None]:
# test different layers to see if they change the accuracy
