In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import os
# read in data
from utils import get_matrix_features, generate_all_permutations, flatten_3d_array, get_custom_matrix_features
# Absolute locations of the data folder and its 'colonies' sub-folder,
# resolved against the current working directory.
path_to_data = os.path.abspath('data')
path_to_colonies = os.path.join(path_to_data, 'colonies')


In [2]:
# Load the candidate-feature tables and the ground-truth lineages, tagging
# every lineage row with the colony it belongs to.
threshold_mode = 'count'
num_nn = 4
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that file exists; consider pointing it at a location relative to the repo.
colony0_lineage_GT_path = '/home/farzaneh/Documents/Bread/bread/src/bread/tests/data/V2022_09_19_HTB2_mCh_MYO1-GFP_50_ms/FOV0_lineage_T0_to_T146.csv'

colony0_features = pd.read_csv(os.path.join(
    path_to_data, 'colony0_candidate_features_{}_{}_nn.csv'.format(threshold_mode, num_nn)))
colony0_lineage_gt = pd.read_csv(colony0_lineage_GT_path).rename(
    columns={'parent_id': 'parent_GT'})
# A scalar is broadcast to every row; no need to build a list of zeros.
colony0_lineage_gt['colony'] = 0


# other colonies
colony_ids = [1, 2, 3, 4, 5]

# Collect the frames and concatenate once. Growing a DataFrame with pd.concat
# inside the loop re-copies all earlier rows on every iteration, and
# concatenating onto an empty DataFrame raises a FutureWarning in recent pandas.
lineage_frames = []
for i in colony_ids:
    temp_colony = pd.read_csv(os.path.join(
        path_to_colonies, 'colony00{}_lineage.csv'.format(i)))
    temp_colony['colony'] = i
    lineage_frames.append(temp_colony)
# The header of these CSVs starts with '# ', hence the odd source column name.
colonies_gt = pd.concat(lineage_frames, ignore_index=True).rename(
    columns={'# parent_id': 'parent_GT'})

feature_frames = []
for i in colony_ids:
    feature_frames.append(pd.read_csv(os.path.join(
        path_to_colonies, 'colony00{}_candidate_features_{}_{}_nn.csv'.format(i, threshold_mode, num_nn))))
# The previous loop prepended each newly read colony, so rows were ordered
# from colony 5 down to colony 1; reverse the list to keep that row order.
colonies_features = pd.concat(feature_frames[::-1], ignore_index=True)

In [3]:
def get_age_related_features(candidate_features, lineage_gt):
    """Add 'age' and 'last_division_age' columns to ``candidate_features``.

    Each row's candidate cell (``candid_id``) is looked up in ``lineage_gt``
    within the same colony:

    * ``age`` is ``time_id`` minus the ``time_index`` of the first lineage
      row in which the candidate appears as ``bud_id`` (its birth record).
    * ``last_division_age`` is ``time_id`` minus the latest ``time_index``
      strictly before ``time_id`` at which the candidate is listed as
      ``parent_GT``. If there is no such row, the birth time is used instead,
      so the value equals ``age``.

    Both values are expressed in the units of ``time_id`` / ``time_index``;
    no scaling is applied here.

    Args:
        candidate_features: DataFrame with at least the columns 'candid_id',
            'colony' and 'time_id'. It is modified in place.
        lineage_gt: DataFrame with the columns 'bud_id', 'parent_GT',
            'colony' and 'time_index'.

    Returns:
        The same ``candidate_features`` object, with the two columns added
        (or overwritten if they already existed).

    Raises:
        IndexError: if a candidate has no birth record in ``lineage_gt`` for
            its colony.
    """
    ages = []
    last_division_ages = []

    for _, row in candidate_features.iterrows():
        colony_id = row['colony']
        time_id = row['time_id']
        candidate_id = row['candid_id']

        # Both lookups below are restricted to the row's colony; build the
        # mask once per row instead of once per lookup.
        in_colony = lineage_gt['colony'] == colony_id

        birth_records = lineage_gt.loc[
            (lineage_gt['bud_id'] == candidate_id) & in_colony, 'time_index'].values
        if birth_records.shape[0] == 0:
            # Same exception type as indexing an empty array, but the message
            # says which candidate is missing.
            raise IndexError(
                'candidate {} has no birth record (bud_id) in lineage_gt for colony {}'.format(
                    candidate_id, colony_id))
        candidate_birth_time = birth_records[0]

        candidate_age = time_id - candidate_birth_time

        # Budding events of this candidate that happened strictly before time_id.
        candidate_divisions = lineage_gt.loc[
            (lineage_gt['parent_GT'] == candidate_id) & in_colony & (
                    lineage_gt['time_index'] < time_id), "time_index"].values

        if candidate_divisions.shape[0] == 0:
            # Never divided so far: measure from birth.
            candidate_last_division = candidate_birth_time
        else:
            candidate_last_division = np.max(candidate_divisions)

        ages.append(candidate_age)
        last_division_ages.append(time_id - candidate_last_division)

    candidate_features['age'] = ages
    candidate_features['last_division_age'] = last_division_ages
    return candidate_features

In [4]:
# get_age_related_features returns differences of time indices; scale them by
# the frame length. Named instead of a bare 5 so it matches the `frame_length`
# used for the external subset later in the notebook.
frame_length = 5

colonies_features = get_age_related_features(colonies_features, colonies_gt)
colonies_features['age'] = colonies_features['age'] * frame_length
colonies_features['last_division_age'] = colonies_features['last_division_age'] * frame_length

colony0_features = get_age_related_features(colony0_features, colony0_lineage_gt)
colony0_features['age'] = colony0_features['age'] * frame_length
colony0_features['last_division_age'] = colony0_features['last_division_age'] * frame_length

In [5]:
# feature1 .. feature10 followed by the two age-derived columns.
feature_list = ['feature{}'.format(k) for k in range(1, 11)] + ['age', 'last_division_age']

# Build the matrix features for colony 0 and for colonies 1-5 separately.
colony0_matrix_features = get_custom_matrix_features(
    colony0_features,
    colony0_lineage_gt,
    feature_list,
)
colonies_matrix_features = get_custom_matrix_features(
    colonies_features,
    colonies_gt,
    feature_list,
).reset_index(drop=True)

# Stack colony 0 on top of colonies 1-5 into a single frame with a fresh index.
all_matrix_features = pd.concat(
    [colony0_matrix_features, colonies_matrix_features],
    ignore_index=True,
)
all_matrix_features  # bare expression that is not the last line of the cell

# Shape of the feature matrix stored at index label 1.
colonies_matrix_features['features'][1].shape

(4, 12)

# Train NN for more than 10 features


In [6]:
from nn import BudDataset, LineageNN, train_nn, test_nn, cv_nn


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Cross-validate the NN on the combined matrix features of all colonies.
config = {
    'epoch_n': 100,
    'patience': 10,
    'lr': 0.01,
    'batch_size': 256,
    'layers': [48, 64, 5],
    'augment': True,
    # NOTE(review): the file name says [40,64,5] while 'layers' is [48, 64, 5] -- confirm this is intended.
    'save_path': 'bst_nn_01_256_[40,64,5].pth',
}
models, accuracies = cv_nn(all_matrix_features, config=config)
print('accuracy for all with 10 + age + division_age : ', np.mean(accuracies), '+/-', np.std(accuracies))

In [None]:
# Pick the best of the cross-validated models (12 features) and save its weights.
# The scores here are computed on the same data the models were trained on.
import torch

test_accuracies = []
best_accuracy = 0
best_model = models[0]
for model in models:
    _, accuracy = test_nn(model, all_matrix_features)
    test_accuracies.append(accuracy)
    if not accuracy > best_accuracy:
        continue
    # Strictly better than anything seen so far: remember this model.
    best_accuracy = accuracy
    best_model = model
    print('new best accuracy: ', accuracy)

# Persist only the weights of the winning model.
torch.save(best_model.state_dict(), 'bst_nn_all_4frames_12features_[48,64,5].pth')
print('average test accuracy on same data (all):', np.mean(test_accuracies), '+/-', np.std(test_accuracies))
print("This model has been saved: ", best_accuracy)

In [None]:
# feature_list = ['feature1', 'feature2', 'feature3', 'feature4', 'feature5',
#                              'feature6', 'feature7', 'feature8', 'feature9', 'feature10' , 'age', 
#                              'last_division_age']
# # get matrix features
# colony0_matrix_features = get_custom_matrix_features(
#     colony0_features, colony0_lineage_gt, feature_list, filling_features = [-1 for i in range(12)])
# colonies_matrix_features = get_custom_matrix_features(colonies_features, colonies_gt, feature_list, filling_features = [-1 for i in range(12)]).reset_index(drop=True)
# # read in data for colonies 1 to 5 and colony 0 and combine their matrices

# # combine all data to make a single dataframe for all colonies
# all_matrix_features = pd.concat(
#     [colony0_matrix_features, colonies_matrix_features]).reset_index(drop=True)
# all_matrix_features

# colonies_matrix_features['features'][1].shape
# config = {'epoch_n': 100, 'patience': 10, 'lr': 0.01, 'batch_size': 256, 'layers': [48, 64, 5], 'augment': True, 'save_path': 'bst_nn_01_256_[40,64,5].pth'}
# models, accuracies = cv_nn(all_matrix_features, config=config)
# print('accuracy for all with 10 + age + division_age : ', np.mean(accuracies), '+/-', np.std(accuracies))

## Train on colonies and test on colony 0


In [None]:
# Train on colonies 1-5 and test on colony 0: every model returned by cv_nn
# is scored on the colony-0 matrix features.
train_df = colonies_matrix_features
test_df = colony0_matrix_features
config = {
    'epoch_n': 100,
    'patience': 10,
    'lr': 0.01,
    'batch_size': 256,
    'layers': [48, 64, 5],
    'augment': True,
    'save_path': 'bst_nn_01_256_[40,64,5].pth',
}
models, accuracies = cv_nn(train_df, config=config)

test_accuracies = []
for model in models:
    _, accuracy = test_nn(model, test_df)
    test_accuracies.append(accuracy)

print('train on colony 1-5 and test on colony0\n ', 'accuracy: ', np.mean(test_accuracies), '+/-', np.std(test_accuracies))


In [None]:
# NN with the ten base features plus 'age' (11 columns).
feature_list_age = ['feature{}'.format(k) for k in range(1, 11)] + ['age']

# Matrix features for colony 0 and for colonies 1-5 with this column set.
colony0_matrix_features_age = get_custom_matrix_features(
    colony0_features,
    colony0_lineage_gt,
    feature_list_age,
)
colonies_matrix_features_age = get_custom_matrix_features(
    colonies_features,
    colonies_gt,
    feature_list_age,
)

# Colonies 1-5 first, colony 0 last, with a fresh index.
all_matrix_features_age = pd.concat(
    [colonies_matrix_features_age, colony0_matrix_features_age],
    ignore_index=True,
)
config = {
    'epoch_n': 100,
    'patience': 10,
    'lr': 0.01,
    'batch_size': 256,
    'layers': [44, 64, 5],
    'augment': True,
    'save_path': 'bst_nn_01_256_[40,64,5].pth',
}
models, accuracies = cv_nn(all_matrix_features_age, config=config)
print('accuracy nn with 10+age as features: ', np.mean(accuracies), '+/-', np.std(accuracies))

In [None]:
# Sanity check: shape of the feature matrix stored at index label 5 of the
# 10 + age feature set.
all_matrix_features_age['features'][5].shape


## train and test for 11 features and save best model


In [None]:
# NN with the ten base features plus 'last_division_age' (11 columns).
feature_list_last_division = ['feature{}'.format(k) for k in range(1, 11)] + ['last_division_age']

# Matrix features for colony 0 and for colonies 1-5 with this column set.
colony0_matrix_features_last_division = get_custom_matrix_features(
    colony0_features,
    colony0_lineage_gt,
    feature_list_last_division,
)
colonies_matrix_features_last_division = get_custom_matrix_features(
    colonies_features,
    colonies_gt,
    feature_list_last_division,
)

# Colonies 1-5 first, colony 0 last, with a fresh index.
all_matrix_features_last_division = pd.concat(
    [colonies_matrix_features_last_division, colony0_matrix_features_last_division],
    ignore_index=True,
)
config = {
    'epoch_n': 100,
    'patience': 10,
    'lr': 0.01,
    'batch_size': 256,
    'layers': [44, 64, 5],
    'augment': True,
    'save_path': 'bst_nn_01_256_[40,64,5].pth',
}
models, accuracies = cv_nn(all_matrix_features_last_division, config=config)
print('accuracy for nn with 10+last_division: ', np.mean(accuracies), '+/-', np.std(accuracies))


In [None]:

# Pick the best of the cross-validated models (11 features) and save its weights.
# The scores here are computed on the same data the models were trained on.
import torch

test_accuracies = []
best_accuracy = 0
best_model = models[0]
for model in models:
    _, accuracy = test_nn(model, all_matrix_features_last_division)
    test_accuracies.append(accuracy)
    if not accuracy > best_accuracy:
        continue
    # Strictly better than anything seen so far: remember this model.
    best_accuracy = accuracy
    best_model = model
    print('new best accuracy: ', accuracy)

# Persist only the weights of the winning model.
torch.save(best_model.state_dict(), 'bst_nn_all_4frames_11features_[44,64,5].pth')
print('average test accuracy on same data (all):', np.mean(test_accuracies), '+/-', np.std(test_accuracies))
print("This model has been saved: ", best_accuracy)

# Train and test XGBoost with age and last_division_age


In [None]:
from model_xgboost import run_xgboost, cv_xgboost


In [None]:
# train and test for all colonies
# cv_xgboost receives the 'features' column and the 'parent_index_in_candidates'
# column as numpy arrays; its result is unpacked into a mean score, a standard
# deviation and a collection of models (presumably one per CV fold -- confirm
# in model_xgboost).
# NOTE(review): `all_matrix_features` is reassigned by the 10-feature XGBoost
# cell further down, so what this cell evaluates depends on execution order.
mean_score, std_score, models_all = cv_xgboost(all_matrix_features['features'].to_numpy(),
                                               all_matrix_features['parent_index_in_candidates'].to_numpy())
print('accuracy for all with 10 + age + division_age : ', mean_score, '+/-', std_score)

In [None]:
# XGBoost with the ten base features plus 'last_division_age'.
feature_list_last_division = ['feature{}'.format(k) for k in range(1, 11)] + ['last_division_age']

# NOTE(review): 12 fill values are passed although this list has 11 features --
# confirm how get_custom_matrix_features treats the extra entry.
colony0_matrix_features_last_division = get_custom_matrix_features(
    colony0_features,
    colony0_lineage_gt,
    feature_list_last_division,
    filling_features=[-1] * 12,
)
colonies_matrix_features_last_division = get_custom_matrix_features(
    colonies_features,
    colonies_gt,
    feature_list_last_division,
    filling_features=[-1] * 12,
)

# Colonies 1-5 first, colony 0 last, with a fresh index.
all_matrix_features_last_division = pd.concat(
    [colonies_matrix_features_last_division, colony0_matrix_features_last_division],
    ignore_index=True,
)
# NN settings; not passed to cv_xgboost below.
config = {
    'epoch_n': 100,
    'patience': 10,
    'lr': 0.01,
    'batch_size': 256,
    'layers': [44, 64, 5],
    'augment': True,
    'save_path': 'bst_nn_01_256_[40,64,5].pth',
}
mean_score, std_score, models_all = cv_xgboost(
    all_matrix_features_last_division['features'].to_numpy(),
    all_matrix_features_last_division['parent_index_in_candidates'].to_numpy(),
)
print('accuracy for all with 10+last_division : ', mean_score, '+/-', std_score)

In [None]:
# XGBoost with the ten base features plus 'age'.
# Fix: this list used to be assigned to `feature_list_last_division` (copy-paste
# slip), which overwrote the last-division feature list with the 'age' one and
# left the cell relying on `feature_list_age` from an earlier cell. The cell
# now defines the variable it actually uses.
feature_list_age = ['feature1', 'feature2', 'feature3', 'feature4', 'feature5',
                             'feature6', 'feature7', 'feature8', 'feature9', 'feature10' , 'age']
# get matrix features
# NOTE(review): unlike the other XGBoost cells, no `filling_features` is passed
# here -- confirm whether that difference is intended.
colony0_matrix_features_age = get_custom_matrix_features(
    colony0_features, colony0_lineage_gt, feature_list_age)
colonies_matrix_features_age = get_custom_matrix_features(colonies_features, colonies_gt, feature_list_age)

# Colonies 1-5 first, colony 0 last, with a fresh index.
all_matrix_features_age = pd.concat([colonies_matrix_features_age, colony0_matrix_features_age]).reset_index(drop=True)
# NN settings; not passed to cv_xgboost below. Kept so the notebook-level
# `config` variable still holds the same value after this cell.
config = {'epoch_n': 100, 'patience': 10, 'lr': 0.01, 'batch_size': 256, 'layers': [44, 64, 5], 'augment': True, 'save_path': 'bst_nn_01_256_[40,64,5].pth'}
mean_score, std_score, models_all = cv_xgboost(all_matrix_features_age['features'].to_numpy(),
                                               all_matrix_features_age['parent_index_in_candidates'].to_numpy())
print('accuracy for all with 10 + age : ', mean_score, '+/-', std_score)

In [None]:
# XGBoost with only the ten base features.
# NOTE(review): this cell reuses the names `feature_list`,
# `colony0_matrix_features`, `colonies_matrix_features` and
# `all_matrix_features` from the 12-feature cell near the top, so re-running
# earlier cells after this one picks up the 10-feature data.
feature_list = ['feature{}'.format(k) for k in range(1, 11)]

# NOTE(review): 12 fill values are passed although this list has 10 features --
# confirm how get_custom_matrix_features treats the extra entries.
colony0_matrix_features = get_custom_matrix_features(
    colony0_features,
    colony0_lineage_gt,
    feature_list,
    filling_features=[-1] * 12,
)
colonies_matrix_features = get_custom_matrix_features(
    colonies_features,
    colonies_gt,
    feature_list,
    filling_features=[-1] * 12,
)

# Colonies 1-5 first, colony 0 last, with a fresh index.
all_matrix_features = pd.concat(
    [colonies_matrix_features, colony0_matrix_features],
    ignore_index=True,
)
# NN settings; not passed to cv_xgboost below.
config = {
    'epoch_n': 100,
    'patience': 10,
    'lr': 0.01,
    'batch_size': 256,
    'layers': [44, 64, 5],
    'augment': True,
    'save_path': 'bst_nn_01_256_[40,64,5].pth',
}
mean_score, std_score, models_all = cv_xgboost(
    all_matrix_features['features'].to_numpy(),
    all_matrix_features['parent_index_in_candidates'].to_numpy(),
)
print('accuracy for all with 10: ', mean_score, '+/-', std_score)

# test on external subset


In [7]:
# Load the external subset (candidate features + edited lineage) and add the
# age-related columns, scaled by the frame length.
# NOTE(review): the paths below are absolute and machine-specific.
external_subset_lineage_GT_path_edited = '/home/farzaneh/Documents/TrackerTestDataset_SCerevisiae_7/subset/lineage/lineage_budlum_edited_by_code.csv'
# Defined here but not read anywhere in this cell.
external_subset_features_path = '/home/farzaneh/Documents/TrackerTestDataset_SCerevisiae_7/subset/features/features.csv'
threshold_mode = 'count'
num_nn = 4
path_to_features = '/home/farzaneh/Documents/TrackerTestDataset_SCerevisiae_7/subset/lineage/'

external_subset_features = pd.read_csv(
    os.path.join(
        path_to_features,
        'external_subset_candidate_features_{}_{}_nn.csv'.format(threshold_mode, num_nn),
    )
)
external_subset_lineage_gt = pd.read_csv(external_subset_lineage_GT_path_edited)

frame_length = 5
external_subset_features = get_age_related_features(
    external_subset_features, external_subset_lineage_gt)
# Convert both age columns from time indices by multiplying with frame_length.
external_subset_features['age'] *= frame_length
external_subset_features['last_division_age'] *= frame_length




In [None]:
# get matrix features for external subset with 11 features
feature_list = ['feature1', 'feature2', 'feature3', 'feature4', 'feature5',
                             'feature6', 'feature7', 'feature8', 'feature9', 'feature10' , 
                             'last_division_age']
# get matrix features
external_subset_matrix_features = get_custom_matrix_features(
    external_subset_features, external_subset_lineage_gt, feature_list)

# test external subset on the best model with 11 featuers
import torch
model_path = '/home/farzaneh/Documents/Bread/bread/src/bread/algo/lineage/saved_models/bst_nn_all_4frames_11features_[44,64,5].pth'
model = LineageNN([44, 64, 5])
model.load_state_dict(torch.load(model_path))
test_df, accuracy = test_nn(model, external_subset_matrix_features)
print ('accuracy on external subset with 11 features: ', accuracy)

test accuracy 0.8440677966101695


(     Unnamed: 0  parent_GT  bud_id  time_index  colony   
 3             3          1       4           6      10  \
 4             4          2       5           8      10   
 5             5          3       6           9      10   
 6             6          1       7          18      10   
 7             7          2       8          21      10   
 ..          ...        ...     ...         ...     ...   
 298         298         30     299         118      10   
 300         300        160     301         118      10   
 301         301         27     302         118      10   
 304         304         26     305         118      10   
 307         307         96     308         118      10   
 
                                               features           candidates   
 3    [[1.0, 4.0, 1.4643841598820833, 1.533490694990...        [1, 2, 3, -3]  \
 4    [[1.0, 2.8284271247461903, 1.8341048520519745,...         [2, 3, 1, 4]   
 5    [[2.0, 2.23606797749979, 0.9843862591629124,