# Calculate Observed Transition Probabilities
In the following notebook, we calculate transition probabilities between the calls that make up combinations. Probabilities are calculated from forward transitions (i.e. the likelihood that one call transitions to the next, which can be from either the same or a different class).

We use code adapted from [Sainburg et al. (2020)](https://doi.org/10.1371/journal.pcbi.1008228) and [github.com/timsainb/avgn_paper](https://github.com/timsainb/avgn_paper) to produce the figure for these transitions, which are displayed in comparison with the position of the call class in UMAP latent space.

This notebook uses Python v3.8.13 (compared to v3.6 used in earlier notebooks).

In [1]:
from tqdm.auto import tqdm

In [2]:
import avgn

In [3]:
import pandas as pd
import numpy as np
from avgn.utils.paths import DATA_DIR, ensure_dir, FIGURE_DIR

In [5]:
# Dataset sub-folder of DATA_DIR from which the call table is loaded
DATASET_ID = "git_repos_call"

In [6]:
# Sub-folder of the dataset that holds call_umap_grouped.pickle
# NOTE(review): looks like the date-time stamp of the run that produced the file - confirm
DT_ID = '2022-03-12_17-46-00'

In [7]:
# Load the pickled call table for this dataset / run and preview its first three rows.
# Note: pickle files can execute code when loaded, so only read files produced by this project.
call_pickle_loc = DATA_DIR / DATASET_ID / DT_ID / 'call_umap_grouped.pickle'
call_df = pd.read_pickle(call_pickle_loc)
call_df[:3]

Unnamed: 0_level_0,start_time,end_time,labels,indv,indvi,filename,group,location,sex,wav_loc,...,comb_labels,umap,spectrogram,call_lab_simp,combi_lab_simp,call_unique_num,call_pos_combi,combi_label,combi_unique_num,simp
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.753604,0.92116,DSSHDS,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,...,DSSHDS,"[5.7770762, 7.997407]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS-SH-DS,DS-SH-DS SH-LH,0,0,DSSHDS SHSHLH,0,Other Calls
1,0.932017,1.36713,SHSHLH,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,...,SHSHLH,"[1.8122675, 5.094298]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH-LH,DS-SH-DS SH-LH,1,1,DSSHDS SHSHLH,0,Contains LH Segment
2,1.218085,1.308841,DS,MGGY,0,BWY MGGY Call Combo 1 300719 AM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,...,DS,"[9.353501, 10.055656]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS SH-LH,2,0,DS USC,1,Other Calls


In [8]:
# Number of rows in the loaded call table
len(call_df)

561

### Change call class labels to unique number identifier

In [9]:
# Give every call class name in 'simp' a numeric id: the classes are taken in the sorted
# order returned by np.unique and numbered 0, 1, 2, ...
simp_classes = np.unique(call_df['simp'].values)
label_dict = dict(zip(simp_classes, range(len(simp_classes))))
# Store the numeric id of each call's class in a new 'simp_num' column
call_df['simp_num'] = [label_dict[simp_name] for simp_name in call_df.simp.values]
call_df[:3]

Unnamed: 0_level_0,start_time,end_time,labels,indv,indvi,filename,group,location,sex,wav_loc,...,umap,spectrogram,call_lab_simp,combi_lab_simp,call_unique_num,call_pos_combi,combi_label,combi_unique_num,simp,simp_num
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.753604,0.92116,DSSHDS,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,...,"[5.7770762, 7.997407]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS-SH-DS,DS-SH-DS SH-LH,0,0,DSSHDS SHSHLH,0,Other Calls,2
1,0.932017,1.36713,SHSHLH,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,...,"[1.8122675, 5.094298]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH-LH,DS-SH-DS SH-LH,1,1,DSSHDS SHSHLH,0,Contains LH Segment,0
2,1.218085,1.308841,DS,MGGY,0,BWY MGGY Call Combo 1 300719 AM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,...,"[9.353501, 10.055656]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS SH-LH,2,0,DS USC,1,Other Calls,2


### Describe Transition Probability Data
- for transition between calls in combinations

In [10]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed
import umap
import pandas as pd

In [11]:
# Sorted numeric ids of the call classes present in call_df
unique_states = np.unique(call_df['simp_num'])
# Map each numeric class id to its position within unique_states
lab_dict = dict(zip(unique_states, range(len(unique_states))))

In [12]:
# Call class names in sorted order - the same order in which label_dict numbers them
np.unique(call_df['simp'])

array(['Contains LH Segment', 'NL Segment Alone', 'Other Calls'],
      dtype=object)

In [14]:
# Class name -> numeric id (held in label_dict): Contains LH Segment: 0, NL Segment Alone: 1, Other Calls: 2
# lab_dict maps each numeric id to its position in unique_states, which is an identity mapping here
lab_dict

{0: 0, 1: 1, 2: 2}

In [16]:
# Build one list per combination holding the numeric class ids of its calls.
# Combinations are taken in order of first appearance of combi_unique_num, and the calls
# within each combination keep their row order from call_df.
combis = []
for combiid in call_df.combi_unique_num.unique():
    in_this_combi = call_df.combi_unique_num == combiid
    combis.append(list(call_df.loc[in_this_combi, "simp_num"].values))
print(combis[:3])

[[2, 0], [2, 0], [2, 0]]


In [17]:
# First combination in call_df: [Other Calls] followed by [Contains LH Segment], written with numeric ids as 2 then 0
combis[0]

[2, 0]

In [18]:
# Starting probabilities: the share of combinations whose first call belongs to each class
# (shown in order 'Contains LH Segment', 'NL Segment Alone', 'Other Calls')
call_starts = np.array([combi[0] for combi in combis])
n_combis = len(combis)
start_probs = np.array(
    [np.sum(call_starts == state) / n_combis for state in unique_states]
)
start_probs

array([0.       , 0.1036036, 0.8963964])

In [19]:
# End probabilities: for each class, the number of combinations that end on that class
# divided by (total number of calls of that class + 1)
# (shown in order 'Contains LH Segment', 'NL Segment Alone', 'Other Calls')
# NOTE(review): the + 1 keeps the denominator nonzero, but it also makes every value
# slightly smaller than the plain proportion - confirm this is intended
end_states = np.array([combi[-1] for combi in combis])
all_calls = np.concatenate(combis)  # every call of every combination, as numeric class ids
end_probs = np.array(
    [
        np.sum(end_states == state) / (np.sum(all_calls == state) + 1)
        for state in np.arange(len(unique_states))
    ]
)
end_probs

array([0.96568627, 0.11111111, 0.06606607])

In [20]:
# transition probs - shown in order 'Contains LH Segment', 'NL Segment Alone', 'Other Calls'
# Count every forward transition: trans_counts[i, j] is the number of times a call of
# class i is immediately followed by a call of class j within the same combination.
n_states = len(unique_states)
trans_counts = np.zeros((n_states, n_states))
for call in combis:
    for i, j in zip(call[:-1], call[1:]):
        trans_counts[i, j] += 1
# Normalise each row by its total, so a row holds the probabilities of the next call class
# given the current one. No smoothing is applied: transitions that were never observed stay 0.
# A class that is never followed by another call has a row total of 0; its row is left as
# zeros instead of dividing 0 by 0 (which would give NaN and a RuntimeWarning).
row_totals = trans_counts.sum(axis=1, keepdims=True)
trans_mat = np.divide(
    trans_counts, row_totals, out=np.zeros_like(trans_counts), where=row_totals > 0
)
trans_mat

array([[0.33333333, 0.        , 0.66666667],
       [0.        , 0.        , 1.        ],
       [0.6483871 , 0.00967742, 0.34193548]])

In [21]:
# Wrap the transition matrix in a dataframe: the row index is the class of the current call
# and the column is the class of the call that follows it
trans_df = pd.DataFrame(trans_mat)
trans_df

Unnamed: 0,0,1,2
0,0.333333,0.0,0.666667
1,0.0,0.0,1.0
2,0.648387,0.009677,0.341935


In [22]:
# Flatten the transition matrix row by row into a single-row dataframe.
# Each column is named "<from>-<to>" after the (row, column) position its value came from.
# The names are generated from the matrix shape, so they stay correct if the number of
# call classes changes (for three classes this gives "0-0", "0-1", ..., "2-2").
flat_mat = trans_mat.flatten()
transition_names = [
    f"{from_state}-{to_state}"
    for from_state in range(trans_mat.shape[0])
    for to_state in range(trans_mat.shape[1])
]
df = pd.DataFrame([flat_mat], columns=transition_names)
df

Unnamed: 0,0-0,0-1,0-2,1-0,1-1,1-2,2-0,2-1,2-2
0,0.333333,0.0,0.666667,0.0,0.0,1.0,0.648387,0.009677,0.341935


In [25]:
# Save the single-row dataframe of observed transition probabilities as a pickle
out_dir = DATA_DIR / 'manuscript' / 'Monte_Carlo_Calls'
save_loc = out_dir / 'observed_call_transitions.pickle'
# presumably creates the target folder if it does not exist yet - verify against avgn.utils.paths
ensure_dir(save_loc.as_posix())
df.to_pickle(save_loc)