# Calculate Observed Transition Probabilities
In the following notebook, we calculate transition probabilities between the segments that make up calls. Probabilities are calculated from forward transitions (i.e. the likelihood of one segment being followed by another, which may belong to the same or a different segment class). 

We use code adapted from [Sainburg et al. (2020)](https://doi.org/10.1371/journal.pcbi.1008228) and [github.com/timsainb/avgn_paper](https://github.com/timsainb/avgn_paper) to produce the figure for these transitions, which are displayed in comparison to the position of each segment class in UMAP latent space.

This notebook uses Python v3.8.13 (compared to v3.6 used in earlier notebooks).

In [1]:
from tqdm.auto import tqdm

In [2]:
import avgn

In [3]:
import pandas as pd
import numpy as np
from avgn.utils.paths import DATA_DIR, ensure_dir, FIGURE_DIR

In [4]:
# Name of the dataset folder under DATA_DIR; used below to build both the input and the output paths.
DATASET_ID = "git_repos"

In [5]:
# Timestamped sub-folder of the dataset from which the segment pickle is loaded.
DT_ID = '2022-03-04_18-41-29'

In [6]:
# Load the per-segment dataframe (one row per segment, with UMAP coordinates
# and tidied combined labels) for the chosen dataset and run.
# NOTE: pickle files execute arbitrary code on load - only read trusted files.
segment_pickle = DATA_DIR / DATASET_ID / DT_ID / 'segment_df_umap_combinedtidied.pickle'
seg_df = pd.read_pickle(segment_pickle)
seg_df.head(3)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,location,sex,wav_loc,key,rate,specs,umap,comb_labels,call_lab_simp,combi_lab_simp
0,0.753604,0.776773,DS,0,0.753604,0.776773,DSSHDS,0.753604,0,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[14.15081, 3.406464]",DS,DS-SH-DS,DS-SH-DS SH-LH
1,0.786865,0.835165,SH,1,0.786865,0.835165,DSSHDS,0.753604,1,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[5.388335, 10.057652]",SH,DS-SH-DS,DS-SH-DS SH-LH
2,0.855941,0.92116,DS,2,0.855941,0.92116,DSSHDS,0.753604,2,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[12.421062, 0.6991728]",DS,DS-SH-DS,DS-SH-DS SH-LH


### Change segment labels to unique number identifier

In [7]:
# Give every distinct segment label an integer id. np.unique returns the
# labels sorted, so ids follow the sorted label order.
sorted_labels = np.unique(seg_df['comb_labels'].values)
label_dict = dict(zip(sorted_labels, range(len(sorted_labels))))
# Store each segment's integer id next to its string label.
seg_df['comb_labels_num'] = seg_df['comb_labels'].map(label_dict)
seg_df.head(3)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,sex,wav_loc,key,rate,specs,umap,comb_labels,call_lab_simp,combi_lab_simp,comb_labels_num
0,0.753604,0.776773,DS,0,0.753604,0.776773,DSSHDS,0.753604,0,0,...,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[14.15081, 3.406464]",DS,DS-SH-DS,DS-SH-DS SH-LH,0
1,0.786865,0.835165,SH,1,0.786865,0.835165,DSSHDS,0.753604,1,0,...,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[5.388335, 10.057652]",SH,DS-SH-DS,DS-SH-DS SH-LH,3
2,0.855941,0.92116,DS,2,0.855941,0.92116,DSSHDS,0.753604,2,0,...,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[12.421062, 0.6991728]",DS,DS-SH-DS,DS-SH-DS SH-LH,0


### Describe Transition Probability Data
- for transition between segments in calls

In [8]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed
import umap
import pandas as pd

In [9]:
# Sorted array of the distinct integer segment ids present in the data.
unique_states = np.unique(seg_df['comb_labels_num'])
# Map each segment id to its position in unique_states. Because the ids were
# built as 0..n-1, this is an identity mapping here.
lab_dict = dict(zip(unique_states, range(len(unique_states))))

In [10]:
# String labels in sorted order; each label's position here is its integer id.
np.unique(seg_df['comb_labels'])

array(['DS', 'LH', 'NL', 'SH'], dtype=object)

In [11]:
#DS: 0, LH:1, NL:2, SH:3
# lab_dict itself maps each integer segment id to its index in unique_states.
lab_dict

{0: 0, 1: 1, 2: 2, 3: 3}

In [12]:
# Build one list of integer segment ids per call, with calls in the order they
# first appear in seg_df. A single groupby pass replaces re-filtering the whole
# dataframe once per call id; sort=False keeps first-appearance order of the
# call ids and rows keep their original order inside each group.
# Rows with a missing call id are skipped by groupby (previously they produced
# an empty list, which breaks the start/end-state cells that index call[0]).
calls = [
    list(call_segments.values)
    for _, call_segments in seg_df.groupby("call_unique_num", sort=False)["comb_labels_num"]
]
print(calls[:3])

[[0, 3, 0], [3, 3, 1], [0]]


In [13]:
# First call in seg_df (DS-SH-DS), now encoded with integer segment ids as 0-3-0.
calls[0]

[0, 3, 0]

In [14]:
# Starting probabilities: the fraction of calls whose first segment is each
# state - shown in order DS, LH, NL, SH.
seg_starts = np.array([call[0] for call in calls])
n_calls = len(calls)
start_counts = np.array([np.sum(seg_starts == state) for state in unique_states])
start_probs = start_counts / n_calls
start_probs

array([0.09447415, 0.13190731, 0.3030303 , 0.47058824])

In [15]:
# End probabilities - shown in order DS, LH, NL, SH. For each state: the number
# of calls ending in that state divided by (total occurrences of the state
# across all calls + 1).
# NOTE(review): the +1 in the denominator means these are not exact
# proportions; presumably inherited add-one smoothing - confirm it is intended.
end_states = np.array([call[-1] for call in calls])
all_segments = np.concatenate(calls)
end_probs = np.array([
    np.sum(end_states == state) / (np.sum(all_segments == state) + 1)
    for state in range(len(unique_states))
])
end_probs

array([0.81876333, 0.7254902 , 0.14942529, 0.00612245])

In [16]:
# Transition probabilities - rows/columns in order DS, LH, NL, SH.
# Entry [i, j] is the probability that a segment of state i is immediately
# followed, within the same call, by a segment of state j.
trans_counts = np.zeros((len(unique_states), len(unique_states)))
for seg in calls:
    # Count every consecutive (current, next) pair inside the call.
    for i, j in zip(seg[:-1], seg[1:]):
        trans_counts[i, j] += 1
# Normalise each row to sum to 1. No smoothing is applied, so unobserved
# transitions stay at exactly 0. A state that never has a following segment
# has a row total of 0; its row is left as zeros instead of dividing 0/0,
# which would fill the row with NaN and raise a RuntimeWarning.
row_totals = trans_counts.sum(axis=1, keepdims=True)
trans_mat = np.divide(
    trans_counts,
    row_totals,
    out=np.zeros_like(trans_counts),
    where=row_totals > 0,
)
trans_mat

array([[0.07142857, 0.17857143, 0.        , 0.75      ],
       [1.        , 0.        , 0.        , 0.        ],
       [0.79591837, 0.        , 0.        , 0.20408163],
       [0.48765432, 0.2345679 , 0.00617284, 0.27160494]])

In [17]:
#create dataframe of transition matrix
# Rows are the "from" state and columns the "to" state, both labelled by integer segment id.
trans_df = pd.DataFrame(trans_mat)
trans_df

Unnamed: 0,0,1,2,3
0,0.071429,0.178571,0.0,0.75
1,1.0,0.0,0.0,0.0
2,0.795918,0.0,0.0,0.204082
3,0.487654,0.234568,0.006173,0.271605


In [18]:
# Flatten the transition matrix row by row into a single-row dataframe with
# one column per "<from>_<to>" pair (e.g. "DS_SH" = DS followed by SH).
# Column names are derived from label_dict (label -> integer id) rather than
# typed by hand, so they cannot drift out of step with the id order used to
# index trans_mat if the label set changes. pandas raises if the number of
# names does not match the number of matrix entries.
flat_mat = trans_mat.flatten()
ordered_labels = sorted(label_dict, key=label_dict.get)
transition_names = [
    f"{src}_{dst}" for src in ordered_labels for dst in ordered_labels
]
df = pd.DataFrame([flat_mat], columns=transition_names)
df

Unnamed: 0,DS_DS,DS_LH,DS_NL,DS_SH,LH_DS,LH_LH,LH_NL,LH_SH,NL_DS,NL_LH,NL_NL,NL_SH,SH_DS,SH_LH,SH_NL,SH_SH
0,0.071429,0.178571,0.0,0.75,1.0,0.0,0.0,0.0,0.795918,0.0,0.0,0.204082,0.487654,0.234568,0.006173,0.271605


In [19]:
#save df
# The observed transition probabilities are written under the dataset's
# 'Monte_Carlo' folder (note: not under the DT_ID run folder the input came from).
save_loc = DATA_DIR / DATASET_ID / 'Monte_Carlo' /  'observed_transitions.pickle'
# ensure_dir is given the path as a string; presumably it creates the parent
# folder if it is missing - confirm against avgn.utils.paths.
ensure_dir(save_loc.as_posix())
df.to_pickle(save_loc)