# Generate Random Calls 
In this notebook we randomly generate new calls using those segments found in the recorded calls. Thereby, we create a dataframe containing new calls wherein segment probability of occurence is the same as in the original dataset, however the  probability of transition between segments is random. We repeat this code for 10, 000 iterations to produce a large sample size of randomly generated call arrays. We then calculate segment transition probability within each of these randomly generated call arrays. 

In [1]:
from tqdm.auto import tqdm

In [2]:
import avgn

In [3]:
import pandas as pd
import numpy as np
from avgn.utils.paths import DATA_DIR, ensure_dir, FIGURE_DIR

In [4]:
from scipy.stats import kruskal

In [5]:
DATASET_ID = "git_repos"

In [6]:
DT_ID = '2022-03-04_18-41-29'

In [7]:
seg_df = pd.read_pickle(DATA_DIR / DATASET_ID / DT_ID /  'segment_df_umap_combinedtidied.pickle')
seg_df[:3]

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,location,sex,wav_loc,key,rate,specs,umap,comb_labels,call_lab_simp,combi_lab_simp
0,0.753604,0.776773,DS,0,0.753604,0.776773,DSSHDS,0.753604,0,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[14.15081, 3.406464]",DS,DS-SH-DS,DS-SH-DS SH-LH
1,0.786865,0.835165,SH,1,0.786865,0.835165,DSSHDS,0.753604,1,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[5.388335, 10.057652]",SH,DS-SH-DS,DS-SH-DS SH-LH
2,0.855941,0.92116,DS,2,0.855941,0.92116,DSSHDS,0.753604,2,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[12.421062, 0.6991728]",DS,DS-SH-DS,DS-SH-DS SH-LH


In [8]:
### Create unique number identifiers for each segment label class
label_dict = {lab:i for i, lab in enumerate(np.unique(seg_df['comb_labels'].values))}
seg_df['comb_labels_num'] = [label_dict[i] for i in seg_df.comb_labels.values]
seg_df[:3]

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,sex,wav_loc,key,rate,specs,umap,comb_labels,call_lab_simp,combi_lab_simp,comb_labels_num
0,0.753604,0.776773,DS,0,0.753604,0.776773,DSSHDS,0.753604,0,0,...,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[14.15081, 3.406464]",DS,DS-SH-DS,DS-SH-DS SH-LH,0
1,0.786865,0.835165,SH,1,0.786865,0.835165,DSSHDS,0.753604,1,0,...,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[5.388335, 10.057652]",SH,DS-SH-DS,DS-SH-DS SH-LH,3
2,0.855941,0.92116,DS,2,0.855941,0.92116,DSSHDS,0.753604,2,0,...,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[12.421062, 0.6991728]",DS,DS-SH-DS,DS-SH-DS SH-LH,0


In [9]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed
import umap
import pandas as pd

### Generate 10,000 simulations of transition probabilities
1. Randomly shuffle column containing segment labels (in seg_df) such that the new calls are generated which contain a random combination of segments. Overall, the number of segments in each class remains the same.
2. Calculate transition probabilities for segments within these randomly generated calls.
3. Add the calculated transition probailities to a dataframe.
4. Repeat the above three steps 10,000 times.

In [10]:
dfs = []
for randi in range(0,10000):
    seg_df[randi] = np.random.permutation(seg_df["comb_labels_num"].values) 
    unique_states = np.unique(seg_df[randi])
    lab_dict = {lab: i for i, lab in enumerate(unique_states)}
    
    calls = [
        list(seg_df[seg_df.call_unique_num == callid][randi].values)
        for callid in seg_df.call_unique_num.unique()
    ]
    # transition probs
    trans_mat = np.zeros((len(unique_states), len(unique_states)))
    for seg in calls:
        for i, j in zip(seg[:-1], seg[1:]):
            trans_mat[i, j] += 1
    # smooth to nonzero probabilities
    trans_mat = (trans_mat.T / trans_mat.sum(axis=1)).T  # np.sum(trans_mat, axis=1)
    
    flat_mat = trans_mat.flatten()

    df = pd.DataFrame([flat_mat], columns = [
        "DS_DS", "DS_LH", "DS_NL", "DS_SH", 
        "LH_DS", "LH_LH", "LH_NL", "LH_SH",
        "NL_DS", "NL_LH", "NL_NL", "NL_SH",
        "SH_DS", "SH_LH", "SH_NL", "SH_SH"
    ])
    df["rand_run"] = [randi]
    
    dfs.append(df)
    
randdfs = pd.concat(dfs)
len(randdfs)

10000

In [11]:
randdfs

Unnamed: 0,DS_DS,DS_LH,DS_NL,DS_SH,LH_DS,LH_LH,LH_NL,LH_SH,NL_DS,NL_LH,NL_NL,NL_SH,SH_DS,SH_LH,SH_NL,SH_SH,rand_run
0,0.366795,0.158301,0.123552,0.351351,0.333333,0.155039,0.124031,0.387597,0.390000,0.170000,0.090000,0.350000,0.359155,0.112676,0.169014,0.359155,0
0,0.334545,0.181818,0.105455,0.378182,0.322034,0.144068,0.127119,0.406780,0.416667,0.156250,0.125000,0.302083,0.367491,0.148410,0.120141,0.363958,1
0,0.310219,0.153285,0.145985,0.390511,0.429825,0.157895,0.087719,0.324561,0.319149,0.127660,0.170213,0.382979,0.362069,0.151724,0.137931,0.348276,2
0,0.364964,0.131387,0.098540,0.405109,0.384000,0.160000,0.096000,0.360000,0.268041,0.206186,0.134021,0.391753,0.358696,0.108696,0.155797,0.376812,3
0,0.322097,0.142322,0.142322,0.393258,0.355932,0.135593,0.152542,0.355932,0.339806,0.087379,0.155340,0.417476,0.362676,0.190141,0.095070,0.352113,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.393574,0.140562,0.108434,0.357430,0.326087,0.130435,0.166667,0.376812,0.436170,0.085106,0.127660,0.351064,0.357388,0.147766,0.127148,0.367698,9995
0,0.297398,0.200743,0.133829,0.368030,0.365079,0.150794,0.142857,0.341270,0.405941,0.118812,0.108911,0.366337,0.369565,0.144928,0.083333,0.402174,9996
0,0.320285,0.163701,0.131673,0.384342,0.327273,0.145455,0.118182,0.409091,0.281250,0.187500,0.156250,0.375000,0.378947,0.150877,0.126316,0.343860,9997
0,0.355556,0.170370,0.114815,0.359259,0.370690,0.163793,0.137931,0.327586,0.333333,0.092593,0.138889,0.435185,0.348921,0.151079,0.107914,0.392086,9998


In [12]:
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir

In [13]:
#save df
save_loc = DATA_DIR / DATASET_ID / 'Monte_Carlo' /  'random_simulations.pickle'
ensure_dir(save_loc.as_posix())
randdfs.to_pickle(save_loc)