<a href="https://colab.research.google.com/github/sarabert96/Colexification/blob/main/04_Bootstrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bootstrap

In [9]:
import pandas as pd
import numpy as np
from collections import Counter
import random

Import cleaned version of colexification dataframe from github

In [2]:
url = 'https://raw.githubusercontent.com/sarabert96/Colexification/main/data/df_colexifications.csv'
df_colex = pd.read_csv(url)

In [3]:
df_colex.groupby('Family').size()

Family
Abkhaz-Adyge     321
Abun               3
Afro-Asiatic    2736
Aikanã            21
Ainu              74
                ... 
Yuat               7
Yukaghir         177
Yámana           142
Zamucoan         191
Zuni             153
Length: 183, dtype: int64

**knowing the data**: 183 linguistic fam

Add concepticon_pair column for future purposes

In [4]:
df_colex['Concepticon_pair']=list(zip(df_colex['Concepticon_Gloss.x'],df_colex['Concepticon_Gloss.y']))

In [5]:
df_colex

Unnamed: 0,clics_form,Concepticon_ID.x,Glottocode,Concepticon_Gloss.x,Family,variety,Concepticon_ID.y,Concepticon_Gloss.y,Concepticon_pair
0,s@,1369,hrus1242,GOLD,Hruso,Hruso Aka Jamiri,1927,BAMBOO,"(GOLD, BAMBOO)"
1,s@,1369,hrus1242,GOLD,Hruso,Hruso Aka Jamiri,946,BLOOD,"(GOLD, BLOOD)"
2,avir@,1035,miji1239,GOOD,Sino-Tibetan,Dammai Dibin,923,LOVE,"(GOOD, LOVE)"
3,dzju,1425,hrus1242,GREEN,Hruso,Hruso Aka Jamiri,1424,YELLOW,"(GREEN, YELLOW)"
4,lah,1277,dakp1242,HAND,Sino-Tibetan,Monpa Changprong,639,MOUNTAIN,"(HAND, MOUNTAIN)"
...,...,...,...,...,...,...,...,...,...
132574,eni,763,gira1247,SKIN,Nuclear Trans New Guinea,girawa,1405,NAME,"(SKIN, NAME)"
132575,wus,763,payn1244,SKIN,Nuclear Trans New Guinea,paynamar,51,SORE,"(SKIN, SORE)"
132576,mping,2458,nend1239,PERSPIRE OR SWEAT,Nuclear Trans New Guinea,nend,1257,WING,"(PERSPIRE OR SWEAT, WING)"
132577,su,51,kein1239,SORE,Nuclear Trans New Guinea,kein,1257,WING,"(SORE, WING)"


In [6]:
# df_fam --> df with families and varieties
df_fam = df_colex[['Family','Glottocode']].copy()

In [7]:
df_fam=df_fam.drop_duplicates()


In [8]:
dicFam = df_fam.groupby(['Family']).size().to_dict() # dictionary with number of variety for future purposes

### BOOTSTRAP function

In [11]:
import time

def colex_freq(x, suppDf):
  # x = df of only one colex_pair
  sumNumVar = sum(x['Num_var'])
  div = sumNumVar / 183
  newEl = [x['Colex_pair'].iloc[0], div]
  suppDf.append(newEl)


def concept_sum(x, suppDf):
  # x = df divided by concepticon inside only one family
  totLang = len(x['Glottocode']) # how many languages have that colexification faking to have different families from bootstrap --> NO SET
  # NB we don't have repetitions of Concepticon_pair in a variety, if we have it, it's just because we did the bootstrap with replacement
  newEl = [x['Family'].iloc[0], x['Concepticon_pair'].iloc[0], totLang]
  suppDf.append(newEl)


def colex_rep(x, suppDf):
  # x = df divided by family
  x.groupby('Concepticon_pair').apply(lambda y: concept_sum(y, suppDf))


def bootstrapping(cicle):
  # cicle = number of repetitions
  print ("Running bootstrapping function")
  start_time = time.time()

  listBoot = []
  
  for k in range (cicle):
    # create new df with random varieties
    new_df_fam = df_fam.groupby(['Family']).apply(lambda x: x.sample(n=(len(x['Glottocode'])), replace=True)).reset_index(drop = True)
    # create list with varieties chosen
    listVar=new_df_fam.Glottocode.tolist()
    print("List of linguistic varieties chosen")
    # new df
    boot_df = pd.DataFrame()

    for v in listVar:
      # for every variety, take all colexification and put them into boot_df
      boot_df=boot_df.append(df_colex.loc[df_colex['Glottocode'] == v], ignore_index = True)

    suppDf = [] # new empty list

    print("Taking families grouped by concepticon pair and counting number of varieties with the same colexification")

    boot_df.groupby(['Family']).apply(lambda x: colex_rep(x, suppDf))
    print("Created dataframe with colexification from chosen varieties")
    # new df from list obtained by functions
    df_sum = pd.DataFrame(suppDf, columns=['Family', 'Colex_pair', 'Var_rep'])

    df_sum['Num_var']='' # new empty column

    for i in range(len(df_sum['Colex_pair'])):
      fam = df_sum['Family'].iloc[i] # take the family for the row
      rep = df_sum['Var_rep'].iloc[i] # take var rep
      div = rep / dicFam[fam]
      df_sum['Num_var'].iloc[i] = div

    suppDf = [] # empty the list

    print("Counting colex frequency among all families")
    df_sum.groupby(['Colex_pair']).apply(lambda x: colex_freq(x, suppDf))

    listBoot.append(suppDf) # add list to list of bootstrap
    # df_colex_freq = pd.DataFrame(suppDf, columns=['Colex_pair', 'Freq']) # to save in df
    print("Finished cicle number", k)
  print("--- %s seconds ---" % (time.time() - start_time))
  return listBoot


In [12]:
random.seed(70) # random number 
lBoot = bootstrapping(3) # number is how many cicles

Running bootstrapping function
List of linguistic varieties chosen
Taking families grouped by concepticon pair and counting number of varieties with the same colexification
Created dataframe with colexification from chosen varieties


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Counting colex frequency among all families
Finished cicle number 0
List of linguistic varieties chosen
Taking families grouped by concepticon pair and counting number of varieties with the same colexification
Created dataframe with colexification from chosen varieties
Counting colex frequency among all families
Finished cicle number 1
List of linguistic varieties chosen
Taking families grouped by concepticon pair and counting number of varieties with the same colexification
Created dataframe with colexification from chosen varieties
Counting colex frequency among all families
Finished cicle number 2
--- 306.3253479003906 seconds ---


In [13]:
len(lBoot)

3

In [14]:
df_colex_freq = pd.DataFrame(lBoot[2], columns=['Colex_pair', 'Freq']) # to save in df


In [15]:
df_colex_freq

Unnamed: 0,Colex_pair,Freq
0,"(A LITTLE, WHAT)",0.000043
1,"(ABOVE, BELOW OR UNDER)",0.003643
2,"(ABOVE, BLACK)",0.000019
3,"(ABOVE, CART)",0.005464
4,"(ABOVE, CIRCLE)",0.005464
...,...,...
37314,"(YOUNGER SISTER, SIBLING)",0.002791
37315,"(YOUNGER SISTER, SISTER)",0.000032
37316,"(YOUNGER SISTER, SON)",0.000031
37317,"(YOUNGER SISTER, YOUNGER BROTHER)",0.010976
