In [4]:
import pandas as pd 
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import utility as ut
import re

In [5]:
# read dataset

# regional data in the corpus
country = pd.read_csv('data/country_edited.tsv', sep='\t', header='infer')

# dictionary to convert region names into world area
country_code = pd.read_csv('data/country_code.tsv', sep='\t', encoding='utf8', header=None)
country_dict = {country_code.iloc[x,0]:country_code.iloc[x,1] for x in country_code.index}

## Regional Distribution


In [6]:
# regional information(long-format)
# atu_id: id number of tale type
# country: country or ethnic group 
country

Unnamed: 0,id,atu_id,#,country
0,1,1,0,Finnish
1,1,1,1,Finnish-Swedish
2,1,1,2,Estonian
3,1,1,3,Livonian
4,1,1,4,Latvian
5,1,1,5,Lithuanian
6,1,1,6,Lappish
7,1,1,7,Wepsian
8,1,1,8,Wotian
9,1,1,9,Lydian


In [7]:
# Mapping state name into world area.

# dictionary (associative array) for converting 'country' variable into world area
# distinction of according to UN Geographical Code(M49)
country_dict

{'Afghan': 'Southern Asia',
 'African': 'Northern Africa',
 'African American': 'Northern America',
 'Albanian': 'Southern Europe',
 'Algerian': 'Northern Africa',
 'American': 'Northern America',
 'Angolan': 'Middle Africa',
 'Argentine': 'South America',
 'Armenian': 'Western Asia',
 'Australian': 'Australia and New Zealand',
 'Austrian': 'Western Europe',
 'Azerbaijan': 'Western Asia',
 'Benin': 'Western Africa',
 'Bolivian': 'South America',
 'Bosnian': 'Southern Europe',
 'Brazilian': 'South America',
 'Bulgarian': 'Eastern Europe',
 'Burkina Paso': 'Western Africa',
 'Byelorussian': 'Eastern Europe',
 'Cambodian': 'South-eastern Asia',
 'Cameroon': 'Middle Africa',
 'Cape Verdian': 'Western Africa',
 'Central African': 'Middle Africa',
 'Chad': 'Middle Africa',
 'Chilean': 'South America',
 'Chinese': 'Eastern Asia',
 'Colombian': 'South America',
 'Congolese': 'Middle Africa',
 'Corsican': 'Western Europe',
 'Costa Rican': 'Central America',
 'Croatian': 'Southern Europe',
 'Cub

In [8]:
# converting into world area
country['region'] = [country_dict[country['country'][i]] if str(country_dict.get(country['country'][i], None)) not in ['nan', 'None'] else 'Others' for i in country.index]

In [9]:
country.head()

Unnamed: 0,id,atu_id,#,country,region
0,1,1,0,Finnish,Northern Europe
1,1,1,1,Finnish-Swedish,Northern Europe
2,1,1,2,Estonian,Northern Europe
3,1,1,3,Livonian,Northern Europe
4,1,1,4,Latvian,Northern Europe


In [10]:
# in this analysis, 'Jewish' and 'Gypsy' were assigned 'others' for region variable.
country[country['region'] == 'Others'].head()

Unnamed: 0,id,atu_id,#,country,region
38,1,1,38,Jewish,Others
39,1,1,39,Gypsy,Others
113,4,2,41,Jewish,Others
114,4,2,42,Gypsy,Others
155,5,2A,16,Jewish,Others


In [11]:
# count of world area (see the supplemental material)
result = country['region'].value_counts()
result.to_csv('result/region_count.tsv',sep='\t')
result

  This is separate from the ipykernel package so we can avoid doing imports until


Northern Europe                    1227
Southern Europe                     992
Eastern Europe                      863
Western Europe                      699
Western Asia                        453
Central Asia                        369
Northern Africa                     327
Eastern Asia                        254
Southern Asia                       253
Northern America                    209
South America                       167
Others                              146
Central America                     145
South-eastern Asia                  133
Eastern Africa                      122
Latin America and the Caribbean     115
Southern Africa                     113
Middle Africa                        59
Western Africa                       56
Australia and New Zealand             7
Polynesia                             6
Micronesia                            1
Melanesia                             1
Name: region, dtype: int64

In [12]:
# number of tale types which including each region

freq = country.groupby(('id','region')).size()
freq = freq.reset_index()
result = freq['region'].value_counts()
result.to_csv('result/region_count_by_tale.tsv', sep='\t')
result

  This is separate from the ipykernel package so we can avoid doing imports until
  


Northern Europe                    293
Southern Europe                    278
Eastern Europe                     264
Western Europe                     250
Western Asia                       173
Southern Asia                      156
Central Asia                       147
Eastern Asia                       145
Northern Africa                    139
Northern America                   123
Others                             120
Eastern Africa                      88
South-eastern Asia                  84
Southern Africa                     80
Central America                     80
South America                       77
Latin America and the Caribbean     75
Western Africa                      45
Middle Africa                       43
Australia and New Zealand            7
Polynesia                            6
Melanesia                            1
Micronesia                           1
Name: region, dtype: int64

## Animal distribution of each tale category


In [13]:
# read dataset (see detail for analysis.ipynb in this repository)

animals = pd.read_csv('data/animals_checked.tsv', sep='\t')
animal_code = pd.read_csv('data/animal_code.tsv', sep='\t', header=None)
animal_dict = {row[0]:row[1] for i,row in animal_code.iterrows()}
categories = pd.read_csv('data/categories_atu.tsv', sep='\t')
animals = animals.merge(categories, on=['id', 'atu_id'],how='outer')

In [14]:
# number of tale types in each categories.
animals['category'].value_counts()

wild                 110
others               106
wild_and_domestic     72
wild_and_human        60
domestic              35
Name: category, dtype: int64

In [15]:
def calc_freq(data, category):
    res = count(data[data['category'] == category]['animals'])
    result = pd.DataFrame.from_dict(res, orient='index')
    return result.sort_values(0, ascending=False)

def count(data, raw=False):
    result = Counter()
    for x in data:
        if type(x) == str:
            result = result + Counter([animal_dict.get(w.strip()) for w in x.split(',')])
    return result

In [16]:
wild = calc_freq(animals, 'wild')
wild.columns = ['wild']
wild.head(10)

Unnamed: 0,wild
fox,87
wolf,54
bear,37
rabbit,32
jackal,28
*bird,27
chicken,22
lion,21
human,21
dog,21


In [17]:
wild_and_domestic = calc_freq(animals, 'wild_and_domestic')
wild_and_domestic.columns = ['wild_and_domestic']
wild_and_domestic.head(10)

Unnamed: 0,wild_and_domestic
wolf,37
fox,23
cat,21
sheep,20
pig,15
goat,13
mouse,13
dog,12
bear,9
human,9


In [18]:
wild_and_human = calc_freq(animals, 'wild_and_human')
wild_and_human.columns = ['wild_and_human']
wild_and_human.head(10)

Unnamed: 0,wild_and_human
human,49
bear,20
wolf,18
fox,11
lion,10
horse,8
snake,7
rabbit,6
tiger,6
sheep,5


In [19]:
domestic = calc_freq(animals, 'domestic')
domestic.columns = ['domestic']
domestic.head(10)

Unnamed: 0,domestic
donkey,16
dog,15
human,11
chicken,8
mouse,6
horse,6
cat,5
goat,5
duck,5
wolf,3


In [20]:
others = calc_freq(animals, 'others')
others.columns = ['others']
others.head()

Unnamed: 0,others
*bird,55
human,23
*bug,17
crow,17
frog,16


In [21]:
# merge data into one table
result = pd.concat([wild.reset_index(), wild_and_human.reset_index(), wild_and_domestic.reset_index(), domestic.reset_index(), others.reset_index()], axis=1)
result

Unnamed: 0,index,wild,index.1,wild_and_human,index.2,wild_and_domestic,index.3,domestic,index.4,others
0,fox,87.0,human,49.0,wolf,37.0,donkey,16.0,*bird,55
1,wolf,54.0,bear,20.0,fox,23.0,dog,15.0,human,23
2,bear,37.0,wolf,18.0,cat,21.0,human,11.0,*bug,17
3,rabbit,32.0,fox,11.0,sheep,20.0,chicken,8.0,crow,17
4,jackal,28.0,lion,10.0,pig,15.0,mouse,6.0,frog,16
5,*bird,27.0,horse,8.0,goat,13.0,horse,6.0,fox,12
6,chicken,22.0,snake,7.0,mouse,13.0,cat,5.0,*vermin,12
7,lion,21.0,rabbit,6.0,dog,12.0,goat,5.0,*fish,9
8,human,21.0,tiger,6.0,bear,9.0,duck,5.0,snake,9
9,dog,21.0,sheep,5.0,human,9.0,wolf,3.0,mouse,9


In [22]:
# output top 30 data
result.iloc[:30].to_csv('result/category_animals.tsv', sep='\t')

## Animal group

In [23]:
groups = animal_code[animal_code[1].str.startswith('*')]
groups.columns = ['animal', 'group']
groups_agg = pd.DataFrame(groups.groupby('group')['animal'].apply(list).apply(lambda x:sorted(x)).apply(', '.join))
groups_agg.to_csv('result/animal_groups.tsv', sep='\t')
groups_agg

Unnamed: 0_level_0,animal
group,Unnamed: 1_level_1
*big cat,"jaguar, leopard, lynx, panther"
*bird,"bird, bittern, blackbird, cuckoo, diver, flami..."
*bug,"beetle, blindworm, butterfly, cricket, grassho..."
*fish,"fish, flounder, herring, mussel, perch, salmon..."
*large animal,"beast, boar, buffalo, hippopotamus, whale"
*raptores,"falcon, hawk"
*small animal,"badger, bat, chameleon, gopher, hedgehog, liza..."
*supernatural,"devil, dragon, dwarf, giant, god, ogre, pan, s..."
*vermin,"bedbug, flea, gnat, louse, tick"


## Distribution of motifs


In [24]:
motif = pd.read_csv('data/motifs_checked.tsv', sep='\t')
motif_list = motif['motif'].apply(lambda x:x.split(','))

# remove motifs with 'cf.''
motif_list = motif_list.apply(lambda x: [w for w in x if not re.match('cf.', w)])

# Full 
motif['full'] = motif_list

# single alphabet e.g. 'K'.
motif['major'] = motif_list.apply(lambda x: [w[0] for w in x])

# alphabet with number e.g. 'J14'
motif['minor'] = motif_list.apply(lambda x: [w.split('.')[0] for w in x])

# remove duplicate count in 'minor' (used in the analysis)
motif['counted'] = motif['minor'].apply(lambda x: list(set(x)))

# ATU index
motif['_id'] = [motif.iloc[x, 1] for x in range(len(motif))]

result = motif[['_id', 'full', 'major', 'minor', 'counted']]

In [25]:
result.head()

Unnamed: 0,_id,full,major,minor,counted
0,1,"[K371.1, K341.2, K341.2.1, K1026]","[K, K, K, K]","[K371, K341, K341, K1026]","[K371, K1026, K341]"
1,2,[K1021],[K],[K1021],[K1021]
2,2A,"[K1021.1, J758.1, J341.1]","[K, J, J]","[K1021, J758, J341]","[J341, J758, K1021]"
3,2B,[K1021.2],[K],[K1021],[K1021]
4,3,"[K473, K522.1]","[K, K]","[K473, K522]","[K473, K522]"


In [26]:
# count of motif
pd.DataFrame.from_dict(Counter([x[0] for x in ut.flatten(motif['counted'])]), orient='index')[0].sort_values(ascending=False)

K    176
J    115
B     47
A     33
L     13
U      9
W      8
N      6
Q      5
Z      4
F      3
T      2
D      1
H      1
M      1
C      1
X      1
Name: 0, dtype: int64

In [27]:
# regular expressions
re_index = re.compile('^[^\.]+')
re_number = re.compile('^\d+')
re_tail = re.compile('[^\d]+$')

# functions
def padding(string):
    # convert values for sorting
    tag = re_index.findall(string)[0]
    letter = tag[0]
    number = re_number.findall(tag[1:])[0]
    if not number:
        number = 0
    return letter + '{0:04d}'.format(int(number))
    
def get_tmi_description(motif):
    candidate = re_tail.sub('', motif)
    # if the candidate not found in the motif list, return (*missing motif index)
    result = tmi_dict.get(candidate, '(*missing motif index)') 
    return result

def get_tmi_categories(string, narrow=False):
    tag = re_index.findall(string)[0]
    letter = tag[0]
    number = re_number.findall(tag[1:])[0]
    if not number:
        number = 0
    number = int(number)
    categories = tmi_categories[np.logical_and(np.logical_and(tmi_categories['motif'] == letter, tmi_categories['start'] <= number), tmi_categories['end'] >= number)]
    if not len(categories.index):
        return ''
    if narrow:
        # choose narrowest category
        category = categories.iloc[-1]
    else:
        category = categories.iloc[0]
    result = (category['motif'] + str(category['start']) + '-' + category['motif'] + str(category['end']), category['desc'].lower())
    return result
        

In [28]:
# read Thompson Motif Index data(not included in this repository due to copyright issue)

tmi = pd.read_csv('data/tmi.tsv',sep='\t', header=None)
tmi.columns = ['index', 'value']
tmi_dict = {v['index']:v['value'] for i, v in tmi.iterrows()}

tmi.head()

Unnamed: 0,index,value
0,A1,Identity of creator.
1,A1.1,Sun-god as creator.
2,A1.2,Grandfather as creator.
3,A1.3,Stone-woman as creator.
4,A1.4,Brahma as creator.


In [29]:
# read data subcategories(not included in this repository due to copyright issue)
tmi_categories = pd.read_csv('data/tmi_sub.tsv', sep='\t')

In [30]:
# this dataset reflects hierarchical structure(see in the main paper) of TMI
# e.g.
'''
A: Mythological Motif
    A0-A499 GODS
        A100-A199 The gods in general.
        A200-A299 Gods of the upper world
      ...
'''

tmi_categories.iloc[1:6]

Unnamed: 0,motif,start,end,desc
1,A,100,499,GODS
2,A,100,199,The gods in general.
3,A,200,299,Gods of the upper world.
4,A,300,399,Gods of the underworld.
5,A,400,499,Gods of the earth.


In [31]:
# create list of motifs appeared in the ATU

# merge description
full = pd.DataFrame.from_dict(Counter(ut.flatten(list(result['full']))), orient='index').sort_values(by=0,ascending=False)
full.columns = ['freq']
full['description'] = [get_tmi_description(x) for x in full.index]
full['categories'] = [get_tmi_categories(x) for x in full.index]
full['desc'] = ['{}: '.format(i) + x['description'] for i,x in full.iterrows()]
full.head()

Unnamed: 0,freq,description,categories,desc
A2342.1,3,Why hare’s lip is split.,"(A2200-A2599, animal characteristics)",A2342.1: Why hare’s lip is split.
B296,3,Animals go a-journeying.,"(B200-B299, animals with human traits.)",B296: Animals go a-journeying.
K561.1,3,Animal captor persuaded to talk and release vi...,"(K500-K699, escape by deception.)",K561.1: Animal captor persuaded to talk and re...
K1161,2,Animals hidden in various parts of a house att...,"(K1000-K1199, deception into self-injury.)",K1161: Animals hidden in various parts of a ho...
K815,2,Victim lured by kind words approaches trickste...,"(K800-K999, killing or maiming by deception.)",K815: Victim lured by kind words approaches tr...


In [32]:
# unify by subcategories of the TMI

motif_desc = pd.concat([
    pd.DataFrame(full.groupby('categories')['freq'].sum()),
    pd.DataFrame(full.groupby('categories')['desc'].apply(list).apply(lambda x:sorted(x)).apply('; '.join))
], axis=1)

res = motif_desc.reset_index()
res['index'] = [x[0] for x in res['categories']]
res['category_name'] = [x[1] for x in res['categories']]
res = res.set_index('index')
res.head()

Unnamed: 0_level_0,categories,freq,desc,category_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A1200-A1699,"(A1200-A1699, creation and ordering of human l...",1,A1321: Men and animals readjust span of life.,creation and ordering of human life
A1700-A2199,"(A1700-A2199, creation of animal life)",2,A1952: Creation of hoopoe.; A1965.2: Bittern f...,creation of animal life
A2200-A2599,"(A2200-A2599, animal characteristics)",30,A2211.2: Rabbit laughs: cause of hare-lip.; A2...,animal characteristics
A2700-A2799,"(A2700-A2799, origin of plant characteristics.)",2,A2741.1: Bean laughs till it splits: cause of ...,origin of plant characteristics.
B100-B199,"(B100-B199, magic animals.)",1,B103.0.4.1: Grateful snake gives gold piece da...,magic animals.


In [33]:
res[['category_name', 'freq','desc']].to_csv('result/motif_desc.tsv', sep='\t')

In [34]:
# deception-related motif

In [35]:
# deception-related motifs were obtained by using MOMFER(Karsdorp, van der Meulen, Meder, and van den Bosch,2015), a search engine of the TMI.
# we obtained these motifs by querying "deception or deceptive or deceive or cheat" on MOMFER and extracted indice from the result.

deceptives = pd.read_csv('data/deceptive_motifs.tsv',sep='\t', header=None)
deceptives.columns = ['motif']
deceptives = deceptives.set_index('motif', drop=False)

In [36]:
# size of list
len(deceptives.index)

1761

In [37]:
# merge description, motif categories, frequencies of the occurence in the corpus, etc.

deceptives['description'] = [get_tmi_description(x) for x in deceptives['motif']]
deceptives['categories'] = [get_tmi_categories(x) for x in deceptives['motif']]
deceptives['desc'] = ['{}: '.format(i) + x['description'] for i,x in deceptives.iterrows()]
motif_freq_dict = {i: v['freq'] for i,v, in full.iterrows()} 
deceptives['freq']  = [motif_freq_dict.get(x, 0) for x in deceptives.index]
deceptives['top_node'] = [x[0] for x in deceptives.index]

In [38]:
# count frequencies of deceptive-related motif by major classification of motif (i.e. single alphabet) 
# and count their frequencies in our corpus

res = pd.concat([
    pd.DataFrame(deceptives.groupby('top_node')['freq'].count()),
    pd.DataFrame(deceptives.groupby('top_node')['freq'].sum()),
    pd.DataFrame(deceptives[np.logical_not(deceptives['desc'].str.endswith('missing motif index)'))].groupby('top_node')['desc'].apply(list).apply(lambda x:sorted(x)).apply('; '.join))
], axis=1).reset_index()

res = res[[len(x) > 0 for x in res['top_node']]]
res['index'] = [x[0] for x in res['top_node']]
res = res.set_index('index')
res.columns = ['motif', 'item', 'present', 'desc']
res.head()

Unnamed: 0_level_0,motif,item,present,desc
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,A,104,2,A1006.1: New race from single pair (or several...
B,B,38,2,B103.2.1: Treasure-laying bird.; B11.3.5: Drag...
C,C,39,0,C141.1: Tabu: menstrous woman not to go near a...
D,D,151,0,D103: Assembly or group transformed to animals...
E,E,66,0,E1.1: Saint cut into pieces or decapitated com...


In [39]:
res[['item','present','desc']].to_csv('result/deception.tsv', sep='\t')

In [40]:
# deception-related motifs in our corpus(except for motif 'K')
deceptives[np.logical_and(deceptives['freq'] > 0, deceptives['top_node'] != 'K')]

Unnamed: 0_level_0,motif,description,categories,desc,freq,top_node
motif,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A2241,A2241,Animal characteristics: borrowing and not retu...,"(A2200-A2599, animal characteristics)",A2241: Animal characteristics: borrowing and n...,1,A
A2251.1,A2251.1,Ant carries load as heavy as himself.,"(A2200-A2599, animal characteristics)",A2251.1: Ant carries load as heavy as himself.,1,A
B271.3,B271.3,Animals ring bell and demand justice.,"(B200-B299, animals with human traits.)",B271.3: Animals ring bell and demand justice.,1,B
B296.1,B296.1,Animal journeys to Rome.,"(B200-B299, animals with human traits.)",B296.1: Animal journeys to Rome.,1,B
H1541.1,H1541.1,Contest in enduring cold: frost and the hare.,"(H1400-H1599, other tests)",H1541.1: Contest in enduring cold: frost and t...,1,H
J815.1,J815.1,Liar rewarded by the apes.,"(J200-J1099, wise and unwise conduct)",J815.1: Liar rewarded by the apes.,1,J
L315.6,L315.6,Insects worry large animal to despair or death.,"(L300-L399, triumph of the weak.)",L315.6: Insects worry large animal to despair ...,1,L


In [41]:
deceptives[np.logical_and(deceptives['freq'] > 0, deceptives['top_node'] != 'K')].to_csv('result/deception_motif_without_k.tsv', sep='\t')