In [1]:
import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import stats

sys.path.insert(1, os.path.join(sys.path[0], '..'))
from h06_results.plot_correlations import read_results
from util import constants
from util import util

# Choose a Language and a Seed to Analyse

In [2]:
# Language code; it names the checkpoint sub-directory the results are read from.
language = 'fi'
# Seed index; it selects the zero-padded 'seed_XX' sub-directory for that language.
seed = 0

# Get File Paths

In [3]:
# Root directory holding the checkpoints (relative path).
checkpoints_path = '../../checkpoint'
# Results directory for the chosen language and zero-padded seed.
results_path = os.path.join(checkpoints_path, language, 'seed_%02d' % seed)

# The three result tables used below all live directly under `results_path`.
results_freq_codes_file, results_ent_natural_file, results_ent_polyassign_file = (
    os.path.join(results_path, file_name)
    for file_name in (
        'codes.tsv',
        'compiled_natural_polysemy.tsv',
        'compiled_polyassign_polysemy.tsv',
    )
)

# Load Data

In [4]:
def rename_iid_columns(df):
    """Move the ``caplan`` columns of ``df`` to ``iid`` names, in place.

    ``caplan`` becomes ``iid`` and ``caplan_length`` becomes ``iid_length``.
    The data is moved into newly assigned columns rather than renamed where
    it sits, so for a frame without existing ``iid`` columns the two new
    columns are appended after the remaining ones.

    Args:
        df: DataFrame containing ``caplan`` and ``caplan_length`` columns.

    Returns:
        The same ``df`` object, modified in place.

    Raises:
        KeyError: if either source column is missing.
    """
    for old_name, new_name in (('caplan', 'iid'), ('caplan_length', 'iid_length')):
        # pop() removes the old column and hands back its values in one step.
        df[new_name] = df.pop(old_name)

    return df

In [5]:
# Load the per-word codes table (natural and FCFS forms with their lengths),
# drop the 'Unnamed: 0' column stored in the file (presumably a saved index --
# TODO confirm) and rename the `caplan` columns to `iid`.
df_freq = rename_iid_columns(
    pd.read_csv(results_freq_codes_file, sep='\t').drop(columns='Unnamed: 0')
)

In [6]:
# Load the polysemy tables for the natural and the poly-assigned lexicons,
# renaming their `caplan` columns to `iid` as each one is read.
df_nat = rename_iid_columns(pd.read_csv(results_ent_natural_file, sep='\t'))
df_polyassign = rename_iid_columns(pd.read_csv(results_ent_polyassign_file, sep='\t'))

# Experiment 1. Natural, FCFS, and IID Lexicons

In [7]:
# Preview the first ten rows of the codes table.
df_freq.head(10)

Unnamed: 0,idx,natural_length,frequencies,natural,fcfs,fcfs_length,iid,iid_length
0,0,3,10504,hän,kykyen,6,kykyen,6
1,1,3,12425,oli,rakennelta,10,rakennelta,10
2,2,7,322,ranskan,poliikkevalta,13,poliikkevalta,13
3,3,9,122,kuninkaan,alkuperäistä,12,alkuperäistä,12
4,4,6,75,kaarle,keskettää,9,keskettää,9
5,5,3,14,vin,suuruuten,9,suuruuten,9
6,6,5,129,tytär,maatal,6,maatal,6
7,7,9,177,englannin,rakennus,8,rakennus,8
8,8,6,94,henrik,tutkimuksen,11,tutkimuksen,11
9,9,2,11,vn,tarkoitettu,11,tarkoitettu,11


In [8]:
# Spearman rank correlation between word frequency and word length under each
# lexicon; the p-value returned alongside each coefficient is discarded.
nat_corr, _ = stats.spearmanr(df_freq['frequencies'], df_freq['natural_length'])
fcfs_corr, _ = stats.spearmanr(df_freq['frequencies'], df_freq['fcfs_length'])
iid_corr, _ = stats.spearmanr(df_freq['frequencies'], df_freq['iid_length'])

# Each line is labelled with the lexicon whose correlation it reports.
print(f'Natural Frequency--length Correlation: {nat_corr:.4f}')
print(f'FCFS Frequency--length Correlation: {fcfs_corr:.4f}')
print(f'IID Frequency--length Correlation: {iid_corr:.4f}')

Natural Frequency--length Correlation: -0.2524
Natural Frequency--length Correlation: -0.0595
Natural Frequency--length Correlation: 0.0008


# Experiment 1. PolyFCFS and PolyIID Lexicons

In [9]:
# Preview the first ten rows of the poly-assigned lexicon table.
df_polyassign.head(10)

Unnamed: 0,idx,poly_var,poly_cov,length,frequencies,natural,fcfs,natural_length,fcfs_length,iid,iid_length
0,0,168.074369,144.139454,1,6045,0,kykyen,1,6,kykyen,6
1,1,154.127252,113.391454,1,1014,1,rakennelta,1,10,rakennelta,10
2,2,148.388656,57.85951,1,158,2,poliikkevalta,1,13,poliikkevalta,13
3,3,135.70621,-1701.413245,1,29,3,alkuperäistä,1,12,alkuperäistä,12
4,4,155.475508,92.801957,1,469,4,keskettää,1,9,keskettää,9
5,5,147.370475,99.808912,1,487,5,suuruuten,1,9,suuruuten,9
6,6,113.858547,-1092.923677,1,55,6,maatal,1,6,maatal,6
7,7,28.949735,-2449.426363,1,3,7,rakennus,1,8,rakennus,8
8,8,55.051564,-2420.971246,1,3,8,tutkimuksen,1,11,tutkimuksen,11
9,9,162.186618,156.963484,1,125524,9,tarkoitettu,1,11,tarkoitettu,11


In [10]:
# Spearman rank correlation between word frequency and word length in the
# poly-assigned table; the p-value of each test is discarded.
polyfcfs_corr, _ = stats.spearmanr(df_polyassign['frequencies'], df_polyassign['fcfs_length'])
polyiid_corr, _ = stats.spearmanr(df_polyassign['frequencies'], df_polyassign['iid_length'])

# Each line is labelled with the lexicon whose correlation it reports.
print(f'PolyFCFS Frequency--length Correlation: {polyfcfs_corr:.4f}')
print(f'PolyIID Frequency--length Correlation: {polyiid_corr:.4f}')

Natural Frequency--length Correlation: -0.0442
Natural Frequency--length Correlation: 0.0006


# Experiment 2. Natural, FCFS, and IID Lexicons

In [11]:
# Preview the first ten rows of the natural-lexicon polysemy table.
df_nat.head(10)

Unnamed: 0,idx,poly_var,poly_cov,length,frequencies,natural,fcfs,natural_length,fcfs_length,iid,iid_length
0,0,142.951484,108.065371,3,9712,hän,kykyen,3,6,kykyen,6
1,1,157.193948,135.858831,3,11487,oli,rakennelta,3,10,rakennelta,10
2,2,104.42431,59.044547,7,298,ranskan,poliikkevalta,7,13,poliikkevalta,13
3,3,95.143499,11.931314,9,116,kuninkaan,alkuperäistä,9,12,alkuperäistä,12
4,4,93.261156,-744.991479,6,70,kaarle,keskettää,6,9,keskettää,9
5,5,131.156149,-2079.713529,3,14,vin,suuruuten,3,9,suuruuten,9
6,6,116.857715,28.862158,5,117,tytär,maatal,5,6,maatal,6
7,7,101.000441,33.097346,9,167,englannin,rakennus,9,8,rakennus,8
8,8,102.302215,-352.941613,6,86,henrik,tutkimuksen,6,11,tutkimuksen,11
9,9,88.072211,-2196.644118,2,11,vn,tarkoitettu,2,11,tarkoitettu,11


In [12]:
# Spearman rank correlation between the `poly_cov` polysemy column and word
# length under each lexicon; the p-value of each test is discarded.
nat_corr, _ = stats.spearmanr(df_nat['poly_cov'], df_nat['natural_length'])
fcfs_corr, _ = stats.spearmanr(df_nat['poly_cov'], df_nat['fcfs_length'])
iid_corr, _ = stats.spearmanr(df_nat['poly_cov'], df_nat['iid_length'])

# These are polysemy--length (not frequency--length) correlations, one per lexicon.
print(f'Natural Polysemy--length Correlation: {nat_corr:.4f}')
print(f'FCFS Polysemy--length Correlation: {fcfs_corr:.4f}')
print(f'IID Polysemy--length Correlation: {iid_corr:.4f}')

Natural Frequency--length Correlation: -0.2574
Natural Frequency--length Correlation: -0.0611
Natural Frequency--length Correlation: -0.0002


# Experiment 2. PolyFCFS and PolyIID Lexicons

In [13]:
# Preview the first ten rows of the poly-assigned lexicon table again,
# this time for the polysemy columns.
df_polyassign.head(10)

Unnamed: 0,idx,poly_var,poly_cov,length,frequencies,natural,fcfs,natural_length,fcfs_length,iid,iid_length
0,0,168.074369,144.139454,1,6045,0,kykyen,1,6,kykyen,6
1,1,154.127252,113.391454,1,1014,1,rakennelta,1,10,rakennelta,10
2,2,148.388656,57.85951,1,158,2,poliikkevalta,1,13,poliikkevalta,13
3,3,135.70621,-1701.413245,1,29,3,alkuperäistä,1,12,alkuperäistä,12
4,4,155.475508,92.801957,1,469,4,keskettää,1,9,keskettää,9
5,5,147.370475,99.808912,1,487,5,suuruuten,1,9,suuruuten,9
6,6,113.858547,-1092.923677,1,55,6,maatal,1,6,maatal,6
7,7,28.949735,-2449.426363,1,3,7,rakennus,1,8,rakennus,8
8,8,55.051564,-2420.971246,1,3,8,tutkimuksen,1,11,tutkimuksen,11
9,9,162.186618,156.963484,1,125524,9,tarkoitettu,1,11,tarkoitettu,11


In [14]:
# Spearman rank correlation between the `poly_cov` polysemy column and word
# length in the poly-assigned table; the p-value of each test is discarded.
polyfcfs_corr, _ = stats.spearmanr(df_polyassign['poly_cov'], df_polyassign['fcfs_length'])
polyiid_corr, _ = stats.spearmanr(df_polyassign['poly_cov'], df_polyassign['iid_length'])

# These are polysemy--length (not frequency--length) correlations, one per lexicon.
print(f'PolyFCFS Polysemy--length Correlation: {polyfcfs_corr:.4f}')
print(f'PolyIID Polysemy--length Correlation: {polyiid_corr:.4f}')

Natural Frequency--length Correlation: -0.0440
Natural Frequency--length Correlation: 0.0010
