In [3]:
library('ggplot2')
library('plyr')
library('tidyr')
library('parallel')
library('lme4')
library("scales")
source('freq-vs-info-content_helper.R')
source('config.R')

In [None]:
# Equivalent to Cross-Linguistic Correlations in Cross-Linguistic PIC (1T, Books 2012, OPUS, BNC)

In [4]:
# Languages scored from the Google 1T models, in the order the results are stored.
languages <- c(
    'ENGLISH', "SPANISH", 'FRENCH', 'DUTCH', 'GERMAN', 'SWEDISH',
    'CZECH', 'POLISH', 'ROMANIAN', 'ITALIAN', 'PORTUGUESE'
)

# One getScoresForLanguage() result per entry of `languages` (unnamed list).
# NOTE(review): the variable is called "withoutEndMarker" although endMarker=TRUE
# is passed -- confirm what the flag means inside getScoresForLanguage().
withoutEndMarker <- lapply(languages, function(lang) {
    getScoresForLanguage(
        paths[['MG_Google1T']], 'Google1T', '25000', lang,
        endMarker = TRUE, corMethod = 'spearman', opusFilter = FALSE
    )
})

[1] "original number in ENGLISH: 49905"
[1] "Limiting analysis"
[1] "filtered number in ENGLISH: 25000"
[1] "Getting correlations"
 [1] "word"                      "mean_surprisal_weighted"  
 [3] "mean_surprisal_unweighted" "frequency"                
 [5] "numContexts"               "retrievalTime"            
 [7] "index"                     "ipa"                      
 [9] "ipa_ss_array"              "ipa_ss"                   
[11] "ipa_n"                     "ortho"                    
[13] "ortho_ss_array"            "ortho_ss"                 
[15] "ortho_n"                   "character"                
[17] "character_ss_array"        "character_ss"             
[19] "character_n"               "endMarker"                
[21] "trigramSurprisal"          "prob"                     
[23] "unigramSurprisal"          "log_ipa_n"                
[25] "log_character_n"           "language"                 
[1] "At the dictionary check"
[1] "languageCode: NA"
[1] "original number in

In [None]:
# One row per Google 1T language: summed word frequencies (raw and comma-formatted)
# and the number of rows in that language's analysis frame.
g1t_corpusCounts <- do.call('rbind', lapply(withoutEndMarker, function(result) {
    words <- result[['df']]
    numTokens <- sum(as.numeric(words$frequency), na.rm = TRUE)
    data.frame(
        language = words$language[1],
        numTokens = numTokens,
        numTokens_pretty = format(numTokens, big.mark = ",", scientific = FALSE),
        analysisCount = nrow(words),
        stringsAsFactors = FALSE
    )
}))
g1t_corpusCounts

In [None]:
# Re-source the analysis helpers before use.
# NOTE(review): presumably this is where expandColNames()/simpleCap() come from -- confirm.
source('ss_analysis.R')

# Stack the per-language score tables from the Google 1T results.
nem = do.call('rbind', lapply(withoutEndMarker, function(x){x$score}))

# Better word-level metric names
nem$xv = expandColNames(nem$xv)
nem$yv = expandColNames(nem$yv)

# Attach the corpus sizes, then build the display columns.
nem = merge(nem, g1t_corpusCounts, by='language')
nem$language = as.factor(sapply(nem$language, simpleCap))
# paste0() is vectorised, so no per-row loop is needed; this also stays correct
# for a zero-row frame, where 1:length(x) would iterate over c(1, 0).
# Label format: "<Language> (<tokens in billions>b)".
nem$languageCount = paste0(nem$language, ' (', round(nem$numTokens / 1000000000, 0), 'b)')
nem$yv_xv = paste(nem$yv, nem$xv, sep='-')

In [None]:
# Google Books 2012 corpus codes to score.
books2012_languages <- c('eng-all', "spa-all", 'fre-all', 'ger-all', 'rus-all', 'heb-all', 'ita-all')

# One getScoresForLanguage() result per corpus code, named by that code.
# NOTE(review): the model directory is hard-coded here, whereas the Google 1T cell
# reads its directory from `paths` -- consider moving it to the config as well.
books2012_withoutEndMarker <- setNames(
    lapply(books2012_languages, function(corpusCode) {
        getScoresForLanguage(
            '/shared_hd2/models/ss/noFilter_25Feb', 'GoogleBooks2012', '25000', corpusCode,
            endMarker = TRUE, corMethod = 'spearman', contextLimit = 0, opusFilter = FALSE
        )
    }),
    books2012_languages
)

In [None]:
# One row per Google Books 2012 corpus: summed word frequencies (raw and
# comma-formatted) and the number of rows in that corpus's analysis frame.
gb12_corpusCounts <- do.call('rbind', lapply(books2012_withoutEndMarker, function(result) {
    words <- result[['df']]
    numTokens <- sum(as.numeric(words$frequency), na.rm = TRUE)
    data.frame(
        language = words$language[1],
        numTokens = numTokens,
        numTokens_pretty = format(numTokens, big.mark = ",", scientific = FALSE),
        analysisCount = nrow(words),
        stringsAsFactors = FALSE
    )
}))

# Replace each language value with its entry in gb12_langRemapper.
gb12_corpusCounts$language <- sapply(gb12_corpusCounts$language, function(code) {
    gb12_langRemapper[[code]]
})
gb12_corpusCounts

In [None]:
# Stack the per-corpus score tables from the Google Books 2012 results.
books2012.nem = do.call('rbind', lapply(books2012_withoutEndMarker, function(x){x$score}))

# Better word-level metric names
books2012.nem$xv = expandColNames(books2012.nem$xv)
books2012.nem$yv = expandColNames(books2012.nem$yv)

# Replace each language value with its entry in gb12_langRemapper so the merge
# key matches gb12_corpusCounts$language, which is remapped the same way.
books2012.nem$language = sapply(books2012.nem$language, function(x){
    gb12_langRemapper[[x]]
})

books2012.nem = merge(books2012.nem, gb12_corpusCounts, by='language')
books2012.nem$language = factor(books2012.nem$language)
# paste0() is vectorised, so no per-row loop is needed; this also stays correct
# for a zero-row frame, where 1:length(x) would iterate over c(1, 0).
# Label format: "<Language> (<tokens in billions>b)".
books2012.nem$languageCount = paste0(
    books2012.nem$language, ' (', round(books2012.nem$numTokens / 1000000000, 0), 'b)'
)
books2012.nem$yv_xv = paste(books2012.nem$yv, books2012.nem$xv, sep='-')

In [None]:
# OPUS language codes to score.
opus_languages <- c('en', 'ru', 'he', 'de', 'fr', 'es', 'cs', 'pt', 'pl', 'ro', 'sv', 'it', 'nl')

# One getScoresForLanguage() result per language code, named by that code.
# NOTE(review): the model directory is hard-coded here, whereas the Google 1T cell
# reads its directory from `paths` -- consider moving it to the config as well.
opus_withoutEndMarker <- setNames(
    lapply(opus_languages, function(langCode) {
        getScoresForLanguage(
            '/shared_hd2/models/ss/noFilter_25Feb', 'OPUS', '25000', langCode,
            endMarker = TRUE, corMethod = 'spearman', opusFilter = FALSE
        )
    }),
    opus_languages
)

In [None]:
# One row per OPUS language: summed word frequencies (raw and comma-formatted)
# and the number of rows in that language's analysis frame.
opus_corpusCounts <- do.call('rbind', lapply(opus_withoutEndMarker, function(result) {
    words <- result[['df']]
    numTokens <- sum(as.numeric(words$frequency), na.rm = TRUE)
    data.frame(
        language = words$language[1],
        numTokens = numTokens,
        numTokens_pretty = format(numTokens, big.mark = ",", scientific = FALSE),
        analysisCount = nrow(words),
        stringsAsFactors = FALSE
    )
}))
opus_corpusCounts

In [None]:
# Stack the per-language score tables from the OPUS results.
opus.nem = do.call('rbind', lapply(opus_withoutEndMarker, function(x){x$score}))

# Better word-level metric names
opus.nem$xv = expandColNames(opus.nem$xv)
opus.nem$yv = expandColNames(opus.nem$yv)

# Attach the corpus sizes, then build the display columns.
opus.nem = merge(opus.nem, opus_corpusCounts, by='language')
opus.nem$language = as.factor(sapply(opus.nem$language, simpleCap))
# OPUS counts are labelled in millions ("m"), unlike the billions ("b") used for
# the 1T and Books 2012 tables. paste0() is vectorised, so no per-row loop is needed.
opus.nem$languageCount = paste0(opus.nem$language, ' (', round(opus.nem$numTokens / 1000000, 0), 'm)')
# Fix: this line previously recomputed books2012.nem$yv_xv (a copy-paste slip),
# leaving opus.nem without a yv_xv column, so OPUS rows were filled with NA by
# the later rbind.fill().
opus.nem$yv_xv = paste(opus.nem$yv, opus.nem$xv, sep='-')

In [None]:
# Stack the `score` tables of a list of getScoresForLanguage() results.
#
# results:     list of results, each with a `score` data frame and a `df` data frame
# datasetName: label written to the `dataset` column of every row
#
# Each score table gets a `language` column set to the first `language` value of
# the same result's `df`; the tables are then row-bound in list order.
.stackScoresByLanguage = function(results, datasetName) {
    stacked = do.call('rbind', lapply(results, function(result) {
        scores = result$score
        scores$language = result$df$language[1]
        scores
    }))
    stacked$dataset = datasetName
    stacked
}

# NOTE(review): the Google 1T label here is 'G1T', while the merged `nem` table is
# tagged '1T' in a later cell -- confirm the two spellings are intentional.
opus_datasets = .stackScoresByLanguage(opus_withoutEndMarker, 'OPUS')
gb12_datasets = .stackScoresByLanguage(books2012_withoutEndMarker, 'GB12')
g1t_datasets = .stackScoresByLanguage(withoutEndMarker, 'G1T')

In [None]:
# Raw score tables from all three corpora in one frame, plus a combined
# "<dataset>: <language>" key.
combined_datasets <- rbind.fill(opus_datasets, gb12_datasets, g1t_datasets)
combined_datasets$dataset_language <- paste0(
    combined_datasets$dataset, ': ', combined_datasets$language
)

# Tag each merged score table with a corpus-prefixed axis label and its dataset
# name (label first, then dataset, so the column order is unchanged).
nem$languageLabel <- paste('1T:', nem$languageCount)
nem$dataset <- '1T'

books2012.nem$languageLabel <- paste('GB12:', books2012.nem$languageCount)
books2012.nem$dataset <- 'GB12'

opus.nem$languageLabel <- paste('OPUS:', opus.nem$languageCount)
opus.nem$dataset <- 'OPUS'

# rbind.fill() pads columns missing from any of the three tables with NA.
nem_combined <- rbind.fill(nem, books2012.nem, opus.nem)


In [None]:
# Re-source the analysis helpers before use.
# NOTE(review): presumably this is where sig_symbol() comes from -- confirm.
source('ss_analysis.R')

xv_list = c("Number of Characters")
yv_list = c("Unigram Surprisal",'Trigram Surprisal')
col_yellow = "#E69F00" # trigram ~ nchar
col_gray = "#8a8a8a" # trigram ~ ss
# NOTE(review): scale_fill_manual() below receives c(col_gray, col_yellow); if yv
# is a character column its levels sort alphabetically, which would make gray the
# 'Trigram Surprisal' bars and yellow the 'Unigram Surprisal' bars -- check the
# two colour comments above against that.

dodge <- position_dodge(width=0.9)

# Plain and residualized correlations with word length; OPUS is excluded from the figure.
correlations = subset(nem_combined,
    xv %in% xv_list & yv %in% yv_list & !residualize & dataset != "OPUS")
partial_correlations = subset(nem_combined,
    xv %in% xv_list & yv %in% yv_list & residualize & dataset != "OPUS")

# Rows holding the unigram-vs-trigram difference test, turned into significance labels.
p_values = subset(nem_combined,
    xv %in% xv_list & yv == "Difference: Unigram Surprisal - Trigram Surprisal")
p_values$sig = mapply(sig_symbol, p_values$pvalue, p_values$direction)
p_values$direction[p_values$sig =='ns'] = 'higher' # makes ns red, b/c inconsistent with P11

# Layers, in drawing order:
#   bars         plain correlations, dodged by yv
#   points       one dot per language at y = -.1, area scaled by numTokens
#   lineranges   lower/upper interval of the plain correlations (light gray)
#   errorbars    zero-height bars (ymin == ymax) marking the partial correlations
#   lineranges   lower/upper interval of the partial correlations (dark gray)
#   text         significance labels; black for direction 'lower', red for 'higher'
# `fill=yv` on the linerange/errorbar layers is not a real aesthetic of those
# geoms; it appears to be there only so the layers are grouped by yv for dodging
# (TODO confirm before replacing it with `group=yv`).
crosslinguistic_predict_len = ggplot(correlations) +
    geom_bar(aes(x=languageLabel, y=singleCor, fill=yv), stat='identity', position='dodge') +
    scale_fill_manual(values=c(col_gray, col_yellow)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    theme(panel.background = element_blank(),
          axis.text.x = element_text(colour = "black"),
          axis.text.y = element_text(colour = "black"),
          axis.line.x = element_line(color="black", size = .25),
          axis.line.y = element_line(color="black", size = .25)) +
    geom_hline(aes(yintercept = 0), linetype = 'dashed') +
    ylab(expression(paste("Spearman's ",rho))) +
    xlab('') +
    theme(legend.position="none") +
    geom_point(aes(x= languageLabel, y=-.1, size=numTokens)) +
    scale_size_area() +
    geom_hline(aes(yintercept = -.05), size=.25) +
    geom_linerange(aes(x=languageLabel, ymin=lower, ymax=upper, fill=yv),
                   position=dodge, colour='gray') +
    # partial_correlations already excludes OPUS, so it is used directly.
    geom_errorbar(data = partial_correlations,
                  aes(x=languageLabel, ymin=singleCor, ymax=singleCor, fill=yv),
                  position='dodge') +
    geom_linerange(data = partial_correlations,
                   aes(x=languageLabel, ymin=lower, ymax=upper, fill=yv),
                   position=dodge, colour='grey45') +
    geom_text(data=subset(p_values, direction == 'lower' & !residualize & dataset != "OPUS"),
              aes(x=languageLabel, y=.35, label=sig)) +
    # Fix: colour='red' used to sit inside aes(), which maps the constant string
    # through the default colour scale instead of drawing the text in red. It is
    # now a fixed parameter, matching the residualized 'higher' layer below.
    geom_text(data=subset(p_values, direction == 'higher' & !residualize & dataset != "OPUS"),
              aes(x=languageLabel, y=.35, label=sig), colour='red') +
    geom_text(data=subset(p_values, direction == 'lower' & residualize & dataset != "OPUS"),
              aes(x=languageLabel, y=-.02, label=sig)) +
    geom_text(data=subset(p_values, direction == 'higher' & residualize & dataset != "OPUS"),
              aes(x=languageLabel, y=-.02, label=sig), colour='red')

# Write the figure to disk.
pdf('figures/crosslinguistic-unigramTrigramLength_noFilter.pdf', width=10, height=5)
print(crosslinguistic_predict_len)
dev.off()

# Also render it inline in the notebook as PNG.
options(repr.plot.width=10, repr.plot.height=6, jupyter.plot_mimetypes = 'image/png')
print(crosslinguistic_predict_len)

In [None]:
# Inspect the words kept for English (the first entry of `languages`) to judge
# whether they were dictionary filtered. This prints the full word column.
withoutEndMarker[[1]][['df']][['word']]

In [None]:
# The English words above look likely to be dictionary filtered.
# Spot-check the aspell helper on three strings with the English language code.
aspell(
    c('ssh', 'fish', 'glarp'),
    languageCode = 'en'
)