# **Generating a complete dataset**
This notebook generates a file named `dataset_P_L25.csv` in the Google Drive of the logged-in account. If you do not want to write to your Google Drive, change the output path in the last cell.  

In [None]:
# Mount Google Drive at /content/drive/; the last cell of this notebook writes
# the generated CSV under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
# Pin rpy2 to 3.5.1 (in the recorded run this replaced a preinstalled rpy2 3.5.5).
!pip install rpy2==3.5.1

Collecting rpy2==3.5.1
  Downloading rpy2-3.5.1.tar.gz (201 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/201.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m194.6/201.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.7/201.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rpy2
  Building wheel for rpy2 (setup.py) ... [?25l[?25hdone
  Created wheel for rpy2: filename=rpy2-3.5.1-cp310-cp310-linux_x86_64.whl size=318086 sha256=a4c9b45e1517983a7d14d95c9a29349b9120c064da324b9d7426bd6a02bf6744
  Stored in directory: /root/.cache/pip/wheels/73/a6/ff/4e75dd1ce1cfa2b9a670cbccf6a1e41c553199e9b25f05d953
Successfully built rpy2
Installing collected packages: rpy2
  Attempting uninstall: rpy2
    Found existing installation: rpy2 3.5.5


In [None]:
# Enable the %R / %%R magics used by the R cells in the rest of the notebook.
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [None]:
# Download semrelations.zip (all languages) from Google Drive and unzip it.
!gdown '1RH8U3TGbUtSuSHk32byeQ2IJ4vn850WT'

# If the link above does not work, uncomment the next line. It saves the archive
# as semrelations.zip so that the unzip step below finds it.
#!wget --load-cookies /tmp/cookies.txt "https://drive.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://drive.google.com/uc?export=download&id=1RH8U3TGbUtSuSHk32byeQ2IJ4vn850WT' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1RH8U3TGbUtSuSHk32byeQ2IJ4vn850WT" -O semrelations.zip && rm -rf /tmp/cookies.txt

!unzip -o semrelations.zip

Downloading...
From: https://drive.google.com/uc?id=1RH8U3TGbUtSuSHk32byeQ2IJ4vn850WT
To: /content/semrelations.zip
100% 477M/477M [00:10<00:00, 46.1MB/s]
Archive:  semrelations.zip
  inflating: semrelations.csv        


### **Load the semrelations.csv dataset**
*semrelations.csv* is a csv text file containing in each line:
- **subject**: wikidata id for a concept.
- **object**: wikidata id for a concept.
- **relation_type**: the relation name between subject and object concepts. One of "antonym for", "holonym for", "hyperonym for", "hyponym for", "meronym for" and "random".
- **property**: wikidata id for the property that supports the relation type between the concepts.
- **[id]_label_subject** and **[id]_label_object**: translations in $40$ languages (where *id* is the ISO language identifier) for the subject and object concepts. An *\N* means that there is no label (translation) for that language. For any given language, the subject and object labels are either both \N or both present.

In [None]:
%%R
# Load the relations table. "\N" marks a missing label and is read as NA.
# The encoding value must be spelled "UTF-8": read.csv only recognises
# "UTF-8" and "latin1" here, and any other spelling is silently ignored.
data_rels = read.csv("semrelations.csv", encoding = "UTF-8", na.strings = "\\N")
# relation_type is categorical; later cells iterate over its levels.
data_rels$relation_type = as.factor(data_rels$relation_type)

In [None]:
%%R
# Number of relations (rows) loaded from semrelations.csv.
dim(data_rels)[1]

[1] 2605088


## **Filtering**
### **0. Find a *complete* dataset for $N$ languages**
A dataset is said to be *complete for $N$ languages* if both the subject and the object concepts have a translation for $N$ languages.

In [None]:
%%R
# The label columns start at position 5 (after subject, object, relation_type
# and property).
columns = names(data_rels)
languages = columns[seq(5, length(columns), 1)]

# Rows with a label, per label column, from best to worst covered.
total_per_lang = sort(
    sapply(languages, function(col) sum(!is.na(data_rels[, col]))),
    decreasing = TRUE
)

# Rows that are complete for the k best-covered label columns, for every k.
total_complete = sapply(seq_along(total_per_lang), function(k) {
    sum(complete.cases(data_rels[, names(total_per_lang)[1:k]]))
})

In [None]:
%%R
# Build a per-language summary table: rows with a label, the breakdown of those
# rows by relation type, and each relation's share of the language total.
total_per_lang_viz = as.data.frame(total_per_lang)
langs = row.names(total_per_lang_viz)
# Keep every second name and strip everything from the first "_" on
# (e.g. "en_label_subject" -> "en").
# NOTE(review): this assumes the sorted vector keeps each language's subject and
# object columns adjacent — confirm.
langs = sapply(seq(1,length(langs),2), function(i) gsub("_.*","",langs[i]))
total_per_lang_viz = total_per_lang_viz[seq(1,nrow(total_per_lang_viz),2),,drop=F]
# Row names become "<lang>_<rank>", e.g. "en_1".
row.names(total_per_lang_viz) = paste(langs, 1:length(langs), sep = "_")

# For each label column, count the rows that have a label, split by relation type.
per_rel = sapply(names(total_per_lang), function(n) {table(data_rels[complete.cases(data_rels[,n]),c("relation_type",n)]$relation_type)})

# Again keep one column per language, then append the counts to the table.
per_rel = per_rel[,seq(1,ncol(per_rel),2)]
total_per_lang_viz = cbind(total_per_lang_viz, t(per_rel))
# Percentage column names: each relation column name followed by " %".
perc_columns = paste(names(total_per_lang_viz[-1]), "%")
# Per row, divide every relation count by the row total (first column).
perc_cols = as.data.frame(apply(total_per_lang_viz,1, function(r) r[-1]/r[1]))
row.names(perc_cols) = perc_columns

total_per_lang_viz =cbind(total_per_lang_viz, t(perc_cols))

In [None]:
# move R variable to python and display it
%R -o total_per_lang_viz
total_per_lang_viz

Unnamed: 0,total_per_lang,antonym for,holonym for,hyponym for,meronym for,random,antonym for %,holonym for %,hyponym for %,meronym for %,random %
en_1,1759788,8159,74145,1025225,63920,588339,0.004636,0.042133,0.582584,0.036323,0.334324
fr_2,1023127,4633,34904,373639,22357,587594,0.004528,0.034115,0.365193,0.021852,0.574312
de_3,901217,4399,26055,267670,15832,587261,0.004881,0.028911,0.297009,0.017567,0.651631
nl_4,890125,3385,25203,259261,14685,587591,0.003803,0.028314,0.291264,0.016498,0.660122
es_5,880311,4444,25223,247108,15299,588237,0.005048,0.028652,0.280705,0.017379,0.668215
ru_6,812811,4764,25283,179149,15158,588457,0.005861,0.031106,0.220407,0.018649,0.723978
it_7,800372,3094,20545,177609,11958,587166,0.003866,0.025669,0.221908,0.014941,0.733616
ja_8,793505,4293,25282,164464,14368,585098,0.00541,0.031861,0.207263,0.018107,0.737359
ca_9,788306,3782,18576,166578,12393,586977,0.004798,0.023564,0.211311,0.015721,0.744606
zh_10,751241,3208,19576,131328,11186,585943,0.00427,0.026058,0.174815,0.01489,0.779967


In [None]:
%%R
# Get a "complete" dataset with TOTAL_LANGUAGES languages (greedy heuristic:
# take the label columns with the highest coverage, in the order of
# total_per_lang, and keep only the rows that have a label in all of them).

TOTAL_LANGUAGES = 25
# total_per_lang has one entry per label column (subject and object), so
# N languages correspond to its first 2*N names.
ind = 2*TOTAL_LANGUAGES
stopifnot(ind <= length(total_per_lang))

complete = complete.cases(data_rels[,names(total_per_lang)[1:ind]])
data_filt_0 = data_rels[complete,c("subject","object","relation_type", "property", names(total_per_lang)[1:ind])]

# Rows per relation type. Columns are referenced by their full names rather
# than through `$` partial matching.
total_rel_0 = sapply(levels(data_filt_0$relation_type), function (r) sum(data_filt_0$relation_type == r))
stats_dataset_0 = as.data.frame(total_rel_0)
stats_dataset_0$percentage = stats_dataset_0$total_rel_0/sum(total_rel_0)
print(total_rel_0)
print(sum(total_rel_0))

antonym for holonym for hyponym for meronym for      random 
        335        1873        5819        1042      412893 
[1] 421962


In [None]:
# Move the R data frame into the Python namespace (-o) and display it.
%R -o stats_dataset_0
stats_dataset_0

Unnamed: 0,total_rel_0,percentage
antonym for,335,0.000794
holonym for,1873,0.004439
hyponym for,5819,0.01379
meronym for,1042,0.002469
random,412893,0.978508


#### **Check digit occurrences**

In [None]:
%%R
# One or more consecutive digits anywhere in a label.
digits_re = "[0-9]+"

# Logical matrix, one column per label column: TRUE where the label has a digit.
digits_indx = sapply(data_filt_0[,5:ncol(data_filt_0)], function(labels) grepl(digits_re, labels))

In [None]:
%%R
# Number of labels containing digits, per label column.
total_digits_per_lang = apply(digits_indx, MARGIN = 2, FUN = sum)
print(total_digits_per_lang)

en_label_subject  en_label_object fr_label_subject  fr_label_object 
               0                0                0                0 
de_label_subject  de_label_object nl_label_subject  nl_label_object 
               0                0                0                0 
es_label_subject  es_label_object ru_label_subject  ru_label_object 
               0                0                0                0 
it_label_subject  it_label_object ja_label_subject  ja_label_object 
               0                0                0                0 
ca_label_subject  ca_label_object zh_label_subject  zh_label_object 
               0                0                0                0 
pt_label_subject  pt_label_object ar_label_subject  ar_label_object 
               0                0                0                0 
sv_label_subject  sv_label_object pl_label_subject  pl_label_object 
               0                0                0                0 
uk_label_subject  uk_label_object 

### **1. Remove one-character concepts**
The English labels are used for this check.

In [None]:
%%R
# Flag labels made of exactly one character. Columns 5 and 6 are the first
# subject/object label pair.
one_char_indx_subj = grepl("^.$", data_filt_0[[5]])
one_char_indx_obj = grepl("^.$", data_filt_0[[6]])
print(sum(one_char_indx_subj))
print(sum(one_char_indx_obj))

# Distinct one-character labels found on either side of a relation.
one_char_concepts = unique(c(
    data_filt_0[[5]][one_char_indx_subj],
    data_filt_0[[6]][one_char_indx_obj]
))
print(length(one_char_concepts))
print(one_char_concepts)

[1] 1106
[1] 767
[1] 15
 [1] "↓"  "T"  "ª"  "º"  "ڧ"  "᠁"  "⋰"  "٭"  "⋮"  "◊"  "‐"  "⋯"  "︙" "⋱"  "…" 


In [None]:
%%R
# Keep the rows where neither the subject nor the object label is one character.
data_filt_1 = data_filt_0[!(one_char_indx_subj | one_char_indx_obj),]
# Sanity check: no one-character label should remain.
one_char_indx = grepl("^.$", c(data_filt_1[[5]], data_filt_1[[6]]))
sum(one_char_indx)

[1] 0


In [None]:
%%R
# Rows per relation type after filter 1. The column is referenced by its full
# name, relation_type, instead of relying on `$` partial matching.
total_rel_1 = sapply(levels(data_filt_1$relation_type), function (r) sum(data_filt_1$relation_type == r))
print(total_rel_1)
print(sum(total_rel_1))

antonym for holonym for hyponym for meronym for      random 
        335        1873        5809        1041      411033 
[1] 420091


### **2. Remove concepts with ')' at the end**
If a label in any language satisfies the above condition (it ends with a ')'), then the corresponding relation between the involved concepts is pruned. In other words, the corresponding line of the semrelations.csv file is removed, so the relation is removed for all languages. This pruning method is also used for the next filters.

In [None]:
%%R
# A closing parenthesis at the end of the label, optionally followed by whitespace.
par_end_re = "[\\)]\\s*$"

# Logical matrix, one column per label column: TRUE where the label matches.
par_end_indx = sapply(data_filt_1[,5:ncol(data_filt_1)], function(labels) grepl(par_end_re, labels))


In [None]:
%%R
# A row is flagged when at least one of its labels matched.
all_par_end_indx = rowSums(par_end_indx) > 0

# Matches per label column.
total_par_per_lang = apply(par_end_indx, MARGIN = 2, FUN = sum)

print(sum(all_par_end_indx))
print(total_par_per_lang)

[1] 26625
en_label_subject  en_label_object fr_label_subject  fr_label_object 
               0                0                0                0 
de_label_subject  de_label_object nl_label_subject  nl_label_object 
             346              167                1                0 
es_label_subject  es_label_object ru_label_subject  ru_label_object 
             230                0              563              249 
it_label_subject  it_label_object ja_label_subject  ja_label_object 
             273               83             1005              656 
ca_label_subject  ca_label_object zh_label_subject  zh_label_object 
               1                0              183              159 
pt_label_subject  pt_label_object ar_label_subject  ar_label_object 
             167              351              491              800 
sv_label_subject  sv_label_object pl_label_subject  pl_label_object 
             279              235              412              176 
uk_label_subject  uk_lab

In [None]:
%%R
# Distinct matching labels in the 9th label column (data column 4+9).
examples_par = unique(data_filt_1[[4+9]][par_end_indx[,9]])
print(length(examples_par))
examples_par

[1] 2
[1] "óxido de titanio(IV)"                 
[2] "ciudad imperial libre (Sacro Imperio)"


In [None]:
%%R
# Drop every relation that has a flagged label in any language.
data_filt_2 = data_filt_1[which(!all_par_end_indx),]
print(dim(data_filt_2)[1])

[1] 393466


In [None]:
%%R
# Rows per relation type after filter 2. The column is referenced by its full
# name, relation_type, instead of relying on `$` partial matching.
total_rel_2 = sapply(levels(data_filt_2$relation_type), function (r) sum(data_filt_2$relation_type == r))
print(total_rel_2)
print(sum(total_rel_2))

antonym for holonym for hyponym for meronym for      random 
        313        1761        5464         972      384956 
[1] 393466


### **3. Text in parentheses followed by a non-blank character**

In [None]:
%%R
# A parenthesised group immediately followed by a non-whitespace character.
par2_re = "\\([^\\)]+\\)\\S"

# Logical matrix, one column per label column: TRUE where the label matches.
par2_indx = sapply(data_filt_2[,5:ncol(data_filt_2)], function(labels) grepl(par2_re, labels))

In [None]:
%%R
# Matches per label column.
total_par2_indx = apply(par2_indx, MARGIN = 2, FUN = sum)
print(total_par2_indx)

en_label_subject  en_label_object fr_label_subject  fr_label_object 
              98              239               96              239 
de_label_subject  de_label_object nl_label_subject  nl_label_object 
             317              400                1              102 
es_label_subject  es_label_object ru_label_subject  ru_label_object 
               0                0                0                0 
it_label_subject  it_label_object ja_label_subject  ja_label_object 
               0                0                0                0 
ca_label_subject  ca_label_object zh_label_subject  zh_label_object 
               0                0                0                0 
pt_label_subject  pt_label_object ar_label_subject  ar_label_object 
               0                0                0                0 
sv_label_subject  sv_label_object pl_label_subject  pl_label_object 
               0                0              317              400 
uk_label_subject  uk_label_object 

In [None]:
%%R
# Distinct matching labels in the 2nd label column (data column 6).
unique(data_filt_2[[6]][par2_indx[,2]])

[1] "(−)-nicotine"   "(RS)-methadone"


In [None]:
%%R
# A row is flagged when at least one of its labels matched.
all_par2_indx = rowSums(par2_indx) > 0
# Flagged rows that are "random" relations, then all flagged rows.
print(sum(all_par2_indx & data_filt_2$relation_type == "random"))
print(sum(all_par2_indx))

[1] 805
[1] 820


In [None]:
%%R
# Drop every relation that has a flagged label in any language.
data_filt_3 = data_filt_2[which(!all_par2_indx),]
print(dim(data_filt_3)[1])

[1] 392646


In [None]:
%%R
# Rows per relation type after filter 3. The column is referenced by its full
# name, relation_type, instead of relying on `$` partial matching.
total_rel_3 = sapply(levels(data_filt_3$relation_type), function (r) sum(data_filt_3$relation_type == r))
print(total_rel_3)
print(sum(total_rel_3))

antonym for holonym for hyponym for meronym for      random 
        313        1757        5453         972      384151 
[1] 392646


### **4. Lowercase letter followed by a capital letter**

In [None]:
%%R
# A lowercase letter immediately followed by an uppercase letter (Unicode
# classes, hence perl=TRUE).
lowup_re = "\\p{Ll}\\p{Lu}"
# Logical matrix, one column per label column: TRUE where the label matches.
lowup_indx = sapply(data_filt_3[,5:ncol(data_filt_3)], function(labels) grepl(lowup_re, labels, perl=TRUE))

In [None]:
%%R
# Distinct matching labels in the first subject/object label pair
# (data columns 5 and 6).
low_up_concepts = unique(c(
    data_filt_3[[5]][lowup_indx[,1]],
    data_filt_3[[6]][lowup_indx[,2]]
))
print(length(low_up_concepts))
low_up_concepts

[1] 26
 [1] "PlayStation Portable"                
 [2] "wide area networkICR Co data base"   
 [3] "iOS"                                 
 [4] "Nintendo GameCube"                   
 [5] "OpenDocument"                        
 [6] "WikiProject"                         
 [7] "openSUSE"                            
 [8] "PlayStation"                         
 [9] "BitTorrent"                          
[10] "pH"                                  
[11] "iPod"                                
[12] "LaTeX"                               
[13] "iPod Touch"                          
[14] "HyperText Markup Language"           
[15] "ORCID iD"                            
[16] "MacBook Pro"                         
[17] "BlackBerry"                          
[18] "macOS"                               
[19] "iPad"                                
[20] "YouTuber"                            
[21] "iPhone"                              
[22] "Extensible HyperText Markup Language"
[23] "PlayStation Vita"  

In [None]:
%%R
# Matches per label column.
total_lowup_indx = apply(lowup_indx, MARGIN = 2, FUN = sum)
print(total_lowup_indx)

en_label_subject  en_label_object fr_label_subject  fr_label_object 
            1595             1760             1116             1215 
de_label_subject  de_label_object nl_label_subject  nl_label_object 
            1709             1161             1424             1378 
es_label_subject  es_label_object ru_label_subject  ru_label_object 
            1410             1538             1433             1300 
it_label_subject  it_label_object ja_label_subject  ja_label_object 
            1203             1138             1366             1224 
ca_label_subject  ca_label_object zh_label_subject  zh_label_object 
            1133              974             1147              974 
pt_label_subject  pt_label_object ar_label_subject  ar_label_object 
            1410             1292                0                0 
sv_label_subject  sv_label_object pl_label_subject  pl_label_object 
             768              396             1452             1137 
uk_label_subject  uk_label_object 

In [None]:
%%R
# Flag a row when any of its labels has a lowercase letter followed by a capital.
# The loop iterates over the columns of lowup_indx itself, so it does not depend
# on an index matrix built for a different filter.
all_lowup_indx= rep(F, nrow(lowup_indx))
for (c in 1:ncol(lowup_indx)) {
    all_lowup_indx = all_lowup_indx |  lowup_indx[,c]
}
# Flagged rows that are "random" relations, then all flagged rows.
print(sum(all_lowup_indx & data_filt_3$relation_type == "random"))
print(sum(all_lowup_indx))

[1] 5138
[1] 5171


In [None]:
%%R
# Drop every relation that has a flagged label in any language.
data_filt_4 = data_filt_3[which(!all_lowup_indx), ]

In [None]:
%%R
# Rows per relation type after filter 4. The column is referenced by its full
# name, relation_type, instead of relying on `$` partial matching.
total_rel_4 = sapply(levels(data_filt_4$relation_type), function (r) sum(data_filt_4$relation_type == r))
print(total_rel_4)
print(sum(total_rel_4))

antonym for holonym for hyponym for meronym for      random 
        313        1753        5427         969      379013 
[1] 387475


### **5. *Funny* characters**

In [None]:
%%R
# Brackets, punctuation and other symbols that mark a label as "funny".
rare_content_re = "\\]|\\[|[/\\,.:<>!¡¿?&%#@|?¿;=$\\(\\){}]"

# Logical matrix, one column per label column: TRUE where the label matches.
rare_content_indx = sapply(data_filt_4[,5:ncol(data_filt_4)], function(labels) grepl(rare_content_re, labels))

# Matches per label column.
total_content_indx= apply(rare_content_indx, MARGIN = 2, FUN = sum)
print(total_content_indx)

en_label_subject  en_label_object fr_label_subject  fr_label_object 
             177              436              306               89 
de_label_subject  de_label_object nl_label_subject  nl_label_object 
             335              174              489              424 
es_label_subject  es_label_object ru_label_subject  ru_label_object 
             163              172              159              720 
it_label_subject  it_label_object ja_label_subject  ja_label_object 
             156              161              475                1 
ca_label_subject  ca_label_object zh_label_subject  zh_label_object 
             234              417               91              331 
pt_label_subject  pt_label_object ar_label_subject  ar_label_object 
             170              498              233                2 
sv_label_subject  sv_label_object pl_label_subject  pl_label_object 
             956              817              675              919 
uk_label_subject  uk_label_object 

In [None]:
%%R
# Distinct matching labels in the 9th and 10th label columns
# (data columns 4+9 and 4+10).
funny_concepts = unique(c(
    data_filt_4[[4+9]][rare_content_indx[,9]],
    data_filt_4[[4+10]][rare_content_indx[,10]]
))
print(length(funny_concepts))
head(funny_concepts, 20)

[1] 5
[1] "Joomla!"                       "edición, traducción o versión"
[3] "entrada/salida"                "obra/composición musical"     
[5] "Xbox Series X|S"              


In [None]:
%%R
# A row is flagged when at least one of its labels matched.
all_rare_content_indx = rowSums(rare_content_indx) > 0
# Flagged rows that are "random" relations, then all flagged rows.
print(sum(all_rare_content_indx & data_filt_4$relation_type == "random"))
print(sum(all_rare_content_indx))

[1] 9046
[1] 9205


In [None]:
%%R
# Drop every relation that has a flagged label in any language.
data_filt_5 = data_filt_4[which(!all_rare_content_indx),]
print(dim(data_filt_5)[1])

[1] 378270


In [None]:
%%R
# Rows per relation type after filter 5. The column is referenced by its full
# name, relation_type, instead of relying on `$` partial matching.
total_rel_5 = sapply(levels(data_filt_5$relation_type), function (r) sum(data_filt_5$relation_type == r))
print(total_rel_5)
print(sum(total_rel_5))

antonym for holonym for hyponym for meronym for      random 
        296        1727        5328         952      369967 
[1] 378270


### **Create hyperonyms**

In [None]:
%%R
# Work on a copy so that data_filt_5 stays untouched.
data_filt_hyper = data_filt_5

# Row positions of the "hyponym for" relations.
hyponyms_filter = data_filt_hyper$relation_type == "hyponym for"
hyponyms_indx = which(hyponyms_filter)
length(hyponyms_indx)

[1] 5328


In [None]:
%%R
# Turn a random half of the "hyponym for" relations into "hyperonym for"
# relations by reversing them: subject and object ids are swapped, every
# subject/object label pair is swapped, and the property gets an "_inv" suffix.

# Work with plain strings so that the new level "hyperonym for" can be assigned.
data_filt_hyper$relation_type = as.character(data_filt_hyper$relation_type)
set.seed(32)
hyper = sample(seq_along(hyponyms_indx), 0.5*length(hyponyms_indx))
hyper_indx = hyponyms_indx[hyper]

# The swaps are done for all selected rows at once instead of one cell at a
# time; the guard keeps an empty selection a no-op.
if (length(hyper_indx) > 0) {
    # Columns 1 and 2 hold the subject and object ids.
    data_filt_hyper[hyper_indx, c(1, 2)] = data_filt_hyper[hyper_indx, c(2, 1)]
    data_filt_hyper[hyper_indx, 3] = "hyperonym for"
    data_filt_hyper[hyper_indx, 4] = paste(data_filt_hyper[hyper_indx, 4], "inv", sep = "_")

    # From column 5 on, columns come in (subject label, object label) pairs.
    subj_cols = seq(5, ncol(data_filt_hyper), 2)
    obj_cols = subj_cols + 1
    subj_labels = data_filt_hyper[hyper_indx, subj_cols]
    data_filt_hyper[hyper_indx, subj_cols] = data_filt_hyper[hyper_indx, obj_cols]
    data_filt_hyper[hyper_indx, obj_cols] = subj_labels
}

data_filt_hyper$relation_type = as.factor(data_filt_hyper$relation_type)

In [None]:
%%R
# Number of hyperonym relations created, and the resulting relation levels.
print(length(which(data_filt_hyper$relation_type == "hyperonym for")))
print(levels(data_filt_hyper$relation_type))

[1] 2664
[1] "antonym for"   "holonym for"   "hyperonym for" "hyponym for"  
[5] "meronym for"   "random"       


### **Duplicates**

In [None]:
%%R
# Rows that repeat an earlier (subject, object, relation_type) triple, counted
# per relation type. The column is referenced by its full name, relation_type,
# instead of relying on `$` partial matching.
d = data_filt_hyper[duplicated(data_filt_hyper[,1:3]),]
total_rel_dup = sapply(levels(d$relation_type), function (r) sum(d$relation_type == r))
print(total_rel_dup)
print(sum(total_rel_dup))

  antonym for   holonym for hyperonym for   hyponym for   meronym for 
            0            44             0             0             7 
       random 
            0 
[1] 51


In [None]:
%%R
# Keep only the first occurrence of each (subject, object, relation_type) triple.
data_filt_hyper = data_filt_hyper[which(!duplicated(data_filt_hyper[, c(1, 2, 3)])),]

### **Remove random relations down to 70%**

In [None]:
%%R
# Rows per relation type and their share of the dataset, before the random
# relations are down-sampled. The count column is referenced by its full name,
# total_rel_hyper, instead of relying on `$` partial matching.
total_rel_hyper = sapply(levels(data_filt_hyper$relation_type), function (r) sum(data_filt_hyper$relation_type == r))
stats_dataset_hyper = as.data.frame(total_rel_hyper)
stats_dataset_hyper$percentage = stats_dataset_hyper$total_rel_hyper/sum(total_rel_hyper)

In [None]:
# Move the R data frame into the Python namespace (-o) and display it.
%R -o stats_dataset_hyper
stats_dataset_hyper

Unnamed: 0,total_rel_hyper,percentage
antonym for,296,0.000783
holonym for,1683,0.00445
hyperonym for,2664,0.007044
hyponym for,2664,0.007044
meronym for,945,0.002499
random,369967,0.978182


In [None]:
%%R
# Down-sample the "random" relations so that they make up at most
# MAX_PERC_RANDOM of the final dataset.

# max percentage of random relations
MAX_PERC_RANDOM = 0.7

n_random = stats_dataset_hyper['random', 'total_rel_hyper']
n_total = sum(stats_dataset_hyper$total_rel_hyper)
# Solving (n_random - x) / (n_total - x) = MAX_PERC_RANDOM for x gives the
# number of random rows to drop. It is rounded down explicitly, which is what
# sample() would otherwise do implicitly with a fractional size.
number_random_remove = floor((n_random - MAX_PERC_RANDOM*n_total)/(1-MAX_PERC_RANDOM))

indx_random = which(data_filt_hyper$relation_type == "random")
if (number_random_remove > 0) {
    # NOTE: no seed is set here; the draw continues from the RNG state left by
    # set.seed(32) in the hyperonym cell, so it is reproducible only when the
    # notebook is run top to bottom.
    indx_remove_random = indx_random[sample.int(length(indx_random), number_random_remove)]
    data_filt_final = data_filt_hyper[-indx_remove_random,]
} else {
    # Nothing to remove. Indexing with an empty negative index would drop
    # every row, so the data is kept as is.
    indx_remove_random = integer(0)
    data_filt_final = data_filt_hyper
}

# Rows per relation type and their share in the final dataset.
total_rel_final = sapply(levels(data_filt_final$relation_type), function (r) sum(data_filt_final$relation_type == r))
stats_dataset_final = as.data.frame(total_rel_final)
stats_dataset_final$percentage = stats_dataset_final$total_rel_final/sum(total_rel_final)
print(total_rel_final)
print(sum(total_rel_final))

  antonym for   holonym for hyperonym for   hyponym for   meronym for 
          296          1683          2664          2664           945 
       random 
        19255 
[1] 27507


In [None]:
# Move the R data frame into the Python namespace (-o) and display it.
%R -o stats_dataset_final

stats_dataset_final

Unnamed: 0,total_rel_final,percentage
antonym for,296,0.010761
holonym for,1683,0.061184
hyperonym for,2664,0.096848
hyponym for,2664,0.096848
meronym for,945,0.034355
random,19255,0.700004


In [None]:
%%R
# Concept statistics for the final dataset, with and without random relations.
data_filt_final_not_random = data_filt_final[which(data_filt_final$relation_type != "random"),]

# Distinct concept ids appearing as subject or object.
print(length(unique(with(data_filt_final_not_random, c(subject, object)))))
print(length(unique(with(data_filt_final, c(subject, object)))))

# Occurrences of each concept id, most frequent first.
t_not_random = sort(table(with(data_filt_final_not_random, c(subject, object))), decreasing = TRUE)
t_total = sort(table(with(data_filt_final, c(subject, object))), decreasing = TRUE)

print(head(t_not_random, 20))
print(head(t_total, 20))
print(length(names(t_not_random)))
# Distribution of the number of occurrences per concept.
print(summary(as.integer(t_not_random)))
print(summary(as.integer(t_total)))

[1] 5778
[1] 7233

     Q623      Q629   Q107715      Q556      Q420      Q627   Q214609   Q212434 
      119        83        79        70        52        46        41        39 
     Q283  Q1914636     Q8162     Q2095    Q43229     Q7239    Q11426  Q1190554 
       38        34        33        32        30        29        28        28 
 Q1441305 Q16334295     Q5891    Q41176 
       27        27        27        26 

    Q623     Q629     Q556  Q107715     Q420     Q283  Q212434     Q627 
     128       92       84       79       67       49       48       46 
 Q214609   Q41176    Q8162  Q746549   Q11002 Q1914636  Q618779 Q1190554 
      44       38       38       37       36       36       35       34 
  Q19557 Q3294789  Q386724 Q1656682 
      34       34       34       33 
[1] 5778
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.000   2.000   2.856   3.000 119.000 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   3.000   6.000   7.606  11.000 128.000 


In [None]:
# Create the output folder on the mounted Drive; -p makes this a no-op if it exists.
!mkdir -p /content/drive/MyDrive/complete_dataset/

In [None]:
%%R
# Write the final dataset to Drive as dataset_P_L<TOTAL_LANGUAGES>.csv, without row names.
write.csv(
    data_filt_final,
    paste0("/content/drive/MyDrive/complete_dataset/dataset_P_L", TOTAL_LANGUAGES, ".csv"),
    row.names = FALSE
)