# Exploring spaCy across Different Languages

In [None]:
import pandas as pd
import spacy
from numpy import log

In [None]:
# Shell out to spaCy's download command for the small Japanese, Spanish and
# English model packages that are loaded with spacy.load() further down.
# NOTE(review): `python3` here is whatever the shell resolves, which is
# presumably the same interpreter as the kernel — confirm on non-Colab setups.
!python3 -m spacy download ja_core_news_sm --quiet
!python3 -m spacy download es_core_news_sm --quiet
!python3 -m spacy download en_core_web_sm --quiet

[K     |████████████████████████████████| 12.0 MB 4.4 MB/s 
[K     |████████████████████████████████| 2.2 MB 4.3 MB/s 
[?25h  Building wheel for sudachidict-core (setup.py) ... [?25l[?25hdone
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ja_core_news_sm')
[K     |████████████████████████████████| 12.9 MB 4.2 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[K     |████████████████████████████████| 12.8 MB 4.6 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# Load one pipeline per language (English, Spanish, Japanese), in that order.
nlp_en,nlp_sp,nlp_jp=(
  spacy.load(package)
  for package in ('en_core_web_sm','es_core_news_sm','ja_core_news_sm')
)

In [None]:
# One example sentence per language; per the original note, the non-English
# versions presumably come from Google Translate.
english_sent = 'The cat is on the mat.'
spanish_sent = 'el gato está en la colchoneta.'
japanese_sent = '猫はマットの上にいる.'

In [None]:
# Run each example sentence through the pipeline for its language.
en_proc,sp_proc,jp_proc=(
  nlp_en(english_sent),
  nlp_sp(spanish_sent),
  nlp_jp(japanese_sent),
)

In [None]:
# For each language, print one row per token: language label, surface form,
# lemma, POS tag, surface form of the head, and dependency label. A dashed
# rule separates consecutive languages.
parsed_examples=(('English',en_proc),('Spanish',sp_proc),('Japanese',jp_proc))
for position,(language,doc) in enumerate(parsed_examples):
  if position>0:
    print('-'*20)
  for token in doc:
    print(language,token.orth_,token.lemma_,token.pos_,token.head.orth_,token.dep_)

English The the DET cat det
English cat cat NOUN is nsubj
English is be AUX is ROOT
English on on ADP is prep
English the the DET mat det
English mat mat NOUN on pobj
English . . PUNCT is punct
--------------------
Spanish el el DET gato det
Spanish gato gato NOUN está nsubj
Spanish está estar VERB está ROOT
Spanish en en ADP colchoneta case
Spanish la el DET colchoneta det
Spanish colchoneta colchoneta NOUN está obl
Spanish . . PUNCT está punct
--------------------
Japanese 猫 猫 NOUN いる nsubj
Japanese は は ADP 猫 case
Japanese マット マット NOUN 上 nmod
Japanese の の ADP マット case
Japanese 上 上 NOUN いる obl
Japanese に に ADP 上 case
Japanese いる いる VERB いる ROOT
Japanese . . PUNCT いる punct


In [None]:
import nltk

# Download the three corpora quietly, then import their readers.
corpus_ids=['knbc','cess_esp','brown']
nltk.download(corpus_ids,quiet=True)
from nltk.corpus import knbc,cess_esp,brown

In [None]:
# Take the sentence at index 1000 from each corpus (a sequence of tokens).
en_sent,jp_sent,es_sent=(corpus.sents()[1000] for corpus in (brown,knbc,cess_esp))

# Rebuild a single space-separated string from each token sequence.
en,jp,es=(' '.join(words) for words in (en_sent,jp_sent,es_sent))

# Parse every string with the pipeline of its language.
en_proc,jp_proc,es_proc=nlp_en(en),nlp_jp(jp),nlp_sp(es)

In [None]:
# Print language label, surface form, lemma and POS tag for every token of the
# three parsed corpus sentences, with a dashed rule between languages.
languages=('English','Spanish','Japanese')
docs=(en_proc,es_proc,jp_proc)
for language,doc in zip(languages,docs):
  if language!=languages[0]:
    print('-'*20)
  for token in doc:
    print(language,token.orth_,token.lemma_,token.pos_)

English 800 800 NUM
English in in ADP
English Southern Southern PROPN
English New New PROPN
English England England PROPN
English , , PUNCT
English we we PRON
English have have VERB
English 60 60 NUM
English ; ; PUNCT
English ; ; PUNCT
--------------------
Spanish En en ADP
Spanish el el DET
Spanish mismo mismo DET
Spanish acto acto NOUN
Spanish electoral electoral ADJ
Spanish , , PUNCT
Spanish Maragall Maragall PROPN
Spanish ha haber AUX
Spanish expresado expresar VERB
Spanish la el DET
Spanish preocupación preocupación NOUN
Spanish de de ADP
Spanish su su DET
Spanish partido partido NOUN
Spanish por por ADP
Spanish la el DET
Spanish " " PUNCT
Spanish hemorragia hemorragia NOUN
Spanish " " PUNCT
Spanish de de ADP
Spanish actividad actividad NOUN
Spanish económica económico ADJ
Spanish que que PRON
Spanish se él PRON
Spanish traslada trasladar VERB
Spanish desde desde ADP
Spanish Cataluña Cataluña PROPN
Spanish a a ADP
Spanish Madrid Madrid PROPN
Spanish y y CCONJ
Spanish * * PUNCT
Spa

# Greenbergian Universals


In [None]:
from collections import defaultdict

def greenberg(sent_corpus,model):
  """Measure, per dependency relation, how often the dependent precedes its head.

  Parameters
  ----------
  sent_corpus : iterable of sequences of str
      Tokenised sentences. Each one is joined with single spaces and the
      resulting string is passed to ``model``.
  model : callable
      Called with the joined sentence; must return an iterable of tokens that
      expose ``i`` (position), ``dep_`` (relation label) and ``children``
      (e.g. a loaded spaCy pipeline).

  Returns
  -------
  defaultdict
      Maps every relation label seen on a dependent to
      (dependents placed before their head) / (all dependents with that label).
      The mapping has no default factory, so an unseen label raises KeyError.
  """
  #Dependents that come before their head (head-final), per relation
  head_final=defaultdict(int)
  #All dependents, per relation
  total=defaultdict(int)
  for sent in sent_corpus:
    proc=model(' '.join(sent))
    for head in proc:
      #Tokens without children simply contribute nothing here
      for child in head.children:
        #Key on the child's own relation to its head. child.head.dep_ would be
        #the *head's* label (e.g. a subject's determiner filed under 'nsubj',
        #a verb's subject filed under 'ROOT'), which is not the relation whose
        #word order is being measured.
        total[child.dep_]+=1
        if child.i<head.i:
          head_final[child.dep_]+=1
  output_df=defaultdict()
  #Every label in head_final is also in total, so total covers all keys and
  #each denominator is at least 1.
  for label,n_dependents in total.items():
    output_df[label]=head_final[label]/n_dependents
  return output_df

In [None]:
# Run greenberg() over the leading sentences of each corpus with the pipeline
# for that language, printing each result as soon as it is available.
n_sents=1000

en_orders=greenberg(brown.sents()[:n_sents],nlp_en)
print(en_orders)

jp_orders=greenberg(knbc.sents()[:n_sents],nlp_jp)
print(jp_orders)

es_orders=greenberg(cess_esp.sents()[:n_sents],nlp_sp)
print(es_orders)

defaultdict(None, {'xcomp': 0.4050632911392405, 'poss': 0.49382716049382713, 'csubj': 0.25, 'oprd': 0.7, 'appos': 0.5205992509363296, 'nsubjpass': 0.7237569060773481, 'nsubj': 0.7394034536891679, 'csubjpass': 1.0, 'npadvmod': 0.7407407407407407, 'acl': 0.3309178743961353, 'advcl': 0.563249001331558, 'ROOT': 0.44150624074465833, 'conj': 0.44051130776794495, 'ccomp': 0.5816956765861875, 'auxpass': 1.0, 'pcomp': 0.28019323671497587, 'aux': 0.0, 'pobj': 0.6990062761506276, 'prep': 0.01273100616016427, 'nmod': 0.4095238095238095, 'dobj': 0.6649968691296181, 'compound': 1.0, 'amod': 0.4971751412429379, 'mark': 0.2, 'cc': 0.8333333333333334, 'parataxis': 0.639344262295082, 'agent': 0.011111111111111112, 'advmod': 0.5348837209302325, 'relcl': 0.5781990521327014, 'attr': 0.592797783933518, 'acomp': 0.35294117647058826, 'preconj': 0.0, 'punct': 0.0, 'dative': 0.23076923076923078, 'nummod': 0.875968992248062, 'dep': 0.6363636363636364})
defaultdict(None, {'csubj': 0.37719298245614036, 'nsubj': 0.

In [None]:
# Print the greenberg() ratios stored under the subject, object and
# adjectival-modifier labels, in Japanese / English / Spanish order, with a
# blank line between groups. The English result is looked up under 'dobj'
# where the other two use 'obj'.
label_groups=(
  ('nsubj','nsubj','nsubj'),
  ('obj','dobj','obj'),
  ('amod','amod','amod'),
)
for position,(jp_label,en_label,es_label) in enumerate(label_groups):
  if position>0:
    print()
  print(jp_orders[jp_label])
  print(en_orders[en_label])
  print(es_orders[es_label])


0.4307342922028766
0.7394034536891679
0.4746734628862695

0.4713584288052373
0.6649968691296181
0.5438370846730975

1.0
0.4971751412429379
0.33699059561128525
