# Corpus arborés et parsing - 2024/25

In [None]:
# Checking the environment
!echo $VIRTUAL_ENV
# and...
!which python
!which pip

In [None]:
!pip install seaborn

## Partie 2

Et si je veux tagger et analyser syntaxiquement un texte ?

In [None]:
# A very naive segmentation: read the whole tale and split it on periods
with open("conte.txt", encoding="utf-8") as file:
    text = file.read()

# Keep only the non-empty chunks, stripped of their surrounding whitespace
sentences = [chunk.strip() for chunk in text.split('.') if chunk.strip()]
sentences[:3]

In [None]:
# Even better with spaCy
import spacy
from spacy import displacy
# Load the "fr_core_news_sm" pipeline (it must already be installed in the environment)
nlp = spacy.load("fr_core_news_sm")
# Parse the whole text in a single pass
doc = nlp(text)

In [None]:
# Materialize the sentence spans found by spaCy so the notebook displays them
list(doc.sents)

In [None]:
# For every sentence: print its root, then one line of annotations per token,
# followed by that token's direct dependents.
for sent in doc.sents:
    print(f"la racine est {sent.root.text}")
    for token in sent:
        # surface form, coarse POS tag, morphological features, lemma, dependency label
        print(token.text, token.pos_, token.morph, token.lemma_, token.dep_)

        # Find the children of the current token. This loop has to live inside the
        # token loop, otherwise only the last token of each sentence is inspected.
        for child in token.children:
            print(token, child.text)

# upos = [(token.text, token.pos_) for token in doc]
# print(upos)

In [None]:
#dir(doc)

In [None]:
# Show the dependency label and the head of the first three tokens of the first sentence
first_sentence = list(doc.sents)[0]
for token in first_sentence[:3]:
    print(f"token : {token.text}\t deprel : {token.dep_}\t token_head : {token.head.text}")

In [None]:
# Find the verbs that have a subject: take the head of every token whose
# dependency label is "nsubj", provided that head is tagged VERB
verbs = [
    tok.head
    for tok in doc
    if tok.dep_ == "nsubj" and tok.head.pos_ == "VERB"
]
print(verbs)

In [None]:
# Render the dependency parse of the first sentence with displaCy
displacy.render(list(doc.sents)[0], style='dep')

Exos

In [None]:
# Analysez avec spaCy les phrases du conte Le Joueur de flûte de Hamelin.
# a. Combien y a-t-il de mots ?
# b. Quelle est la phrase la plus longue ?
# c. Quelle est la longueur moyenne des phrases ?
# d. Quel est le mot le plus long ?
# e. Combien y a-t-il de verbes ?
# f. Tracez un barplot des dix mots les plus fréquents.

In [None]:
# Per-sentence statistics gathered from the naively segmented sentences:
#   sentences_len - number of tokens in each sentence, in the same order as `sentences`
#   tokens        - text of every token, across all sentences
#   verbs         - text of every token tagged VERB
sentences_len = []
tokens = []
verbs = []

# Parse one sentence at a time. nlp.pipe streams the sentences through the pipeline,
# and a dedicated loop variable keeps the full-document `doc` from being overwritten.
for sent_doc in nlp.pipe(sentences):
    sentences_len.append(len(sent_doc))
    for token in sent_doc:
        tokens.append(token.text)
        if token.pos_ == "VERB":
            verbs.append(token.text)

In [None]:
# Total number of tokens: every spaCy token was collected without filtering,
# so this presumably counts punctuation marks as well as words
len(tokens)

In [None]:
# Locate the longest sentence: take the position of the first maximal token count,
# then look up the sentence string stored at that same position
longest_len = max(sentences_len)
res = sentences_len.index(longest_len)
sentences[res]

In [None]:
# Average number of tokens per sentence: total token count divided by the number of sentences
len(tokens)/len(sentences_len)

In [None]:
# One (bad) way to get the longest token: it only remembers the first token
# that reaches the maximal length and silently ignores any ties
max_word_len = 0
res = ""  # reset so a value left over from a previous cell is never printed
for w in tokens:  # the token texts live in `tokens`; no `words` list is defined here
    if len(w) > max_word_len:
        max_word_len = len(w)
        res = w
print(res)

In [None]:
# A better way: collect every distinct token of maximal length, not just the first one
max_word_len = 0
res = []
for w in tokens:  # the token texts live in `tokens`; no `words` list is defined here
    if len(w) > max_word_len:
        # Strictly longer token: restart the list of candidates
        max_word_len = len(w)
        res = [w]
    elif len(w) == max_word_len and w not in res:
        # Tie with the current maximum: keep it, but list a repeated token only once
        res.append(w)
print(res)

In [None]:
# Number of tokens tagged VERB (occurrences, not distinct verbs)
len(verbs)

In [None]:
# Plot the 10 most frequent tokens
from collections import Counter
import seaborn as sns

counter = Counter(tokens)
# (token, count) pairs sorted by decreasing count
res = counter.most_common()
top10 = res[:10]

# Bar labels are the tokens themselves; bar heights are their frequencies
labels = [tok for tok, _ in top10]
freq = [count for _, count in top10]

sns.barplot(x=labels, y=freq)

In [None]:
# Encore à vous...
# a. Quelle est la distribution des dépendances à gauche et à droite ? Voir l'API spaCy : https://spacy.io/api/token
# b. Quelles sont les dépendances qui, dans au moins 90 % des cas, vont à gauche ? 
# c. Quel est le nombre maximal de dépendants pour un nœud ?
# d. Y a-t-il des dépendances non projectives ?

In [None]:
# a, b, and c

doc = nlp(text)

# One left/right counter per dependency label observed in the document
unique_deps = {tok.dep_ for tok in doc}
deps = {label: {'left': 0, 'right': 0} for label in unique_deps}

# Largest number of direct dependents seen on a single token (question c)
max_children = 0

for sent in doc.sents:
    for token in sent:
        children = list(token.children)

        # `token` is the head: compare character offsets to see on which side each child sits
        for child in children:
            if child.idx > token.idx:
                deps[child.dep_]['right'] += 1
            elif child.idx < token.idx:
                deps[child.dep_]['left'] += 1

        max_children = max(max_children, len(children))

In [None]:
# a: share of dependencies whose dependent is to the left / to the right of its head
n_left = 0
n_right = 0
for counts in deps.values():
    n_left += counts['left']
    n_right += counts['right']
total = n_left + n_right

print(n_left / total, n_right / total)

In [None]:
# b: dependency labels whose dependent is on the left in at least 90% of their occurrences
# (labels that never occur on the left are skipped, which also avoids dividing by zero)
mostly_left = [
    dep
    for dep, values in deps.items()
    if values['left']
    and values['left'] / (values['left'] + values['right']) >= 0.9
]
print(mostly_left)

In [None]:
# c: largest number of direct dependents found on a single token
max_children
# The same value in one line, more compact but harder to read:
# max(len(list(token.children)) for sent in doc.sents for token in sent)