In [1]:
import spacy
from nltk.stem.snowball import SnowballStemmer
from textblob import TextBlob
import nltk
from nltk.corpus import wordnet as wn

In [2]:
nltk.download('wordnet')
nltk.download('omw-1.4')

nlp = spacy.load("fr_core_news_sm")
stemmer = SnowballStemmer("french")


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abdoulayebalde/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/abdoulayebalde/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
def process_clause_version_two(clause):
    doc = nlp(clause)
    filtered_tokens = [token for token in doc if
                       token.pos_ in {"NOUN", "VERB", "ADJ", "ADV", "NUM", "PRON"} and token.pos_ != "DET"]

    subject, verb, obj, time, location, others = [], [], [], [], [], []

    for token in filtered_tokens:
        if "subj" in token.dep_:
            subject.append(token.lemma_)
        elif "obj" in token.dep_:
            obj.append(token.lemma_)
        elif token.pos_ == "VERB":
            morph = token.morph.get("Tense")
            if morph:
                time.append(morph[0].lower())
            verb.append(token.lemma_)
        elif token.dep_ in {"advmod", "npadvmod"} and token.pos_ == "ADV":
            time.append(token.lemma_)
        elif token.dep_ == "obl" and token.ent_type_ == "LOC":
            location.append(token.lemma_)
        else:
            others.append(token.lemma_)
    
    print("subject: ", subject)
    print("verb: ", verb)
    print("object: ", obj)
    print("time: ", time)
    print("location: ", location)
    print("Others: ", others)

    gloss_sequence = time + location + subject + verb + obj + others

    return " ".join(gloss_sequence)


def text_to_gloss(text):
    clauses = text.split(" et ")
    processed_clauses = [process_clause_version_two(clause) for clause in clauses]
    return " et ".join(processed_clauses)

In [9]:
text = "Bonjour, ce matin je vais à l'école et le soir je suis au terrain de Basketball."
print(text)
gloss_text = text_to_gloss(text)
gloss_text

Bonjour, ce matin je vais à l'école et le soir je suis au terrain de Basketball.
subject:  ['matin', 'je']
verb:  ['aller']
object:  []
time:  ['pres']
location:  []
Others:  ['école']
subject:  ['soir', 'je']
verb:  []
object:  []
time:  []
location:  []
Others:  ['terrain']


'pres matin je aller école et soir je terrain'

In [None]:
def get_synonyms(word):
    synonyms = set()
    for syn in wn.synsets(word, lang='fra'):
        for lemma in syn.lemmas('fra'):
            synonyms.add(lemma.name())
    return synonyms

In [11]:
for word in gloss_text.split(" "):
    print(word + " synonyme " + str(get_synonyms(word)))

pres synonyme set()
matin synonyme {'matin', 'aube', 'aurore', 'bonjour', 'avant-midi', 'morrow', 'lendemain', 'matinée', 'première_lumière'}
je synonyme {'je', 'Moi', 'iode'}
aller synonyme {'tenir_le_coup', 'mener', 'sortir_avec', 'cohabiter', 'continuer', 'repousser', 'vouloir', 'effectuer', 'courir', 'bien', 'Adam', 'accouchement', 'durer', 'ecstasy', 'obtenir', 'en_forme', 'conduire', 'chevaucher', 'bélier', 'rouler', 'prendre', 'distribuer', 'venir', 'voyage', 'remorquer', 'go', 'écouler', 'appartenir', 'est-ce_que', 'marcher', 'travailler', 'viser', 'soutenir', 'reculer', 'asseoir', 'arriver', 'tour', 'procéder', 'tendre', 'piloter', 'force', 'devenir', 'bien_aller', 'accommoder', 'hier', 'disparaître', 'partir', 'déplacer', 'moteur', 'appartenir_à', 'représenter', 'contraindre', 'aboutir', 'permettre', 'tomber_en_panne', 'se_marier', 'fin', 'voyager', 'dé_à_jouer', 'destiner', 'affecter', 'chasser', 'traverser', 'négliger', 'fonctionner', 'fusil', 'faillir', 'évaluer', 'aller',

In [13]:
stemmed_gloss = ' '.join([stemmer.stem(word) for word in gloss_text.split()])
print("gloss text: ", gloss_text)
print("Stemming:", stemmed_gloss)

gloss text:  pres matin je aller école et soir je terrain
Stemming: pre matin je aller écol et soir je terrain
