In [None]:
# Mount Google Drive at /content/drive; the data files read below live there.
# force_remount=True requests a new mount even if one is already present.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn
import re
import statsmodels.formula.api

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Configure how graphs will show up in this notebook: draw matplotlib figures
# inline, and use seaborn's 'notebook' context with a (10, 6) figure size and
# fonts scaled by 1.5.
%matplotlib inline
seaborn.set_context('notebook', rc={'figure.figsize': (10, 6)}, font_scale=1.5)

In [None]:
import spacy
!pip install ro-legal-fl

Collecting ro-legal-fl
  Downloading ro_legal_fl-3.6.1-12-py3-none-any.whl (142.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.7/142.7 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy<3.7.0,>=3.6.1 (from ro-legal-fl)
  Downloading spacy-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
Collecting thinc<8.2.0,>=8.1.8 (from spacy<3.7.0,>=3.6.1->ro-legal-fl)
  Downloading thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (919 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m919.6/919.6 kB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
Collecting pathy>=0.10.0 (from spacy<3.7.0,>=3.6.1->ro-legal-fl)
  Downloading pathy-0.11.0-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.3/47.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[

In [None]:
# Load the installed ro_legal_fl pipeline; later cells call it to get word vectors.
nlp = spacy.load("ro_legal_fl")



In [None]:
def load_embeddings(filename):
    """
    Load word embeddings from a text file into a DataFrame.

    Every data line holds a label followed by the vector components, all
    separated by single spaces. A line with exactly two fields is treated as
    the header giving the shape of the matrix and is skipped; blank lines are
    skipped as well.

    Parameters
    ----------
    filename : str or path-like
        Path of the UTF-8 encoded embeddings file.

    Returns
    -------
    pandas.DataFrame
        One float32 row per label, indexed by the labels in file order.

    Raises
    ------
    ValueError
        If the file holds no vectors, a component is not a number, or the
        rows do not all have the same length.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            stripped = line.rstrip()
            if not stripped:
                # A blank line would otherwise add an empty label with a
                # zero-length vector and make np.vstack fail below.
                continue
            items = stripped.split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))

    if not rows:
        raise ValueError(f"no embedding vectors found in {filename!r}")

    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')

# Load the wiki.ro.vec embeddings from Drive; the last expression displays
# the shape of the resulting table.
embeddings = load_embeddings('/content/drive/MyDrive/AI RASIST/wiki.ro.vec')
embeddings.shape

(200000, 300)

In [None]:
def load_lexicon(filename):
    """
    Read a word list stored one entry per line.

    Trailing whitespace is removed from each line. Empty lines and lines
    that begin with ';' (comments) are left out. The remaining entries are
    returned as a list, in file order.
    """
    with open(filename, encoding='utf-8') as infile:
        stripped_lines = (raw_line.rstrip() for raw_line in infile)
        return [
            entry
            for entry in stripped_lines
            if entry and not entry.startswith(';')
        ]

# Load the positive and negative word lists from Drive.
pos_words = load_lexicon('/content/drive/MyDrive/AI RASIST/cuv-pozitive-v2.txt')
neg_words = load_lexicon('/content/drive/MyDrive/AI RASIST/cuv-neg-v3.txt')

In [None]:
# Every lexicon word in one list: positives first, then negatives.
pos_and_neg_words = [*pos_words, *neg_words]
print(pos_and_neg_words)

['abil', 'abilita', 'abilitare', 'abilitate', 'aborda', 'abordabil', 'abracadabra', 'absolut', 'abundă', 'abundent', 'abundență', 'acasă', 'acatist', 'accede', 'accedere', 'accelerare', 'accept', 'acceptare', 'accesibilitate', 'aclamă', 'aclamare', 'acord', 'acreditare', 'activ', 'activant', 'activare', 'activator', 'acționa', 'acțiune', 'acumulare', 'acumulativ', 'acuratețe', 'adaos', 'adaptabil', 'adaptabilitate', 'adaptare', 'adaptiv', 'adăugare', 'aderă', 'aderare', 'adevăr', 'adevăr absolut', 'adevărat', 'adeverit', 'admiră', 'admirabil', 'admirare', 'admirativ', 'admirator', 'admirație', 'admisibil', 'adoptă', 'adoră', 'adorabil', 'adorare', 'adună', 'aer', 'aerospațial', 'afabil', 'afabilitate', 'afectiv', 'afectivitate', 'afectuos', 'afecțiune', 'afinitate', 'afirmare', 'afirmativ', 'afirmație', 'agapă', 'ager', 'agerime', 'agheasmator', 'agheasmă', 'aghesmuire', 'aghesmuit', 'a giorno', 'agilitate', 'agoniseală', 'agonisire', 'agreabil', 'agrement', 'agrementa', 'agrementare',

In [None]:
# Join every embedding label into one space-separated string and print it.
labels_string = " ".join(embeddings.index)
print(labels_string)



In [None]:
# One string holding every lexicon word, each preceded by a single space
# (so the result starts with a space).
pos_and_neg_words_string = "".join(' ' + word for word in pos_and_neg_words)
print(pos_and_neg_words_string)

 abil abilita abilitare abilitate aborda abordabil abracadabra absolut abundă abundent abundență acasă acatist accede accedere accelerare accept acceptare accesibilitate aclamă aclamare acord acreditare activ activant activare activator acționa acțiune acumulare acumulativ acuratețe adaos adaptabil adaptabilitate adaptare adaptiv adăugare aderă aderare adevăr adevăr absolut adevărat adeverit admiră admirabil admirare admirativ admirator admirație admisibil adoptă adoră adorabil adorare adună aer aerospațial afabil afabilitate afectiv afectivitate afectuos afecțiune afinitate afirmare afirmativ afirmație agapă ager agerime agheasmator agheasmă aghesmuire aghesmuit a giorno agilitate agoniseală agonisire agreabil agrement agrementa agrementare aievea ajun ajutător ajutor aleluia alegere alfa alianță alimentare alinare alină alinător alint alintă alintare alintat alintător altar alteță altruism altruist amabil amabilitate ambiție ambițios ameliorare amiabil amic amical amiciție amin amint

In [None]:
import random
words = labels_string.split()

# Specify the number of words you want. It is capped at the vocabulary size,
# because random.sample raises ValueError when asked for more items than exist.
num_words = min(140000, len(words))
print(num_words)

# Randomly choose the specified number of words. A dedicated generator with a
# fixed seed makes the sample, and everything trained on it, repeatable
# across kernel restarts.
random_words = random.Random(0).sample(words, num_words)

# Join the selected words back into a string
truncated_labels_string = ' '.join(random_words)

print(len(truncated_labels_string))

140000
1226660


In [None]:
# Run each sampled word through the pipeline as its own text and keep the
# results in a list (one processed item per word).
rolegal_nlp = list(nlp.pipe(random_words))

  matches = self.matcher(doc, allow_missing=True, as_spans=False)


In [None]:
# Debug output: prints every processed word (one entry per sampled word).
print(rolegal_nlp)



In [None]:
# Collect the .vector of every processed word, in the same order as rolegal_nlp.
rolegal_embeddings = [processed.vector for processed in rolegal_nlp]

In [None]:
# Build the embedding table: one float32 row per processed word, indexed by
# the word's text. zip() pairs each item with its vector.
pairs = list(zip(rolegal_nlp, rolegal_embeddings))
labels = [item.text for item, _ in pairs]
rows = [np.array(vector, dtype='f') for _, vector in pairs]
arr = np.vstack(rows)
legal_final_emb = pd.DataFrame(arr, index=labels, dtype='f')

In [None]:
# Lexicon entries that have no row in the embedding table.
embedded_words = set(legal_final_emb.index)
missing_pos_words = set(pos_words) - embedded_words
missing_neg_words = set(neg_words) - embedded_words

print("Missing positive words:", missing_pos_words)
print("Missing negative words:", missing_neg_words)

# Update lexicons to remove missing words. dict.fromkeys drops duplicates while
# keeping the lexicon's own order. Going through set() would order the words by
# string hash, which differs between interpreter runs and therefore changes
# the row order fed to the seeded train/test split.
pos_words = [word for word in dict.fromkeys(pos_words) if word in embedded_words]
neg_words = [word for word in dict.fromkeys(neg_words) if word in embedded_words]

# Now try extracting vectors again
pos_vectors = legal_final_emb.loc[pos_words].dropna()
neg_vectors = legal_final_emb.loc[neg_words].dropna()

Missing positive words: {'amorezat', 'maturitate', 'activant', 's-a făcut', 'contemplativitate', 'făurire', 'apă vie', 'a răzbate', 'a fascina', 'pact', 'a pondera', 'a idealiza', 'stilat', 'vioi/vioaie', 'agrementare', 'expert', 'securitate', 'optimism', 'admisibil', 'a inova', 'inovație', 'cunoscut', 'atenție', 'a deschide ochii', 'bonificație', 'imaginație', 'înfrățire', 'favorabil', 'fluid', 'a se bucura', 'vrednicie', 'a dibui', 'inefabil', 'a creea-creez', 'a purifica', 'valoare', 'ieșire la lumină', 'proaspăt', 'antrenant', 'a se inmuia inima', 'afectivitate', 'misterioso', 'infailibilitate', 'falnic', 'gâdilare', 'abordabil', 'a răspunde', 'pasiune', 'forme ale conștiinței', 'roditor', 'imun', 'hristic', 'a hotărî', 'a fi cu haz', 'omenit', 'ospeție', 'rânduiala', 'precizie', 'a-și da seama', 'viu/vie', 'harnic', 'proactiv', 'a elabora', 'a progresa', 'bunăvoie', 'extrasenzorial', 'asortare', 'coeficient de inteligență emoțională', 'imaculat', 'contemporan', 'amiciție', 'income

In [None]:
# Stack the positive rows on top of the negative rows, and build the matching
# targets (+1 for positive, -1 for negative) and word labels in the same order.
vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1] * len(pos_vectors.index) + [-1] * len(neg_vectors.index))
labels = [*pos_vectors.index, *neg_vectors.index]

In [None]:
# Split vectors, targets and word labels together, holding out 10% for testing;
# random_state=0 fixes the shuffle.
train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)

In [None]:
# Logistic-regression classifier trained with SGD. 'log_loss' is the current
# name of the logistic loss: the old alias 'log' was deprecated in
# scikit-learn 1.1 and removed in 1.3.
model = SGDClassifier(loss='log_loss', random_state=0, max_iter=100)
model.fit(train_vectors, train_targets)



In [None]:
# accuracy_score is documented as (y_true, y_pred). Plain accuracy comes out
# the same in either order, but the documented order is used here.
accuracy_score(test_targets, model.predict(test_vectors))

0.7044334975369458

In [None]:
def vecs_to_sentiment(vecs):
    """
    Return the sentiment log-odds of each row in `vecs`.

    The score is the log-probability of the model's second class minus that of
    its first class (with the +1 / -1 targets used in this notebook: positive
    minus negative). For a binary linear classifier trained with log loss that
    difference is the decision-function value, so it is read from there.
    Subtracting two log-probabilities breaks down once a probability rounds to
    0 or 1: np.log then yields -inf, the score becomes +/-inf and NumPy emits
    a divide-by-zero RuntimeWarning.

    Parameters
    ----------
    vecs : array-like of shape (n_samples, n_features)
        Embedding rows to score with the module-level `model`.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        One finite log-odds value per row.
    """
    return model.decision_function(vecs)


def words_to_sentiment(words):
    """
    Look up `words` in `legal_final_emb` and score each one that is found.

    Words without an embedding are skipped. Indexing `.loc` with a list that
    contains unknown labels raises KeyError in current pandas, so the list is
    filtered first instead of relying on `.dropna()` to discard them.

    Parameters
    ----------
    words : iterable of str
        Words to score.

    Returns
    -------
    pandas.DataFrame
        Indexed by the words that were found, with a single 'sentiment'
        column holding the values from vecs_to_sentiment. Empty when none of
        the words has an embedding.
    """
    known = [word for word in words if word in legal_final_emb.index]
    vecs = legal_final_emb.loc[known].dropna()
    if vecs.empty:
        # Nothing to score; return an empty result instead of handing the
        # classifier zero rows.
        return pd.DataFrame({'sentiment': pd.Series(dtype='float64')})
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)

# Show the scores of the first 20 held-out words.
words_to_sentiment(test_labels).head(20)

  return np.log(self.predict_proba(X))


Unnamed: 0,sentiment
dornic,31.153304
doamnelor,-2.275677
fundamentalist,14.75628
duios,-14.603692
putere,-12.199221
instituție,inf
diform,-11.348013
uimitor,-9.694821
evlavie,-6.277888
nap,13.935428


In [None]:
import re
TOKEN_RE = re.compile(r"\w.*?\b")
# The regex above finds tokens that start with a word-like character (\w), and continues
# matching characters lazily (.*?) until the next word break (\b). It's a relatively simple
# expression that manages to extract something very much like words from text.


def text_to_sentiment(text):
    """
    Score a piece of text as the mean sentiment of its tokens.

    The text is split with TOKEN_RE, every token is case-folded, the tokens
    are scored by words_to_sentiment, and the mean of the resulting
    'sentiment' column is returned.
    """
    folded_tokens = list(map(str.casefold, TOKEN_RE.findall(text)))
    scores = words_to_sentiment(folded_tokens)
    return scores['sentiment'].mean()

In [None]:
#rolegal
# Print the sentiment score of a few probe words, one per line.
for probe in ("roman", "rrom", "american", "homosexual", "bunătate", "răutate"):
    print(text_to_sentiment(probe))



NameError: name 'text_to_sentiment' is not defined

In [None]:
########################################################## Everything below this line was used for testing and/or belongs to previous versions

In [None]:
def load_embeddings(filename):
    """
    Load word embeddings from a text file into a DataFrame.

    Every data line holds a label followed by the vector components, all
    separated by single spaces. A line with exactly two fields is treated as
    the header giving the shape of the matrix and is skipped; blank lines are
    skipped as well.

    Parameters
    ----------
    filename : str or path-like
        Path of the UTF-8 encoded embeddings file.

    Returns
    -------
    pandas.DataFrame
        One float32 row per label, indexed by the labels in file order.

    Raises
    ------
    ValueError
        If the file holds no vectors, a component is not a number, or the
        rows do not all have the same length.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            stripped = line.rstrip()
            if not stripped:
                # A blank line would otherwise add an empty label with a
                # zero-length vector and make np.vstack fail below.
                continue
            items = stripped.split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))

    if not rows:
        raise ValueError(f"no embedding vectors found in {filename!r}")

    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')

# Load the wiki.ro.vec embeddings from Drive; the last expression displays
# the shape of the resulting table.
embeddings = load_embeddings('/content/drive/MyDrive/AI RASIST/wiki.ro.vec')
embeddings.shape

(200000, 300)

In [None]:
def load_lexicon(filename):
    """
    Read a word list stored one entry per line.

    Trailing whitespace is removed from each line. Empty lines and lines
    that begin with ';' (comments) are left out. The remaining entries are
    returned as a list, in file order.
    """
    with open(filename, encoding='utf-8') as infile:
        stripped_lines = (raw_line.rstrip() for raw_line in infile)
        return [
            entry
            for entry in stripped_lines
            if entry and not entry.startswith(';')
        ]

# Load the positive and negative word lists from Drive.
pos_words = load_lexicon('/content/drive/MyDrive/AI RASIST/cuv-pozitive-v2.txt')
neg_words = load_lexicon('/content/drive/MyDrive/AI RASIST/cuv-neg-v3.txt')

In [None]:
# Lexicon sizes: positive first, then negative.
for lexicon in (pos_words, neg_words):
    print(len(lexicon))

2599
4175


In [None]:
# Lexicon entries that have no row in the embedding table.
embedded_words = set(embeddings.index)
missing_pos_words = set(pos_words) - embedded_words
missing_neg_words = set(neg_words) - embedded_words

print("Missing positive words:", missing_pos_words)
print("Missing negative words:", missing_neg_words)

# Update lexicons to remove missing words. dict.fromkeys drops duplicates while
# keeping the lexicon's own order. Going through set() would order the words by
# string hash, which differs between interpreter runs and therefore changes
# the row order fed to the seeded train/test split.
pos_words = [word for word in dict.fromkeys(pos_words) if word in embedded_words]
neg_words = [word for word in dict.fromkeys(neg_words) if word in embedded_words]

# Now try extracting vectors again
pos_vectors = embeddings.loc[pos_words].dropna()
neg_vectors = embeddings.loc[neg_words].dropna()

Missing positive words: {'a fi om de inimă', 'a descifra', 'inefabilitate', 'a prețui', 'a fericita', 'frate de cruce', 'a demonstra', 'a da răspuns', 'activant', 'a preconiza', 's-a făcut', 'frate bun', 'asentiment', 'a ctitori', 'a influența', 'a ilumina', 'contemplativitate', 'făurire', 'apă vie', 'a recomanda', 'a ajuta', 'a salva', 'a răzbate', 'a descrie', 'a revela', 'a fascina', 'izbăvit', 'a iriza', 'a pondera', 'a zări', 'semeție', 'cognoscibilitate', 'centri energetici', 'bonjur', 'a civiliza', 'a se dedica', 'a idealiza', 'stilat', 'vioi/vioaie', 'a efectua', 'extaziere', 'concretețe', 'a pomeni', 'a plăsmui', 'hrănitor', 'agrementare', 'a îndrăzni', 'a elogia', 'bravadă', 'civilizare', 'autoevalua', 'preabun', 'a inova', 'comprehensiune', 'feblețe', 'a omeni', 'a deschide ochii', 'bonificație', 'cu dragă inimă', 'fixabil', 'a preveni', 'nou/nouă', 'a făptui', 'a vibra', 'luminiscență', 'a se bucura', 'ingenuitate', 'a dibui', 'intuire', 'a creea-creez', 'extravertit', 'a e

In [None]:
# Sizes of the kept lexicons, followed by how many words of each were missing.
for collection in (pos_words, neg_words, missing_pos_words, missing_neg_words):
    print(len(collection))

1656
1269
919
1529


In [None]:
# Stack the positive rows on top of the negative rows, and build the matching
# targets (+1 for positive, -1 for negative) and word labels in the same order.
vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1] * len(pos_vectors.index) + [-1] * len(neg_vectors.index))
labels = [*pos_vectors.index, *neg_vectors.index]

In [None]:
# Split vectors, targets and word labels together, holding out 10% for testing;
# random_state=0 fixes the shuffle.
train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)

In [None]:
# Logistic-regression classifier trained with SGD. 'log_loss' is the current
# name of the logistic loss: the old alias 'log' was deprecated in
# scikit-learn 1.1 and removed in 1.3.
model = SGDClassifier(loss='log_loss', random_state=0, max_iter=100)
model.fit(train_vectors, train_targets)



In [None]:
# accuracy_score is documented as (y_true, y_pred). Plain accuracy comes out
# the same in either order, but the documented order is used here.
accuracy_score(test_targets, model.predict(test_vectors))

0.8122866894197952

In [None]:
def vecs_to_sentiment(vecs):
    """
    Return the sentiment log-odds of each row in `vecs`.

    The score is the log-probability of the model's second class minus that of
    its first class (with the +1 / -1 targets used in this notebook: positive
    minus negative). For a binary linear classifier trained with log loss that
    difference is the decision-function value, so it is read from there.
    Subtracting two log-probabilities breaks down once a probability rounds to
    0 or 1: np.log then yields -inf and the score becomes +/-inf.

    Parameters
    ----------
    vecs : array-like of shape (n_samples, n_features)
        Embedding rows to score with the module-level `model`.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        One finite log-odds value per row.
    """
    return model.decision_function(vecs)


def words_to_sentiment(words):
    """
    Look up `words` in `embeddings` and score each one that is found.

    Words without an embedding are skipped. Indexing `.loc` with a list that
    contains unknown labels raises KeyError in current pandas, so the list is
    filtered first instead of relying on `.dropna()` to discard them.

    Parameters
    ----------
    words : iterable of str
        Words to score.

    Returns
    -------
    pandas.DataFrame
        Indexed by the words that were found, with a single 'sentiment'
        column holding the values from vecs_to_sentiment. Empty when none of
        the words has an embedding.
    """
    known = [word for word in words if word in embeddings.index]
    vecs = embeddings.loc[known].dropna()
    if vecs.empty:
        # Nothing to score; return an empty result instead of handing the
        # classifier zero rows.
        return pd.DataFrame({'sentiment': pd.Series(dtype='float64')})
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)

# Show the scores of the first 20 held-out words.
words_to_sentiment(test_labels).head(20)

Unnamed: 0,sentiment
catâr,-2.100411
exact,-0.027541
melanj,-0.066334
conformitate,3.323714
asimilare,0.219923
inspirație,2.808821
jaf,-4.901271
apatie,-1.41277
prater,-1.755428
îndreptare,2.189669


In [None]:
import re
TOKEN_RE = re.compile(r"\w.*?\b")
# The regex above finds tokens that start with a word-like character (\w), and continues
# matching characters lazily (.*?) until the next word break (\b). It's a relatively simple
# expression that manages to extract something very much like words from text.


def text_to_sentiment(text):
    """
    Score a piece of text as the mean sentiment of its tokens.

    The text is split with TOKEN_RE, every token is case-folded, the tokens
    are scored by words_to_sentiment, and the mean of the resulting
    'sentiment' column is returned.
    """
    folded_tokens = list(map(str.casefold, TOKEN_RE.findall(text)))
    scores = words_to_sentiment(folded_tokens)
    return scores['sentiment'].mean()

In [None]:
# Print the sentiment score of a few probe texts, one per line.
for probe in (
    "Răutate",
    "Bunătate",
    "Român",
    "Ungur, maghiar",
    "alb",
    "negru",
    "homosexual, gay",
    "creștin",
):
    print(text_to_sentiment(probe))


-1.3522847494418098
3.1662093404740865
-0.7113338278880081
-2.6690478566577656
-0.8897949533584992
-1.3289044641545882
-2.173922508685937
1.7109134843256102


In [None]:
import spacy
from spacy.lang.ro.examples import sentences

nlp = spacy.load("ro_core_news_sm")
doc = nlp(sentences[0])
print(doc.text)
for token in doc:
    print(token.text, token.pos_, token.dep_)

OSError: [E050] Can't find model 'ro_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
!pip install --upgrade spacy

Collecting spacy
  Downloading spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.6.1
    Uninstalling spacy-3.6.1:
      Successfully uninstalled spacy-3.6.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ro-legal-fl 3.6.1 requires spacy<3.7.0,>=3.6.1, but you have spacy 3.7.2 which is incompatible.[0m[31m
[0mSuccessfully installed spacy-3.7.2


In [None]:
import spacy

In [None]:
!pip install ro-legal-fl

Collecting spacy<3.7.0,>=3.6.1 (from ro-legal-fl)
  Using cached spacy-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.7.2
    Uninstalling spacy-3.7.2:
      Successfully uninstalled spacy-3.7.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.7.1 requires spacy<3.8.0,>=3.7.2, but you have spacy 3.6.1 which is incompatible.[0m[31m
[0mSuccessfully installed spacy-3.6.1


In [None]:
# (Re)load the ro_legal_fl pipeline into `nlp`.
nlp = spacy.load("ro_legal_fl")



In [None]:
# Run the pipeline on a sample legal citation.
doc = nlp("Titlul III din LEGEA nr. 255 din 19 iulie 2013, publicată în MONITORUL OFICIAL")
# legal entity identification: print every detected entity with its label
for entity in doc.ents:
    print('entity: ', entity, '; entity type: ', entity.label_)

entity:  III ; entity type:  NUMERIC
entity:  LEGEA nr. 255 din 19 iulie 2013 ; entity type:  LEGAL
entity:  MONITORUL OFICIAL ; entity type:  ORG


  matches = self.matcher(doc, allow_missing=True, as_spans=False)


In [None]:
# floret n-gram embeddings robust to typos
# Each print shows the similarity between a misspelled phrase and the correctly
# spelled one; the comment under each call records the value from an earlier run.
print(nlp('achizit1e public@').similarity(nlp('achiziții publice')))
# 0.7393895566928835
print(nlp('achizitii publice').similarity(nlp('achiziții publice')))
# 0.8996480808279399

0.7393895566928835
0.8996480808279399


In [None]:
# Every lexicon word in one list: positives first, then negatives.
pos_and_neg_words = [*pos_words, *neg_words]
print(pos_and_neg_words)

['amorezat', 'maturitate', 'concepere', 'intelect', 'inaugurare', 'recunoscător', 'inițiat', 'breaz', 'slăvit', 'vioiciune', 'maxim', 'prietenie', 'cumsecade', 'pact', 'iscusit', 'benedictin', 'aievea', 'întrunire', 'conștientizare', 'chivot', 'expert', 'angajament', 'consolidare', 'optimism', 'securitate', 'admisibil', 'virtute', 'exclamare', 'totdeauna', 'inovație', 'consistent', 'liniște', 'cunoscut', 'atenție', 'imaginație', 'persistent', 'conform', 'înfrățire', 'favorabil', 'fluid', 'glumă', 'caritabil', 'vrednicie', 'inefabil', 'valoare', 'jubileu', 'conștient', 'proaspăt', 'antrenant', 'franc', 'afectivitate', 'falnic', 'clemență', 'atotcuprinzător', 'bucurie', 'pasiune', 'superior', 'roditor', 'alintă', 'inepuizabil', 'reactivare', 'imun', 'coaliție', 'rânduiala', 'adevărat', 'autocunoaștere', 'sacrament', 'precizie', 'viitor', 'voluntariat', 'fenomenal', 'arhimandrit', 'accelerare', 'harnic', 'devotat', 'proactiv', 'diversitate', 'atemporal', 'titan', 'bunăvoie', 'liber', 'inf

In [None]:
# Vector of every token in `doc`. (The variable name is kept as spelled because
# a later cell refers to it.)
test_mbeddings = [token.vector for token in doc]

In [None]:
# Join every embedding label into one space-separated string.
labels_string = " ".join(embeddings.index)

# Print the resulting string
print(labels_string)



In [None]:
# Print each token of `doc` next to its vector.
for token, embedding in zip(doc, test_mbeddings):
    print(f"Token: {token.text}, Embedding: {embedding}")

Token: Titlul, Embedding: [-1.22446287e+00 -2.89953208e+00 -1.36351299e+00 -4.44200546e-01
 -3.27520579e-01 -8.30055594e-01 -5.61764419e-01  1.04155302e+00
  1.47506261e+00 -7.36825943e-01  2.25400186e+00  4.80031431e-01
 -2.79969001e+00  3.41492796e+00  2.46144748e+00 -1.26779985e+00
 -6.20783508e-01  6.01397514e-01 -7.87975967e-01  1.48347437e+00
 -5.85020423e-01  5.44571698e-01 -1.47744501e+00 -4.44299984e+00
 -6.25537932e-01  4.36857253e-01  4.36670899e-01  7.69562006e-01
 -2.74144268e+00  4.06983495e-01 -9.94978011e-01  3.91280025e-01
  1.21118855e+00 -1.59129310e+00 -7.37728536e-01 -5.25241137e-01
 -3.19222641e+00 -1.29456711e+00 -2.30608392e+00  2.02312422e+00
  1.29616714e+00  2.34663773e+00  5.56189001e-01 -3.54741120e+00
 -8.62712502e-01 -1.53989077e+00  7.21566021e-01 -3.51907492e-01
 -7.06052959e-01 -2.33175188e-01  7.32591227e-02 -9.38119411e-01
 -1.53249010e-01 -8.21535051e-01  5.19515634e-01  3.08434010e-01
 -1.92264485e+00 -3.46893549e+00  1.54481041e+00 -9.15210009e-01

In [None]:
# One string holding every lexicon word, each preceded by a single space
# (so the result starts with a space).
pos_and_neg_words_string = "".join(' ' + word for word in pos_and_neg_words)
print(pos_and_neg_words_string)

 amorezat maturitate concepere intelect inaugurare recunoscător inițiat breaz slăvit vioiciune maxim prietenie cumsecade pact iscusit benedictin aievea întrunire conștientizare chivot expert angajament consolidare optimism securitate admisibil virtute exclamare totdeauna inovație consistent liniște cunoscut atenție imaginație persistent conform înfrățire favorabil fluid glumă caritabil vrednicie inefabil valoare jubileu conștient proaspăt antrenant franc afectivitate falnic clemență atotcuprinzător bucurie pasiune superior roditor alintă inepuizabil reactivare imun coaliție rânduiala adevărat autocunoaștere sacrament precizie viitor voluntariat fenomenal arhimandrit accelerare harnic devotat proactiv diversitate atemporal titan bunăvoie liber infinitate cuprinzător detașabil susținere imaculat vitalitate familiar contemporan ghid amiciție rezonabil productiv a ospăț promisiune aclamă vesel echivalență franciscan celestin individualitate beatitudine cultură fantastic favoare capabil cri

In [None]:
# Keep roughly the first half of the label string, cutting at a space so the
# last word is never split in two. A plain character slice can end mid-word
# and leave a word fragment at the end.
half_length = len(labels_string) // 2
# Last space at or before the half-way position (-1 when there is none).
cut = labels_string.rfind(' ', 0, half_length + 1)
truncated_labels_string = labels_string[:cut] if cut != -1 else labels_string[:half_length]



In [None]:
# Original note, translated from Romanian: "for, not direct nlp" -- presumably a
# reminder to loop over the words instead of calling nlp on the whole string;
# confirm with the author.
# Here the whole truncated string is processed in a single nlp call.
rolegal_nlp = nlp(truncated_labels_string)

In [None]:
# Debug output: print the embedding table.
# NOTE(review): in this section `legal_final_emb` is only assigned in a cell
# further down, so this cell relies on an earlier definition or on
# out-of-order execution.
print(legal_final_emb)

                   0         1         2         3         4         5    \
sombra       -0.542646 -0.197965  1.189728 -0.740636 -0.144118 -0.617767   
spohr        -0.698943  0.595282  0.084374 -0.942312 -0.095098 -0.452018   
praetoria    -0.797701  1.015233 -0.925890  2.599955  0.700998 -0.183297   
contorizează -0.065874  1.259961 -1.387077  1.445369  1.440220 -0.900580   
berlescu     -0.559310  1.658750 -0.121888 -0.223567 -0.477571 -1.203011   
...                ...       ...       ...       ...       ...       ...   
plămânărică  -0.545645 -0.497972 -0.911035 -0.544014  0.914304 -0.213687   
necioplit    -0.006985 -0.202643 -1.572850  0.232446 -0.193052  0.294572   
miliardari   -1.423294 -0.928211  1.760962  1.128595 -0.730243  0.016389   
civice       -2.489148 -0.154794  0.500749 -3.316894 -0.005306  0.431341   
armonios     -0.740129 -1.010160  0.031557 -1.234540  0.941985 -0.559889   

                   6         7         8         9    ...       270       271  \
sombra

In [None]:
# Collect the .vector of every element yielded by iterating rolegal_nlp.
rolegal_embeddings = [token.vector for token in rolegal_nlp]

In [None]:
# Build the embedding table: one float32 row per element of rolegal_nlp,
# indexed by its text. zip() pairs each element with its vector.
pairs = list(zip(rolegal_nlp, rolegal_embeddings))
labels = [item.text for item, _ in pairs]
rows = [np.array(vector, dtype='f') for _, vector in pairs]
arr = np.vstack(rows)
legal_final_emb = pd.DataFrame(arr, index=labels, dtype='f')

In [None]:
# Lexicon entries that have no row in the embedding table.
embedded_words = set(legal_final_emb.index)
missing_pos_words = set(pos_words) - embedded_words
missing_neg_words = set(neg_words) - embedded_words

print("Missing positive words:", missing_pos_words)
print("Missing negative words:", missing_neg_words)

# Update lexicons to remove missing words. dict.fromkeys drops duplicates while
# keeping the lexicon's own order. Going through set() would order the words by
# string hash, which differs between interpreter runs and therefore changes
# the row order fed to the seeded train/test split.
pos_words = [word for word in dict.fromkeys(pos_words) if word in embedded_words]
neg_words = [word for word in dict.fromkeys(neg_words) if word in embedded_words]

# Now try extracting vectors again
pos_vectors = legal_final_emb.loc[pos_words].dropna()
neg_vectors = legal_final_emb.loc[neg_words].dropna()

Missing positive words: {'integralitate', 'contemplație', 'cutezător', 'amorezat', 'cruciuliță', 'concepere', 'confrate', 'cuminecătură', 'demonstrabil', 'constanță', 'compătimire', 'avuție', 'emancipat', 'apostolat', 'binemeritat', 'cristelniță', 'briliant', 'adaptiv', 'prevestire', 'diligență', 'haios', 'breaz', 'amiabil', 'vioiciune', 'slăvit', 'statornicie', 'idealizare', 'evocator', 'opulență', 'gingaș', 'coexistență', 'compatriot', 'zămislire', 'dexteritate', 'cugetare', 'conferire', 'ceremonios', 'destoinic', 'melodios', 'orânduire', 'fairplay', 'providențial', 'verosimil', 'nostim', 'fler', 'izbăvire', 'perspicace', 'aplicativ', 'întinerire', 'aptitudine', 'aievea', 'diafan', 'fulminant', 'facilitare', 'clarviziune', 'congruență', 'inimioară', 'părintesc', 'conștiinciozitate', 'copilăresc', 'flamură', 'copilăros', 'agheasmă', 'paternal', 'exclamare', 'faptic', 'amabilitate', 'concordă', 'neînchipuit', 'potențialitate', 'tovărășie', 'omniscient', 'dârz', 'senzual', 'îngăduință',

In [None]:
# Stack the positive rows on top of the negative rows, and build the matching
# targets (+1 for positive, -1 for negative) and word labels in the same order.
vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1] * len(pos_vectors.index) + [-1] * len(neg_vectors.index))
labels = [*pos_vectors.index, *neg_vectors.index]

In [None]:
# Split vectors, targets and word labels together, holding out 10% for testing;
# random_state=0 fixes the shuffle.
train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)

In [None]:
# Logistic-regression classifier trained with SGD. 'log_loss' is the current
# name of the logistic loss: the old alias 'log' was deprecated in
# scikit-learn 1.1 and removed in 1.3.
model = SGDClassifier(loss='log_loss', random_state=0, max_iter=100)
model.fit(train_vectors, train_targets)



In [None]:
# accuracy_score is documented as (y_true, y_pred). Plain accuracy comes out
# the same in either order, but the documented order is used here.
accuracy_score(test_targets, model.predict(test_vectors))

0.6970954356846473

In [None]:
def vecs_to_sentiment(vecs):
    """
    Return the sentiment log-odds of each row in `vecs`.

    The score is the log-probability of the model's second class minus that of
    its first class (with the +1 / -1 targets used in this notebook: positive
    minus negative). For a binary linear classifier trained with log loss that
    difference is the decision-function value, so it is read from there.
    Subtracting two log-probabilities breaks down once a probability rounds to
    0 or 1: np.log then yields -inf, the score becomes +/-inf and NumPy emits
    a divide-by-zero RuntimeWarning.

    Parameters
    ----------
    vecs : array-like of shape (n_samples, n_features)
        Embedding rows to score with the module-level `model`.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        One finite log-odds value per row.
    """
    return model.decision_function(vecs)


def words_to_sentiment(words):
    """
    Look up `words` in `legal_final_emb` and score each one that is found.

    Words without an embedding are skipped. Indexing `.loc` with a list that
    contains unknown labels raises KeyError in current pandas, so the list is
    filtered first instead of relying on `.dropna()` to discard them.

    Parameters
    ----------
    words : iterable of str
        Words to score.

    Returns
    -------
    pandas.DataFrame
        Indexed by the words that were found, with a single 'sentiment'
        column holding the values from vecs_to_sentiment. Empty when none of
        the words has an embedding.
    """
    known = [word for word in words if word in legal_final_emb.index]
    vecs = legal_final_emb.loc[known].dropna()
    if vecs.empty:
        # Nothing to score; return an empty result instead of handing the
        # classifier zero rows.
        return pd.DataFrame({'sentiment': pd.Series(dtype='float64')})
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)

# Show the scores of the first 20 held-out words.
words_to_sentiment(test_labels).head(20)

  return np.log(self.predict_proba(X))


Unnamed: 0,sentiment
nigra,-5.738278
importanță,17.199767
măgar,-5.623299
impulsiv,-9.367787
euharistie,5.377669
celest,4.301835
avansat,8.463152
rogue,13.147607
concubină,-8.22612
categoric,-6.09974


In [None]:
import re
TOKEN_RE = re.compile(r"\w.*?\b")
# The regex above finds tokens that start with a word-like character (\w), and continues
# matching characters lazily (.*?) until the next word break (\b). It's a relatively simple
# expression that manages to extract something very much like words from text.


def text_to_sentiment(text):
    """
    Score a piece of text as the mean sentiment of its tokens.

    The text is split with TOKEN_RE, every token is case-folded, the tokens
    are scored by words_to_sentiment, and the mean of the resulting
    'sentiment' column is returned.
    """
    folded_tokens = list(map(str.casefold, TOKEN_RE.findall(text)))
    scores = words_to_sentiment(folded_tokens)
    return scores['sentiment'].mean()

In [None]:
# Print the sentiment score of a few probe words, one per line.
for probe in ("răutate", "pozitiv", "homosexual", "Bunătate"):
    print(text_to_sentiment(probe))



-10.148379047193002
0.9229752219709173
-4.047271816328434
25.971605195031852


In [None]:
# Extract jurBERT embeddings for a word (the input is a single word).
# NOTE: this rebinds `model`, which the earlier cells use for the SGD sentiment
# classifier; vecs_to_sentiment will not work after this cell runs until the
# classifier cell is executed again.
import torch
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("readerbench/jurBERT-base")
model = AutoModel.from_pretrained("readerbench/jurBERT-base")
inputs = tokenizer("tigan", return_tensors="pt")
# Inference only: skip autograd bookkeeping so the outputs carry no graph.
with torch.no_grad():
    outputs = model(**inputs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/351 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/478 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/212k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/447M [00:00<?, ?B/s]

In [None]:
# Debug output: print the raw model output object for the single-word input.
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.2021, -0.2255,  0.2334,  ..., -0.0308, -0.0193,  0.1277],
         [ 0.1893,  0.0855,  0.2083,  ...,  0.3460, -0.2585, -0.5265],
         [-0.1888,  0.1742,  0.4049,  ...,  0.4532, -0.3004, -0.3409],
         [-0.1794,  0.1883,  0.3737,  ...,  0.5110,  0.0917, -0.0911]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.5659, -0.2788,  0.4922, -0.8914, -0.3829,  0.1968, -0.4981,  0.2471,
         -0.4172,  0.9456, -0.0493, -0.9400,  0.3516, -0.3494,  0.8038, -0.2519,
          0.6748, -0.2893, -0.9357, -0.4496, -0.9667,  0.3111, -0.3528, -0.9244,
         -0.8257,  0.2901,  0.3113, -0.5230, -0.3764,  0.4360,  0.4629,  0.3085,
          0.7999, -0.5097, -0.3736,  0.3191,  0.3415,  0.2074,  0.0344, -0.4482,
         -0.2386, -0.2526,  0.4849, -0.4552,  0.8423, -0.4346, -0.5110, -0.5223,
          0.2401, -0.4787, -0.3525,  0.9060,  0.4517,  0.2845,  0.2989, -0.7436,
          0.8646, -0.3246,