# Vaje 12: Analiza besedil in vpetja podatkov

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

Uporabljali bomo podatkovno množico recenzij filmov iz IMDB-ja

In [None]:
!pip install datasets

In [2]:
from datasets import load_dataset

# Naloži podatkovno množico IMDb
imdb_dataset = load_dataset('imdb')

# Izbremo 20000 podatkov iz učne množice
data = pd.DataFrame(imdb_dataset['train'].shuffle(seed=42).select(range(20000)))

# Ciljno vrednost spremenimo iz True v "pos" in iz False v "neg"
data['label'] = data['label'].apply(lambda x: 'pos' if x else 'neg')

data.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,text,label
0,There is no relation at all between Fortier an...,pos
1,This movie is a great. The plot is very true t...,pos
2,"George P. Cosmatos' ""Rambo: First Blood Part I...",neg
3,In the process of trying to establish the audi...,pos
4,"Yeh, I know -- you're quivering with excitemen...",neg


# Predprocesiranje: Čiščenje podatkov

Ena izmed najbolj pomembnih nalog pri analizi besedil je predprocesiranje podatkov. Besedila moramo torej spraviti v format, ki je ustrezen za algoritme strojnega učenja (torej matrike/tenzorje). Poglejmo si torej nekaj tehnik predprocesiranja, ki so skoraj obvezne, ko delamo z besedili:

1. **Tokenizacija**: Čeprav na besede ponavadi gledamo kot na celoto, so le-te ponavadi sestavljene iz več delov, ki so skupni več besedam (naprimer: nogomet, rokomet, ...). Da bolje ujamemo te povezave med besedami, jih zato ponavadi razbijemo na manjše dele. Dodatno nam tokenizacija omogoča sestavljanje/kodiranje novih (še ne videnih) besed in manjšo množico vhodnih besed.

2. **Lowercasing**: Da se izognemu razlikovanja med isto besedo, ko je napisana z veliko in malo začetnico, vse velike črke pretvorimo v male brez da bi izgubili veliko informacije o strukturi in vsebini besedila.

3. **Odstranjevanje besed brez pomena**: Če pogledamo porazdelitev besed v besedilu opazimo, da so nekatere besede, ki se pogosto pojavijo brez veliko pomena (v angleščini naprimer "the", "is", "and"). Ker te besede pogosto ne prispevajo k naši analizi, jih pogosto odstranimo.

4. **Odstranjevanje ločil**: Pogosto odstranimo tudi ločila, saj ponavadi ne dodajo nič informacije, ki bi bila za nas pomembna.

5. **Normalizacija**: Posebaj v slovenščini, se lahko beseda pojavi v zelo različnih oblikah, na primer z različnimi končnicami. Z lematizacijo vse oblike besede poenotimo v eno, imenovano lema.


In [3]:
!pip install nltk



In [3]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Naložimo potrebne datoteke, ki nam bodo pomagale pri predprocesiranju
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Inicializiramo lematizaro in stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     /home/sebastianmeznar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sebastianmeznar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sebastianmeznar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Sestavimo funkcijo, ki bo besedilo predprocesiralo
def preprocess_text(text):
    # Besedilo tokeniziramo
    words = word_tokenize(text.lower())  # Črke spremenimo v male

    # Znebimo se ločil
    table = str.maketrans('', '', string.punctuation)
    words = [word.translate(table) for word in words if word.isalpha()]

    # Odstranimo pogoste besede z malo informacije
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Besedilo lematiziramo 
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # Namesto lematizacije lahko s stemmingom odstanimo prefixe in suffixe besed
    # stemmed_words = [stemmer.stem(word) for word in words]

    # Predelane besede združimo nazaj v string
    preprocessed_text = ' '.join(lemmatized_words)
    return preprocessed_text

In [5]:
# Pokličemo funckijo na podatkih
data['clean_text'] = data['text'].apply(preprocess_text)

Preverimo, kako sedaj besedilo zgleda.

In [6]:
data['clean_text'][0]

'relation fortier profiler fact police series violent crime profiler look crispy fortier look classic profiler plot quite simple fortier plot far complicated fortier look like prime suspect spot similarity main character weak weirdo clairvoyance people like compare judge evaluate enjoying funny thing people writing fortier look american hand arguing prefer american series maybe language spirit think series english american way actor really good funny acting superficial'

In [7]:
# Podatke razdelimo na učno in testno množico
X_train, X_test, y_train, y_test = train_test_split(data[['clean_text', 'text']], data['label'], test_size=0.2, random_state=42)

## Naloga 1: Pretvarjanje besedil v numerične spremenljivke z TD-IDF-jem

Term Frequency-Inverse Document Frequency (TD-IDF) je numerična statistična metoda, ki se pri procesiranju naravnega jezika uporablja za ocenjevanje pomembnosti besed v dokumentu znotraj zbirke dokumentov (korpusa).

Izračuna se na sledeč način:

1. **Term Frequency (TF):** meri kako pogosto se term (beseda) pojavi v besedilu.To izračunamo tako, da število pojav besede delimo s številom besed v dokumentu. Ideja je, da so bolj pogoste besede v besedilu bolj pomembne.

   $ \text{TF}(t, d) = \frac{\text{Število pojavitev terma } t \text{ znotraj dokumenta } d}{\text{Število vseh pojavitev termov v dokumentu } d} $

2. **Inverse Document Frequency (IDF):** Ta del izračuna pomembnost posamezne besede znotraj zbirke dokumentov. Ideja za tem je, da so besede, ki se redko pojavijo znotraj zbirke besedil bolj pomembne.
   $ \text{IDF}(t, D) = \log{\left(\frac{\text{Število dokumentov v korpusu } D}{\text{Število dokumentov, ki vsebuje term } t}\right)} + 1$

3. **TF-IDF:** Produkt TF in IDF-ja. Da visoko težo besedam, ki se pogosto pojavijo znotraj sprecifičnega dokumenta, a redko znotraj zbirke dokumentov. Besede, ki se pojavijo v veliko dokumentih imajo torej nizko težo.
   $ \text{TF-IDF}(t, d, D) = \text{TF}(t, d) \times \text{IDF}(t, D) $

Using TF-IDF, you can represent each document as a numerical vector where each dimension represents a term and its importance in that document. This technique is widely used in information retrieval, text mining, and search engine optimization, helping to determine the relevance of a document to a query or to analyze the significance of terms within documents.

S pomočjo razreda TfidfVectorizer pretvori predelana besedila v vektorje in preveri točnost Logistične regresije.

In [8]:
# Model building: Choose and train a classifier
vectorizer = TfidfVectorizer()  # Use TF-IDF vectorizer for text to numerical feature conversion
X_train_vec = vectorizer.fit_transform(X_train['clean_text'])
X_test_vec = vectorizer.transform(X_test['clean_text'])

In [25]:
print(X_train_vec[0])
print(X_train_vec.shape)
print(X_train_vec[0].nonzero()[1].shape)
print(vectorizer.get_feature_names_out())

  (0, 26948)	0.12197997694243248
  (0, 15564)	0.08720149867673216
  (0, 16152)	0.15991546982424545
  (0, 16823)	0.0443884874444932
  (0, 37959)	0.0627685189461493
  (0, 40833)	0.07481502408279417
  (0, 14558)	0.10420578054405999
  (0, 32434)	0.0861747712062763
  (0, 8788)	0.17390170767620566
  (0, 34792)	0.1114337595826239
  (0, 33044)	0.04492938464553721
  (0, 26674)	0.09278617082914681
  (0, 21515)	0.17147336398733765
  (0, 36543)	0.12972315448674823
  (0, 22475)	0.09896981699184786
  (0, 19348)	0.0566494986219791
  (0, 15748)	0.16440013915737053
  (0, 11981)	0.16875974843430205
  (0, 3188)	0.275688330118949
  (0, 3003)	0.11231047420134478
  (0, 1677)	0.1368198101436671
  (0, 10292)	0.2751727559467969
  (0, 23911)	0.17252736452640924
  (0, 30107)	0.10555478302023709
  (0, 27460)	0.2669110184294683
  (0, 21428)	0.12619990003080675
  (0, 12264)	0.11098948986499531
  (0, 17678)	0.09845642664564698
  (0, 5432)	0.17635843477964916
  (0, 14945)	0.17781705735745004
  (0, 19079)	0.2605027169

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Logistic Regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train_vec, y_train)
logistic_predictions = logistic_model.predict(X_test_vec)
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
print("Logistic Regression Accuracy:", logistic_accuracy)

# Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train_vec, y_train)
rf_predictions = rf_model.predict(X_test_vec)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)

Logistic Regression Accuracy: 0.88625
Random Forest Accuracy: 0.8585


## Naloga 2: Pretvarjanje besedil v numberične spremenljivke z vpetji

Na 8ih vajah smo si ogledali samokodirnike, ki stisnejo originalne podatke v vektorski prostor nizke dimenzije, imenovan latentni prostor. Preslikavi iz originalnega v latentni prostor imenujemo vpetje. Z vpetjem podatkov zmanjšamo razsežnost podatkov in (v primeru dobrega vpetja) ujamemo skrite povezave med podatki. Najbolj znan primer povezav oz lastnosti, ki se pojaviju v vpetju je: od vektorja kralj odštejemo vektor moški in prištejemo vektor ženska in dobimo vektor, ki se dekodira v besedo kraljica.

En izmed najpopularnejših pristopov za vpetje besed v vektorski prostor je Word2Vec. Ideja za pristopom je, da se besede s podobnim pomenom pojavijo v podobnih kontekstih in morajo zato biti njihove predstavitve v latentnem prostoru blizu.

Obstajata dve glavni arhitekturi za Word2vec:

1. **Continuous Bag-of-Words (CBOW):** Model napoveduje verjetnost ciljne besede glede na dane besedi v njeni okolici. Če na primer podamo besede "mačka sedi na", bo model napovedal "preprogi".

2. **Skip-gram:** model deluje v obratni smeri. Na vhod dobi besedo "preproga" in poskusi napovedati besede v okolici, torej "mačka", "sedi", "na"

Oba modela uporabljata usmerjeno nevronsko mrežo z enim samim skritim slojem, s katero se učimo uteži za predstavitev besed z vektorjem. Uteži skritega sloja postanejo vpetja oziroma predstavitev besed z vektorjem.

Word2Vec je imel na procesiranje naravnega jezika velik vpliv, saj se njegova vpetja lahko uporabi za različne naloge, ki so povezane z besedili. En izmed razlogov za to je, da zelo dobro ujame semantične povezave med besedami.

2.a: Natreniraj model Word2Vec iz knjižnjice gensim ([gensim.models.Word2Vec](https://radimrehurek.com/gensim/models/word2vec.html)). Uporabi parametre: vector_size=100, window=5, min_count=1, workers=4, epochs=10. Pred trening vsako besedilo v učni in testni množici razreži na besede s funkcijo `split()`

In [26]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.4-py3-none-any.whl.metadata (23 kB)
Downloading gensim-4.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading smart_open-7.0.4-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: smart-open, gensim
Successfully installed gensim-4.3.2 smart-open-7.0.4


In [27]:
from gensim.models import Word2Vec

tokenized_train_text = [text.split() for text in X_train['clean_text']]
tokenized_test_text = [text.split() for text in X_test['clean_text']]

# Train Word2Vec model
w2v_model = Word2Vec(tokenized_train_text, vector_size=100, window=5, min_count=1, workers=4, epochs=10)

2.b: Model Word2Vec si shrani vse besede in pripadajoče vektorje v spremenljivki wv. Vse besede v vokabularju lahko dobimo v spremenljivki `model.wv.index_to_key`, vektorje pa z ukazom `model.wv[beseda]`. Sestavi slover iz prvih petih besed in pripadajočih vektorjev ter gesla in vrednosti v slovarju izpiši.

In [28]:
# Get all words and their vectors in the Word2Vec model's vocabulary
all_words = w2v_model.wv.index_to_key
word_vectors = {word: w2v_model.wv[word] for word in all_words[:5]}

# Print the word vectors
for word, vector in word_vectors.items():
    print(f"Word: {word}")
    print(f"Vector: {vector}")
    print("\n")  # Add a newline for better readability

Word: br
Vector: [ 1.0304588  -0.99352753  1.2502333   0.9991075  -0.37750974 -0.95246685
 -0.08215574  1.0067925  -0.45983577  0.72306514  0.04959567 -0.9132974
 -0.8457528   1.0041964  -0.9317955  -1.4430221  -0.28187922  0.43200836
 -0.6437984   0.44460648  0.02092529 -0.8753164   1.9030688   1.523923
  1.1657525  -0.16998577  0.83281416 -0.11189803 -0.8396603  -0.13254046
  0.28827408  1.3844548   0.8737769  -0.33922812 -1.3191979  -0.8705202
 -0.72890854 -0.7306533  -0.64392984 -1.870423    0.11011513  1.8222171
 -0.08509613 -0.05540688 -1.3797641  -1.2390989  -2.1060123  -0.5156891
  0.63929766  0.5577973  -0.14040232  0.20853749  1.2979497  -0.5068703
  0.7383669  -0.13384515  0.40067765 -0.8670849  -0.6145245   1.0474917
 -0.5265668  -1.2710965  -1.8136395  -2.9256155   0.8904954  -0.3156277
 -0.4164132  -0.91932905 -1.3663757   0.64272034  1.522156    0.68175864
 -0.07323485 -0.2420179  -0.678943    0.7326429   0.625479   -0.840906
  0.49216625 -0.726544   -0.08493825 -0.45517

2.c: Ukazom `model.wv.most_similar` in parametrom topn, lahko najdemo n najbolj podobnih besed znotraj slovarja. Izpiši prvih 10 najbližjih besed besedi "cat"

In [29]:
# Find similar words to a specific word
similar_words = w2v_model.wv.most_similar('cat', topn=10)

# 'word' is the word for which you want to find similar words, and 'topn' specifies the number of similar words to retrieve

# Print the similar words and their similarity scores
for word, similarity in similar_words:
    print(f"Similar word: {word}, Similarity: {similarity}")


Similar word: dog, Similarity: 0.7748943567276001
Similar word: mouse, Similarity: 0.7126650810241699
Similar word: bird, Similarity: 0.6762747764587402
Similar word: soup, Similarity: 0.665956437587738
Similar word: flavoured, Similarity: 0.6533246040344238
Similar word: fish, Similarity: 0.6445339918136597
Similar word: satan, Similarity: 0.6432642340660095
Similar word: kitty, Similarity: 0.6415029168128967
Similar word: wolf, Similarity: 0.6397603750228882
Similar word: hat, Similarity: 0.6393377780914307


2.d: Model Word2Vec vsaki besedi priredi vektor. V naši nalogi delamo z besedili, ki so dolga več besed zato moramo te vektorje nekako zagregirat, na primer tako, da vektorje besed znotraj vsakega besedila povprečimo. Definiraj funkcijo, ki bo vsako besedilo iz učne in testne množice spremenila v vektor dolžine 100 (če v besedilu ni nobene besede naj bo to vektor ničel). Z dobljeno množico vektorjev preveri točnost logistične regresije za klasifikacijo iz naloge 1.

In [30]:
# Function to get average Word2Vec representation for a sentence
def get_average_w2v(tokens):
    vector_sum = 0
    count = 0
    for word in tokens:
        if word in w2v_model.wv:
            vector_sum += w2v_model.wv[word]
            count += 1
    if count != 0:
        return vector_sum / count
    else:
        return [0] * 100  # Return zero vector if no word found

# Add Word2Vec representations to DataFrame
X_train_w2v = [get_average_w2v(text) for text in tokenized_train_text]
X_test_w2v = [get_average_w2v(text) for text in tokenized_test_text]

In [31]:
# Logistic Regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train_w2v, y_train)
logistic_predictions = logistic_model.predict(X_test_w2v)
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
print("Logistic Regression Accuracy:", logistic_accuracy)

# Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train_w2v, y_train)
rf_predictions = rf_model.predict(X_test_w2v)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)

Logistic Regression Accuracy: 0.8545
Random Forest Accuracy: 0.83025


## Naloga 3: Vnaprej naučeni modeli

Zadnje čase se za procesiranje naravnega jezika uporabljajo predvsem vnaprej naučeni modeli. Vnaprej naučeni modeli so (ponavadi) velike nevronske mreže, ki so naučeni na veliki množici podatkov. Vnaprej naučene modele ponavadi uporabimo kot začetno točko, ki jo dotreniramo za našo nalogo. Ti modeli so uporabni iz več razlogov.

1. **Generalizacija**: Vnaprej naučeni modeli so naučeni na velikih in raznolikih besedilnih korpusih, kar jim omogoča učenje posplošenih reprezentacij jezika. To jim omogoča, da se dokaj dobro izkažejo pri številnih nadaljnjih nalogah brez potrebe dotreniranja za posamezno nalogo.

2. **Učinkovitost virov**: Uporaba Vnaprej naučenih modelov prihrani računalniške vire in čas. Namesto da bi uporabnik modele treniral od začetka, za kar potrebuje precej podatkov in računalniške moči, lahko izkoristi te vnaprej obstoječe, dobro trenirane modele.

3. **Transfer learning**: Vnaprej naučeni modeli omogočajo, da znanje, pridobljeno pri eni nalogi, prenesemo na drugo sorodno nalogo. S dotreniranjem (finetuningom) na določenih naborih podatkov ali nalogah se lahko njihova zmogljivost znatno izboljša z minimalnim dodatnim učenjem.

Dotreniranje se nanaša na postopek, pri katerem se vzame vnaprej naučen model in ga dotrenira na posebnem naboru podatkov ali nalogi. Posledično se njegovi parametri prilagodijo za boljše delovanje v tem posebnem kontekstu. Dotreniranje je pomembno saj:

- **Prilagajanje na posamezno nalogo**: Dotreniranje omogoča modelu, da se prilagodi podatkov ali nalogi.

- **Povečana zmogljivost**: Z dotreniranjem na podatkih, specifičnih za domeno ali nalogo, se lahko model nauči več značilnosti, specifičnih za nalogo, kar izboljša natančnost in učinkovitost za predvideno uporabo.

- **Zmanjšana zahteva po podatkih**: Za dotreniranje modela je pogosto potrebnih manj podatkov kot za učenje modela od začetka. Če začnemo z vnaprej naučenim modelom, se lahko učinkovito učimo iz manjšega nabora podatkov, specifičnega za določeno področje, kar je koristno v scenarijih, kjer je na voljo omejena količina anotiranih podatkov.

In [32]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.41.1-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp310-cp310-many

In [33]:
from transformers import pipeline

# Inicializiramo model za analizo sentimenta
sentiment_analysis = pipeline("sentiment-analysis")

text = "I absolutely love this product! It's fantastic!"

# Naredimo analizo sentimenta za zgornji vzorec
result = sentiment_analysis(text)

# Izpišemo sentiment vzorca in koliko je model "prepričan" v napoved
print(f"Sentiment: {result[0]['label']}, Confidence: {result[0]['score']:.4f}")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Sentiment: POSITIVE, Confidence: 0.9999


3.a: Preveri, če se sentiment prvih 500 primerov sklada s ciljnimi vrednosti (torej, če model vrne "POSITIVE" je napovedana vrednost "pos", če ne "neg").

In [34]:
from tqdm import tqdm

trans_y_pred = []
y_test_reset = y_test.reset_index(drop=True)
n = 500

for test_text in tqdm(X_test['text'][:n]):
    result = sentiment_analysis(test_text[:1500])
    sentiment = result[0]['label']
    trans_y_pred.append('pos' if sentiment == 'POSITIVE' else 'neg')

trans_accuracy = accuracy_score(y_test_reset[:n], trans_y_pred)
print("Transformer Accuracy:", trans_accuracy)

100%|██████████| 500/500 [00:22<00:00, 22.64it/s]

Transformer Accuracy: 0.886





## Dodatna naloga (v ang.): Stanza & POS tagging


Stanza is an NLP library developed by the Stanford NLP Group. It's designed for a wide range of natural language processing tasks, including tokenization, part-of-speech tagging, named entity recognition, dependency parsing, and more. Stanza aims to provide efficient and accurate pre-trained models for various languages.

Key features of Stanza include:
- **Pre-Trained Models**: Stanza comes with pre-trained models for multiple languages, allowing users to perform various NLP tasks without training models from scratch.
- **Ease of Use**: It offers a simple and intuitive API for performing different NLP tasks, making it accessible for both beginners and experienced researchers.
- **Accuracy**: Stanza models are known for their high accuracy in different NLP tasks due to their robust training on extensive datasets.
- **Multiple Languages**: Stanza supports multiple languages, making it suitable for multilingual NLP applications.

Stanza provides state-of-the-art performance in various NLP tasks and continues to evolve with advancements in the field of natural language processing.

### Use Case: Text Analysis with Universal POS Tagging using Stanza

Stanza's Universal POS tagging can be highly beneficial in various text analysis tasks. Let's consider a scenario where you have a dataset of customer reviews for a product. By utilizing Stanza's Universal POS tagging, you can perform the following analysis:

1. **Extracting Key Features**: Identify the key features or attributes of the product mentioned in the reviews by analyzing nouns (NOUN) and adjectives (ADJ) tagged using Stanza. This helps in understanding what aspects of the product are being praised or criticized.

2. **Sentiment Analysis**: Analyze sentiments associated with specific parts of speech. For instance, adjectives (ADJ) often reflect sentiments or opinions. By associating adjectives with their corresponding nouns, you can determine the sentiment expressed towards various product features.

3. **Customer Feedback Categorization**: Categorize customer feedback into different categories based on the identified parts of speech. For instance, categorize reviews mentioning "customer service" (PROPN) separately to analyze the sentiment specifically related to that aspect.

4. **Comparative Analysis**: Compare the frequency and sentiment of different parts of speech across different products or time frames to identify trends and patterns in customer opinions.

By utilizing Stanza's Universal POS tagging, you can effectively extract meaningful insights from textual data, enabling better decision-making and improving products or services based on customer feedback.

### Universal POS Tags
- **ADJ**: Adjective
- **ADP**: Adposition
- **ADV**: Adverb
- **AUX**: Auxiliary
- **CCONJ**: Coordinating conjunction
- **DET**: Determiner
- **INTJ**: Interjection
- **NOUN**: Noun
- **NUM**: Numeral
- **PART**: Particle
- **PRON**: Pronoun
- **PROPN**: Proper noun
- **PUNCT**: Punctuation
- **SCONJ**: Subordinating conjunction
- **SYM**: Symbol
- **VERB**: Verb
- **X**: Other

In [35]:
!pip install stanza

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting stanza
  Downloading stanza-1.8.2-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.12.1-py3-none-any.whl.metadata (5.4 kB)
Collecting protobuf>=3.15.0 (from stanza)
  Downloading protobuf-5.26.1-cp37-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting toml (from stanza)
  Using cached toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Downloading stanza-1.8.2-py3-none-any.whl (990 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m990.1/990.1 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading protobuf-5.26.1-cp37-abi3-manylinux2014_x86_64.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.8/302.8 kB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached toml-0.10.2-py2.py

In [36]:
import stanza

# Download English model (change 'en' to the appropriate language code if needed)
stanza.download('en')

# Initialize the English pipeline
nlp = stanza.Pipeline('en', processors='tokenize,pos')

# Sample customer review
sample_review = "The camera quality is amazing, but the battery life could be better."

# Process the review
doc = nlp(sample_review)

# Extract nouns and adjectives
nouns = []
adjectives = []

for sentence in doc.sentences:
    for word in sentence.words:
        if word.upos == 'NOUN':
            nouns.append(word.text)
        elif word.upos == 'ADJ':
            adjectives.append(word.text)

# Print extracted nouns and adjectives
print("Extracted Nouns:", nouns)
print("Extracted Adjectives:", adjectives)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 38.0MB/s]                    
2024-05-23 14:15:01 INFO: Downloaded file to /home/sebastianmeznar/stanza_resources/resources.json
2024-05-23 14:15:01 INFO: Downloading default packages for language: en (English) ...
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/default.zip: 100%|██████████| 527M/527M [00:09<00:00, 55.0MB/s] 
2024-05-23 14:15:12 INFO: Downloaded file to /home/sebastianmeznar/stanza_resources/en/default.zip
2024-05-23 14:15:14 INFO: Finished downloading models and saved to /home/sebastianmeznar/stanza_resources
2024-05-23 14:15:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379k

Extracted Nouns: ['camera', 'quality', 'battery', 'life']
Extracted Adjectives: ['amazing', 'better']


Prirejeno pa vajah Boshko-ta Koloskega (Inteligentni sistemi, FRI)