In [13]:
!pip install datasets



In [14]:
!pip install markovify



In [15]:
!pip install textblob



In [16]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


1. Generare în limba română: Implementați un sistem care transformă un text (corpus) într-un lanț Markov și folosiți-l pentru a generare un proverb sau o poezie în limba română (folosiți fișierele proverbRo.txt sau poezieRo.txt)
 - Varianta 1 – Implementați un lanț Markov cu o singură stare

In [17]:
import random
from collections import defaultdict
import re

def loadText(filename):
  with open(filename, 'r', encoding='utf-8') as file:
    text = file.read().replace('\n', ' ')
  sentences = re.split(r'\.\s*', text.strip('.'))
  words = []
  for sentence in sentences:
    words.extend(sentence.split())
  return words

def createMarkovChain(words):
  chain = defaultdict(list)
  for currentWord, nextWord in zip(words, words[1:] + ['.']):
    chain[currentWord].append(nextWord)
  chain[words[-1]].append('.')
  return chain

def generateText(chain, start, length, maxAttempts=10):
  attempt = 0
  while attempt < maxAttempts:
    currentWord = start.capitalize()
    text = [currentWord]
    for _ in range(length - 1):
      if currentWord not in chain or not chain[currentWord]:
        break
      currentWord = random.choice(chain[currentWord])
      if currentWord == '.':
        if text[-1] != '.':
          text.append(currentWord)
        break
      text.append(currentWord.lower())
    if len(text) >= length or text[-1] == '.':
      return ' '.join(text).rstrip(' .') + '.'
    attempt += 1
    start = random.choice(list(chain.keys()))
  return "Cannot generate."

In [18]:
words = loadText('proverbe.txt')
markovChain = createMarkovChain(words)
start = random.choice(words)
generatedText = generateText(markovChain, start, 10)
print(generatedText)

Buturuga mica rastoarna carul inaintea batranilor sa se plateste prostia.


 - Varianta 2 – Implementați un lanț Markov cu n-stări

In [19]:
def loadText2(filename):
  with open(filename, 'r', encoding='utf-8') as file:
    text = file.read().replace('\n', ' ')
  words = text.replace('.', '').replace('!', '').lower().split()
  return words

def createMarkovChain2(words, n):
  chain = defaultdict(list)
  for i in range(len(words) - n):
    key = tuple(words[i : i+n])
    next = words[i+n]
    chain[key].append(next)
  for key in chain:
    if '.' not in chain[key]:
      chain[key].append('.')
  return chain

def generateText2(chain, start, length):
  if not start or start not in chain:
    return "Invalid start."

  words = list(start)
  words[0] = words[0].capitalize()
  while len(words) < length:
    currentKey = tuple(words[-len(start):])
    if currentKey in chain and chain[currentKey]:
      nextWord = random.choice(chain[currentKey])
      if nextWord == '.':
        continue
      words.append(nextWord)
    else:
      start = tuple(random.choice([key for key in chain.keys()]))
      words.extend(start)
  return ' '.join(words).rstrip(' .') + '.'

In [28]:
words = loadText2('proverbe.txt')
markovChain = createMarkovChain2(words, 3)
start = tuple(random.choice([key for key in markovChain.keys()]))
generatedText = generateText2(markovChain, start, 10)
print(generatedText)

Rai dati cezarului cu ciorba sufla si in iaurt cine.


2. Generare în limba engleză:
 - a. Folosiți biblioteca markovify pentru a genera o strofă de poezie în limba engleză folosind unul din următoarele corpus-uri:

In [21]:
from datasets import load_dataset
import markovify

dataset = load_dataset("biglam/gutenberg-poetry-corpus")

poetryTexts = dataset["train"]["line"]

corpusText = "\n".join(poetryTexts)

textModel = markovify.NewlineText(corpusText)

poetry = ""
while not poetry:
  poetry = "\n".join([textModel.make_sentence() for _ in range(4)])

print(poetry)

With the winds and the pure wild-cherry in bloom!
Far may the stars had disappear'd,
Beware, I say, but only dream.
island. The rich be scaddit.


 - b. Calculați emoția textului generat, puteți folosi una din următoarele resurse:

  - Natural Language Toolkit (nltk) SentimentIntensityAnalyzer
  - TextBlob sentiment

In [22]:
from textblob import TextBlob

blob = TextBlob(poetry)

polarity = blob.sentiment.polarity

if polarity > 0:
    sentiment_category = "Positive"
elif polarity == 0:
    sentiment_category = "Neutral"
else:
    sentiment_category = "Negative"

print("Sentiment Polarity:", polarity)
print("Sentiment Category:", sentiment_category)

Sentiment Polarity: 0.18571428571428572
Sentiment Category: Positive


 - c. Pentru a adresa limitările de creativitate în poezia generată înlocuiți aleator cuvinte cu sinonime. Se cere ca sinonimele să fie obținute folosind embedding-uri. (i.e. Cuvântul ales e transformat în forma sa embedded și se alege embedding-ul cel mai apropiat care este convertit la string)

In [27]:
import spacy
import numpy as np

nlp = spacy.load("en_core_web_md")

doc = nlp(poetry)

def find_most_similar(word, topn=5):
    queried_token = nlp.vocab[word]
    if not queried_token.has_vector:
        return word

    similarities = []
    for token in nlp.vocab:
        if token.has_vector and token.is_lower == queried_token.is_lower and token.text != word:
            similarity = np.dot(queried_token.vector, token.vector) / (np.linalg.norm(queried_token.vector) * np.linalg.norm(token.vector))
            similarities.append((token, similarity))

    similarities = sorted(similarities, key=lambda item: -item[1])
    return similarities[0][0].text if similarities else word

revised_text = []
for token in doc:
    if token.has_vector and not token.is_stop and not token.is_punct:
        similar_word = find_most_similar(token.text)
        revised_text.append(similar_word)
    else:
        revised_text.append(token.text)

revised_text = " ".join(revised_text)

print("Text original:", poetry)
print()
print("Text modificat:", revised_text)


Text original: With the winds and the pure wild-cherry in bloom!
Far may the stars had disappear'd,
Beware, I say, but only dream.
island. The rich be scaddit.

Text modificat: With the island and the rich bloom - bloom in cherry ! 
 Goin' may the o'clock had disappear'd , 
 C++ , I say , but only somethin . 
 where . The pure be scaddit .


 - e. Calculați metrica BLEU (Bilingual Evaluation Understudy Score) pentru poezia aleasă

In [26]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

reference_tokens = [word_tokenize(poetry)]
generated_tokens = word_tokenize(revised_text)

bleu_score = sentence_bleu(reference_tokens, generated_tokens)

print("BLEU score:", bleu_score)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


BLEU score: 0.28846838825512133
