<a href="https://colab.research.google.com/github/saverin0/llms_workshops_files/blob/main/oxford_llm_workshop_6_Sentiment_analysis_with_classic_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install datasets==3.2.0

In [2]:
import spacy  # this is where we import spacy. It is preinstalled on Gooogle colab so no need to install it manually (unless you need some specific version)
import itertools
from datasets import load_dataset
import pandas as pd
import typing as tp

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
data = {
    "text": [
        "I love this product. It is amazing!",
        "This movie is terrible. I didn't like it.",
        "It is such a great day today.",
        "The food at that fancy restaurant was awful.",
    ],
    "label": [0, 1, 0, 1],
}

test_df = pd.DataFrame(data)

In [5]:
test_df

Unnamed: 0,text,label
0,I love this product. It is amazing!,0
1,This movie is terrible. I didn't like it.,1
2,It is such a great day today.,0
3,The food at that fancy restaurant was awful.,1


In [6]:
condition = test_df["label"] == 1  # return rows where label == 1

test_df[condition]

Unnamed: 0,text,label
1,This movie is terrible. I didn't like it.,1
3,The food at that fancy restaurant was awful.,1


In [7]:
condition

Unnamed: 0,label
0,False
1,True
2,False
3,True


In [8]:
# this may help: https://stackoverflow.com/questions/17071871/how-do-i-select-rows-from-a-dataframe-based-on-column-values


def get_texts_by_label(df: pd.DataFrame, label: int) -> tp.List[str]:
    """
    filter data by label and column
    """
    condition = df["label"] == label
    return df["text"][condition]

In [9]:
expected_output_label_1 = [
    "This movie is terrible. I didn't like it.",
    "The food at that fancy restaurant was awful.",
]
expected_output_label_0 = [
    "I love this product. It is amazing!",
    "It is such a great day today.",
]

In [10]:
assert set(get_texts_by_label(test_df, 0)) == set(expected_output_label_0)

In [11]:
assert set(get_texts_by_label(test_df, 1)) == set(expected_output_label_1)

In [12]:
text = "I love this product. It is amazing!"

In [13]:
type(text)

str

In [14]:
doc = nlp(text)

In [15]:
expected_output_label_1

["This movie is terrible. I didn't like it.",
 'The food at that fancy restaurant was awful.']

In [16]:
[len(text) for text in expected_output_label_1]

[41, 44]

In [17]:
def convert_texts_to_spacy_docs(texts: tp.List[str]) -> tp.List[spacy.tokens.doc.Doc]:
    """
    Wrap each text into Spact Doc object.
    """
    return [nlp(text) for text in texts]

In [18]:
docs = convert_texts_to_spacy_docs(expected_output_label_0)

assert all(isinstance(doc, spacy.tokens.doc.Doc) for doc in docs)

In [19]:
# demonstration

for token in doc:
    print(token.text)

I
love
this
product
.
It
is
amazing
!


In [20]:
def get_tokens_from_single_doc(
    doc: spacy.tokens.doc.Doc, lemmatize: bool = False
) -> tp.List[str]:
    """
    get strin representation of tokens from the doc. Add a lemmatize param
    to apply lemmatisation of tokens.
    """
    if lemmatize:
        return [token.lemma_ for token in doc]
    return [token.text for token in doc]


sample_doc = docs[0]  # this will work if you coreeclty solved previous task

tokens = get_tokens_from_single_doc(sample_doc, lemmatize=False)
assert all(isinstance(token, str) for token in tokens)

In [21]:
# demosntration of lemmatizer

get_tokens_from_single_doc(nlp("these birds are beautiful"), lemmatize=True)

['these', 'bird', 'be', 'beautiful']

### 4) Extract tokens from a list of docs.

Given a list of docs (e.g. from task 2, extract tokens from each doc and return list where each element is a list of tokens for given doc.

Expected output should be in the following format ```tokens_from_docs = [["it", "is", "good"], ["this", "movie", "is", "the", "best"]]```. This is a reference examples and tokens here are just for reference. Use it as a guide to understand better type of expected output.

In [22]:
def extract_tokens_from_docs(
    docs: tp.List[spacy.tokens.doc.Doc],
) -> tp.List[tp.List[str]]:
    """
    get tokens from every doc from input docs.
    """
    return [get_tokens_from_single_doc(doc) for doc in docs]


tokens_from_docs = extract_tokens_from_docs(docs)
assert len(tokens_from_docs) == len(docs)

In [23]:
tokens_from_docs

[['I', 'love', 'this', 'product', '.', 'It', 'is', 'amazing', '!'],
 ['It', 'is', 'such', 'a', 'great', 'day', 'today', '.']]

In [24]:
def flatten_tokens(texts: tp.List[tp.List[tp.Any]]) -> tp.List[tp.Any]:
    """
    flatten tokens into single list.
    """
    return list(itertools.chain(*texts))


tokens_from_docs = [["it", "is", "good"], ["this", "movie", "is", "the", "best"]]

flattened_tokens = flatten_tokens(tokens_from_docs)

expected_output = ["it", "is", "good", "this", "movie", "is", "the", "best"]

assert set(flattened_tokens) == set(expected_output)
assert len(flattened_tokens) == len(expected_output)

In [25]:
# tip: a simple way is to use ```collections.Counter``` object


def get_word_counts(tokens: tp.List[tp.Any]) -> tp.Mapping[str, int]:
    """
    Compute word frequences
    """
    counts = {}
    for token in tokens:
        if token in counts.keys():
            counts[token] += 1
        else:
            counts[token] = 1
    return counts


flattened_tokens = ["it", "is", "good", "this", "movie", "is", "the", "best"]

counts = get_word_counts(flattened_tokens)

In [26]:
expected_counts = {
    "it": 1,
    "is": 2,
    "good": 1,
    "this": 1,
    "movie": 1,
    "the": 1,
    "best": 1,
}

assert counts == expected_counts

In [27]:
def get_top_n_frequent_tokens(
    tokens_counts: tp.Mapping[str, int], top_n: int
) -> tp.List[tp.Any]:
    """
    get top frequent tokens.
    """
    return dict(sorted(counts.items(), key=lambda x: x[1], reverse=True)[:top_n])


top_3_freq = get_top_n_frequent_tokens(counts, 3)

expected_top_3_freq = {"is": 2, "it": 1, "good": 1}
assert top_3_freq == expected_top_3_freq

In [28]:
def extract_nouns_from_text(text: str) -> tp.List[str]:
    """
    convert text to Doc object and filter it by Nouns.
    """
    doc = nlp(text)
    return [token.text for token in doc if token.pos_ == "NOUN"]


text = "this sofa is so comfortable but delivery service was not good"

extracted_nouns = extract_nouns_from_text(text)

assert set(extracted_nouns) == {"sofa", "delivery", "service"}

In [29]:
def extract_adj_noun_pairs_from_text(text: str) -> tp.List[str]:
    """
    Convert ted to Doc object and use linguistic dependency parser
    to extract dependent adjectives for existing nouns.
    Return adjective + noun pairs
    """
    doc = nlp(text)
    adj_noun_pairs = []

    for token in doc:
        adj = None
        if token.pos_ == "NOUN":
            noun = token.text
            for child in token.children:
                if child.dep_ == "amod":
                    adj = child.text
                    break

            if adj:
                adj_noun_pairs.append(adj + " " + noun)
    return adj_noun_pairs

In [30]:
text = "this beautiful sofa impressed me very much! Its bright color was exactly what I wanted"

extracted_adj_noun_pairs = extract_adj_noun_pairs_from_text(text)

assert set(extracted_adj_noun_pairs) == {"beautiful sofa", "bright color"}

In [31]:
imdb = load_dataset("imdb", split="test")

In [36]:
imdb = imdb.to_pandas()

In [37]:
imdb = imdb.sample(500, random_state=42).reset_index(drop=True)

In [38]:
imdb.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,266
1,234


In [39]:
def make_analysis(df: pd.DataFrame, label: int) -> None:
    """
    Combine all together and print overall analytical report for given label
    """
    print(f"starting analysis for the label {label}")
    texts = get_texts_by_label(imdb, label)

    docs = convert_texts_to_spacy_docs(texts)
    tokens = extract_tokens_from_docs(docs)
    bag_of_words = flatten_tokens(tokens)
    print(f"total number of extracted words is {len(bag_of_words)} \n")
    word_counts = get_word_counts(bag_of_words)
    top_20_freq_tokens = get_top_n_frequent_tokens(word_counts, 20)
    print("top 20 frequent tokens: \n")
    print(top_20_freq_tokens)
    print("\n")

    nouns = [extract_nouns_from_text(text) for text in texts]
    nouns = flatten_tokens(nouns)
    nouns_counts = get_word_counts(nouns)
    top_20_frequent_nouns = get_top_n_frequent_tokens(nouns_counts, 20)
    print("top 20 frequent nouns: \n")
    print(top_20_frequent_nouns)
    print("\n")

    adj_noun_pairs = [extract_adj_noun_pairs_from_text(text) for text in texts]
    adj_noun_pairs = flatten_tokens(adj_noun_pairs)
    adj_noun_counts = get_word_counts(adj_noun_pairs)
    top_20_frequent_adj_nouns = get_top_n_frequent_tokens(adj_noun_counts, 20)
    print("top 20 frequent adj + nouns pairs: \n")
    print(top_20_frequent_adj_nouns)
    print("\n")

In [40]:
make_analysis(imdb, 0)

starting analysis for the label 0
total number of extracted words is 76372 

top 20 frequent tokens: 

{'is': 2, 'it': 1, 'good': 1, 'this': 1, 'movie': 1, 'the': 1, 'best': 1}


top 20 frequent nouns: 

{'is': 2, 'it': 1, 'good': 1, 'this': 1, 'movie': 1, 'the': 1, 'best': 1}


top 20 frequent adj + nouns pairs: 

{'is': 2, 'it': 1, 'good': 1, 'this': 1, 'movie': 1, 'the': 1, 'best': 1}




In [41]:
make_analysis(imdb, 1)

starting analysis for the label 1
total number of extracted words is 61887 

top 20 frequent tokens: 

{'is': 2, 'it': 1, 'good': 1, 'this': 1, 'movie': 1, 'the': 1, 'best': 1}


top 20 frequent nouns: 

{'is': 2, 'it': 1, 'good': 1, 'this': 1, 'movie': 1, 'the': 1, 'best': 1}


top 20 frequent adj + nouns pairs: 

{'is': 2, 'it': 1, 'good': 1, 'this': 1, 'movie': 1, 'the': 1, 'best': 1}




In [42]:
texts = get_texts_by_label(imdb, 1)

In [43]:
[x for x in texts if "bothersome man" in x]

['Where the heck is Andreas(Trond Fausa Aurvaag), exactly? Heaven? Hell? A parallel universe? When the bothersome man steps off the subway platform and meets an onrushing train, his next conscious moment occurs on a bus; riding solo, the newest arrival, in a dead netherworld where all the suicides go. Dressed as he was at the time of his sudden departure from the corporeal biosphere, Andreas is greeted by an official man, who processes and transports the bothersome man from the barren flatlands to a city, if the eyeballs work, is a dead ringer for the sort of urban landscapes that he once inhabited, if memory serves him right. Andreas retains the look of a sleepwalker in a trance, a man estranged from people and objects, struggling to find his bearings; at home, or rather, his assigned apartment; or at work, where the bothersome man is randomly designated as an accountant for an independent contractor. Havard(Johannes Joner), his boss, tells him, "You\'ll get used to it," which covers 