In [1]:
import pandas as pd
import re
from nltk.stem import PorterStemmer

In [2]:
file_path = "HW1/FPB.csv"
fpb = pd.read_csv(file_path, encoding="ISO-8859-1")
fpb.head()

Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


### Text Pre-Processing

In [3]:
# Rename columns for clarity
fpb.columns = ["Sentiment", "Headline"]

In [4]:
import nltk
nltk.download('punkt') # downloads you a model

nltk.download('stopwords') # <--- this is new
from nltk.corpus import stopwords

# print(stop)

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer 


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
ps = PorterStemmer() 
stop = set(stopwords.words('english'))

# return a list of tokens
def pre_processing_by_nltk1(doc, stemming = False, need_sent = False):
    # step 1: get sentences
    sentences = sent_tokenize(doc)
    # step 2: get tokens
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        # step 3 (optional): stemming
        if stemming:
            words = [ps.stem(word) for word in words]
        if need_sent:
            tokens.append(words)
        else:
            tokens += words
    return [w.lower() for w in tokens if w.lower() not in stop]

In [6]:
fpb.Headline[:5].apply(pre_processing_by_nltk1)

0    [technopolis, plans, develop, stages, area, le...
1    [international, electronic, industry, company,...
2    [new, production, plant, company, would, incre...
3    [according, company, 's, updated, strategy, ye...
4    [financing, aspocomp, 's, growth, aspocomp, ag...
Name: Headline, dtype: object

# Problem 1: Word Embeddings

## a
For each of the following words: production, profit, acquisition, investment, job, re-
port the 10 most similar words based on word2vec and GloVe. Briefly discuss which method seems
more reasonable to you or any other findings. Note: This is an open-ended question. Feel free to
propose new ideas.

### Nearest Neighbor Search

- Most similar words based on word embedding vectors

## Word2Vec

In [7]:
from gensim.models import Word2Vec
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize


In [8]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300') 

In [9]:
wv.most_similar(positive=['production'], topn=10)

[('Production', 0.7706561088562012),
 ('output', 0.6554935574531555),
 ('producing', 0.6320029497146606),
 ('produc_tion', 0.623302698135376),
 ('pro_duction', 0.5794395208358765),
 ('producers', 0.5526612997055054),
 ('ouput', 0.5512783527374268),
 ('producton', 0.5293622016906738),
 ('produciton', 0.5268733501434326),
 ('Chapada_mine', 0.5146549344062805)]

In [10]:
wv.most_similar(positive=['profit'], topn=10)

[('profits', 0.8020025491714478),
 ('proft', 0.7426661252975464),
 ('pretax_profit', 0.6691470742225647),
 ('pretax_profits', 0.643925130367279),
 ('Profit', 0.6339422464370728),
 ('earnings', 0.6314975023269653),
 ('Profits', 0.5853961706161499),
 ('revenue', 0.5829346179962158),
 ('pretax', 0.5686764121055603),
 ('quarterly', 0.5639331936836243)]

In [11]:
wv.most_similar(positive=['acquisition'], topn=10)

[('acquisitions', 0.7727051973342896),
 ('acquistion', 0.7558104991912842),
 ('acquisiton', 0.7446470856666565),
 ('acqusition', 0.6910087466239929),
 ('aquisition', 0.6902487874031067),
 ('Acquisition', 0.6753405928611755),
 ('transaction', 0.66414874792099),
 ('merger', 0.6581048369407654),
 ('divestiture', 0.6535748243331909),
 ('acquired', 0.634006917476654)]

In [12]:
wv.most_similar(positive=['investment'], topn=10)

[('investments', 0.8098693490028381),
 ('investing', 0.7012243270874023),
 ('Investment', 0.6812148690223694),
 ('invesment', 0.6630898714065552),
 ('investor', 0.6319271326065063),
 ('invest', 0.6203395128250122),
 ('investors', 0.5946778655052185),
 ('equity', 0.5897506475448608),
 ('investement', 0.5615226626396179),
 ('Investments', 0.5592135787010193)]

In [13]:
wv.most_similar(positive=['job'], topn=10)

[('jobs', 0.6262935996055603),
 ('Job', 0.567025899887085),
 ('BrokeAss_Blog_Need', 0.5589310526847839),
 ('work', 0.5102996826171875),
 ('daunting_Platoni', 0.5058081150054932),
 ('employment', 0.49385008215904236),
 ('monster.com', 0.4883536100387573),
 ('thankless_job', 0.46077287197113037),
 ('rsum', 0.45291033387184143),
 ('temping', 0.4513639807701111)]

## GloVe

In [14]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = 'lecture3/glove.6B/glove.6B.100d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)

model = KeyedVectors.load_word2vec_format(tmp_file)

  _ = glove2word2vec(glove_file, tmp_file)


In [15]:
model.most_similar(positive=['production'], topn=10)

[('producing', 0.7758312821388245),
 ('output', 0.7606295943260193),
 ('produced', 0.755420446395874),
 ('producer', 0.7116324305534363),
 ('manufacturing', 0.7111999988555908),
 ('produce', 0.7051300406455994),
 ('producers', 0.695489764213562),
 ('product', 0.6899324059486389),
 ('sales', 0.6748071908950806),
 ('export', 0.6692141890525818)]

In [15]:
model.most_similar(positive=['profit'], topn=10)

[('profits', 0.9010801315307617),
 ('earnings', 0.8935472965240479),
 ('net', 0.811954140663147),
 ('revenue', 0.8099523186683655),
 ('sales', 0.7789346575737),
 ('pretax', 0.767246425151825),
 ('quarter', 0.7604365348815918),
 ('revenues', 0.7518145442008972),
 ('share', 0.7449917197227478),
 ('gains', 0.7411855459213257)]

In [16]:
model.most_similar(positive=['acquisition'], topn=10)

[('merger', 0.8050162196159363),
 ('purchase', 0.7910113334655762),
 ('transaction', 0.7655937075614929),
 ('acquisitions', 0.749650776386261),
 ('sale', 0.7254613041877747),
 ('venture', 0.7206994295120239),
 ('company', 0.7177446484565735),
 ('acquire', 0.699617326259613),
 ('takeover', 0.6969232559204102),
 ('stake', 0.6833579540252686)]

In [17]:
model.most_similar(positive=['investment'], topn=10)

[('investments', 0.8891236782073975),
 ('fund', 0.7843336462974548),
 ('asset', 0.7737514972686768),
 ('financial', 0.7671132683753967),
 ('firms', 0.7522855997085571),
 ('equity', 0.7512969970703125),
 ('funds', 0.7471414804458618),
 ('business', 0.7369351983070374),
 ('portfolio', 0.7345483303070068),
 ('sector', 0.7308830618858337)]

In [18]:
model.most_similar(positive=['job'], topn=10)

[('jobs', 0.7932984232902527),
 ('better', 0.7354214191436768),
 ('doing', 0.7352380156517029),
 ('working', 0.7307748198509216),
 ('work', 0.7281291484832764),
 ('hiring', 0.7160316705703735),
 ('good', 0.7088245153427124),
 ('done', 0.7070804834365845),
 ('going', 0.7031288146972656),
 ('now', 0.6979781985282898)]

## b

The simplest way to construct sentence embedding from word embedding is to average the
embedding of all words in a sentence, then you will get a 1D vector for each sentence. In this way,
the order of words does not matter. Here we provide some pairs of sentences, calculate the cosine
similarity of these sentences and give a brief discussion on whether these similarities are reasonable
or not. What are the disadvantages of averaging word vectors for the document representation that
you observe? Describe an idea to improve. Note: This is an open-ended question. Feel free to
propose new ideas.

### Average word embedding for a doc

In [19]:
from numpy import dot
from numpy.linalg import norm

In [20]:
def cos_sim(a, b):
    return dot(a, b)/(norm(a)*norm(b))


import numpy as np

def doc2vec(doc, wv):
    vecs = []
    for token in doc.split():
        try:
            vecs.append(wv[token])
        except KeyError:
            pass
    return np.mean(vecs, axis=0)

doc1 = 'This movie is not bad and I would say I do enjoy it.'
doc2 = 'This movie is bad and I would say I do not enjoy it.'

doc3 = 'David is a cricket player and a opening batsman.'
doc4 = 'Leo is a cricket player too. He is a batsman, baller and keeper.'

doc5 = 'I love horror movies.'
doc6 = 'Lights out is a horror movie.'

v1 = doc2vec(doc1, wv)
v2 = doc2vec(doc2, wv)
v3 = doc2vec(doc3, wv)
v4 = doc2vec(doc4, wv)
v5 = doc2vec(doc5, wv)
v6 = doc2vec(doc6, wv)


In [21]:
cos_sim(v1, v2)

1.0

In [22]:
cos_sim(v3, v4)

0.80156016

In [23]:
cos_sim(v5, v6)

0.55689824

## c

Use the above averaging word embedding method, and construct the feature matrix for the
dataset with GloVe and word2vec word vectors, respectively. Feed the text classifier with a feature
matrix, and re-run the text classification pipeline. Compare and analyze their performance.

In [52]:
import numpy as np

# Convert a document to a vector by averaging word vectors
def doc2vec(doc, wv):
    vecs = [wv[word] for word in doc.split() if word in wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(100)  # Returns a zero vector if no valid words are found

# Generate feature vectors using Word2Vec
X_word2vec = np.array([doc2vec(sentence, wv) for sentence in fpb["Headline"]])

# Generate feature vectors using GloVe
X_glove = np.array([doc2vec(sentence, model) for sentence in fpb["Headline"]])

# Extract labels
y = fpb["Sentiment"]

In [53]:
from sklearn.model_selection import train_test_split

# First, split into Train (80%) and Temp (20%)
X_train_w2v, X_temp_w2v, y_train, y_temp = train_test_split(X_word2vec, y, test_size=0.2, random_state=42)

# Then, split Temp (20%) into Validation (10%) and Test (10%)
X_val_w2v, X_test_w2v, y_val, y_test = train_test_split(X_temp_w2v, y_temp, test_size=0.5, random_state=42)

# Repeat for GloVe
X_train_glove, X_temp_glove, _, y_temp = train_test_split(X_glove, y, test_size=0.2, random_state=42)
X_val_glove, X_test_glove, _, _ = train_test_split(X_temp_glove, y_temp, test_size=0.5, random_state=42)

In [54]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define hyperparameter search space
param_grid = {"C": [7, 7.5, 8, 8.5, 9]}

# Initialize Logistic Regression model
logistic_model = LogisticRegression(max_iter=50000)

# Perform grid search using F1 Macro score
grid_search_w2v = GridSearchCV(logistic_model, param_grid, scoring="f1_macro", n_jobs=-1)
grid_search_glove = GridSearchCV(logistic_model, param_grid, scoring="f1_macro", n_jobs=-1)

# Fit models for Word2Vec and GloVe
grid_search_w2v.fit(X_train_w2v, y_train)
grid_search_glove.fit(X_train_glove, y_train)

# Get the best `C` value for each model
best_C_w2v = grid_search_w2v.best_params_["C"]
best_C_glove = grid_search_glove.best_params_["C"]

In [55]:
best_C_w2v

7

In [56]:
best_C_glove

9

In [57]:
# Train final model using best C for Word2Vec
model_w2v = LogisticRegression(C=best_C_w2v, max_iter=50000)
model_w2v.fit(X_train_w2v, y_train)

# Train final model using best C for GloVe
model_glove = LogisticRegression(C=best_C_glove, max_iter=50000)
model_glove.fit(X_train_glove, y_train)

In [58]:
# Make predictions using the best Word2Vec model
y_pred_w2v = model_w2v.predict(X_test_w2v)
y_pred_proba_w2v = model_w2v.predict_proba(X_test_w2v)

# Make predictions using the best GloVe model
y_pred_glove = model_glove.predict(X_test_glove)
y_pred_proba_glove = model_glove.predict_proba(X_test_glove)

In [61]:
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score

# Function to compute evaluation metrics
def evaluate(y_true, y_pred, y_proba):
    return {
        "ROC AUC": roc_auc_score(y_true, y_proba, multi_class="ovr"),
        "Macro-F1": f1_score(y_true, y_pred, average="macro"),
        "Micro-F1": f1_score(y_true, y_pred, average="micro")
    }

# Compute evaluation results
results_w2v = evaluate(y_test, y_pred_w2v, model_w2v.predict_proba(X_test_w2v))
results_glove = evaluate(y_test, y_pred_glove, model_glove.predict_proba(X_test_glove))

# Display results
results_df = pd.DataFrame([
    ["Word2Vec", results_w2v["ROC AUC"], results_w2v["Macro-F1"], results_w2v["Micro-F1"]],
    ["GloVe", results_glove["ROC AUC"], results_glove["Macro-F1"], results_glove["Micro-F1"]]
], columns=["Representation", "ROC AUC", "Macro-F1", "Micro-F1"])
results_df

Unnamed: 0,Representation,ROC AUC,Macro-F1,Micro-F1
0,Word2Vec,0.888229,0.770875,0.8
1,GloVe,0.847638,0.646962,0.717526


## 

# Problem 2: N-gram Language Models

## a

Here we provide a few sentences. You will use these to fit a unigram language model. Report
the P(w) for all possible words and punctuation. Note: You do not need to write code. This is
mostly a mathematics question.

In [62]:
corpus = [
    'How about some major emerging economies?',
    'Elephants are the largest living terrestrial animals in the world.',
    'Weather derivatives are tradable commodities that protect business owners from future changes in the weather.',
    'Most Sunday papers have comics, which children enjoy.',
    'Da Vinci was brilliant in multiple fields!'
]


In [63]:
def prepare_ngrams(corpus, n):
    # collect C(w_{i-n+1}, .. w_i)
    # collect C(w_{i-n+1}, .. w_{i-1})
    counter = dict()
    for sentence in corpus:
        counter[''] = counter.get('', 0) + 1
        tokens = sentence.split()
        tokens = ['<s>'] + tokens + ['</s>']
        # TODO: replace UNKNOWN tokens by <unk>
        for i, token in enumerate(tokens):
            combination = []
            for j in range(i, max(0, i - n + 1) - 1, -1):
                combination.append(tokens[j])
                key = ' '.join(combination)
                counter[key] = counter.get(key, 0) + 1
    return counter

counter = prepare_ngrams(corpus, 2)
counter

{'': 5,
 '<s>': 5,
 'How': 1,
 'How <s>': 1,
 'about': 1,
 'about How': 1,
 'some': 1,
 'some about': 1,
 'major': 1,
 'major some': 1,
 'emerging': 1,
 'emerging major': 1,
 'economies?': 1,
 'economies? emerging': 1,
 '</s>': 5,
 '</s> economies?': 1,
 'Elephants': 1,
 'Elephants <s>': 1,
 'are': 2,
 'are Elephants': 1,
 'the': 3,
 'the are': 1,
 'largest': 1,
 'largest the': 1,
 'living': 1,
 'living largest': 1,
 'terrestrial': 1,
 'terrestrial living': 1,
 'animals': 1,
 'animals terrestrial': 1,
 'in': 3,
 'in animals': 1,
 'the in': 2,
 'world.': 1,
 'world. the': 1,
 '</s> world.': 1,
 'Weather': 1,
 'Weather <s>': 1,
 'derivatives': 1,
 'derivatives Weather': 1,
 'are derivatives': 1,
 'tradable': 1,
 'tradable are': 1,
 'commodities': 1,
 'commodities tradable': 1,
 'that': 1,
 'that commodities': 1,
 'protect': 1,
 'protect that': 1,
 'business': 1,
 'business protect': 1,
 'owners': 1,
 'owners business': 1,
 'from': 1,
 'from owners': 1,
 'future': 1,
 'future from': 1,
 '

## b

Now you have a unigram language model trained from the corpus. Use the model to calculate
the probability of each following sentence.

In [65]:
def get_prob(counter, n, sentence):
    tokens = sentence.split()
    tokens = ['<s>'] + tokens + ['</s>']
    # TODO: replace UNKNOWN tokens by <unk>
    ret = 1
    for i, token in enumerate(tokens):
        comb = []
        for j in range(i - 1, max(0, i - n + 1) - 1, -1):
            comb.append(tokens[j])
        # condition: w_{i - 1} ... w_{i - n + 1}
        condition = ' '.join(comb)
        comb = [token] + comb
        full = ' '.join(comb)
        # P(token | ... )
        condition_count = counter.get(condition, 0)
        if condition_count == 0:
            prob = 0
        else:
            prob = counter.get(full, 0) / condition_count
        print(full, '|', condition, prob)
        ret *= prob
    return ret

In [66]:
doc7 = 'She’d quickly look through the Greek story'
doc8 = 'Older children don’t like being lectured at.'
doc9 = 'Intangible assets are of growing importance in the emerging knowledge-based economy.'

In [67]:
get_prob(counter, 2, doc7)

<s> |  1.0
She’d <s> | <s> 0.0
quickly She’d | She’d 0
look quickly | quickly 0
through look | look 0
the through | through 0
Greek the | the 0.0
story Greek | Greek 0
</s> story | story 0


0.0

In [68]:
get_prob(counter, 2, doc8)

<s> |  1.0
Older <s> | <s> 0.0
children Older | Older 0
don’t children | children 0.0
like don’t | don’t 0
being like | like 0
lectured being | being 0
at. lectured | lectured 0
</s> at. | at. 0


0.0

In [69]:
get_prob(counter, 2, doc9)

<s> |  1.0
Intangible <s> | <s> 0.0
assets Intangible | Intangible 0
are assets | assets 0
of are | are 0.0
growing of | of 0
importance growing | growing 0
in importance | importance 0
the in | in 0.6666666666666666
emerging the | the 0.0
knowledge-based emerging | emerging 0.0
economy. knowledge-based | knowledge-based 0
</s> economy. | economy. 0


0.0

## c

As you can observe, zero probability exists. How would you deal with zero probabilities? Are
they reasonable? Is there any way to overcome this issue? Make it an open-ended discussion.
Note: This is an open-ended question. Feel free to propose new ideas.