How do word embeddings work?<p>
We follow the word-embedding pipeline with spaCy to better understand how comments are broken down into tokens.

In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
import pandas as pd
import numpy as np
import warnings; warnings.simplefilter('ignore')
import os, codecs, string, random
import numpy as np
from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline  

# Seed NumPy's and Python's global random generators with the same value
# so that any randomised step gives the same result on every run.
seed = 42
np.random.seed(seed)
random.seed(seed)

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models

In [8]:
# Read the CSV file that holds all the comments into a DataFrame
data1 = pd.read_csv("data1final.csv")

In [9]:
# Load spaCy's small English pipeline; every analysis function in this
# notebook calls it through the module-level name `nlp`.
nlp = spacy.load('en_core_web_sm')

# Keep only the comment column (position 1 of data1, named "comments" in the
# cells that follow) and the first 100 rows. BERT performs these steps
# automatically; here they are done by hand to understand how they work.
P_comm = data1.iloc[:, [1]]

# .copy() makes P_comms an independent frame. Without it, the new columns
# assigned in later cells are written into a slice of data1, which is the
# chained-assignment pattern pandas reports as SettingWithCopyWarning.
P_comms = P_comm.head(100).copy()

## 1st Step "Tokenization"<p>
We split each comment into meaningful parts, which are called tokens.

In [10]:
# Tokenization: break a comment into its individual tokens
def token(comment):
    """Return the text of every token found in ``comment``.

    Parameters
    ----------
    comment : str
        Text handed to the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        The ``text`` of each token, in document order.
    """
    pieces = []
    for tok in nlp(comment):
        pieces.append(tok.text)
    return pieces

In [11]:
# Tokenize every comment and keep the resulting token list in a new column
P_comms["tokenization"] = P_comms["comments"].apply(token)
P_comms.head()

Unnamed: 0,comments,tokenization
0,sofi 13m yolo earnings let,"[sofi, 13, m, yolo, earnings, let]"
1,update 35m 43m,"[update, 35, m, 43, m]"
2,mmm company “3m multinational conglomerate ope...,"[mmm, company, “, 3, m, multinational, conglom..."
3,call mom regional bank contagion continues,"[call, mom, regional, bank, contagion, continues]"
4,three year regardsbefore employed making money...,"[three, year, regardsbefore, employed, making,..."


## 2nd Step "Part-of-Speech Tagging"<p>
Predict which part-of-speech label is the most likely for each token given its context.

In [12]:
# Part-of-speech tagging: recognise the grammatical function of each word
def pos_comm(comment):
    """Pair every token of ``comment`` with its ``pos_`` label.

    Parameters
    ----------
    comment : str
        Text handed to the module-level ``nlp`` pipeline.

    Returns
    -------
    list of tuple
        ``(token text, pos_ label)`` pairs in document order,
        e.g. ``("mom", "NOUN")``.
    """
    tagged = []
    for tok in nlp(comment):
        tagged.append((tok.text, tok.pos_))
    return tagged

In [13]:
# Store the (token, part-of-speech) pairs of every comment in a new column
P_comms["tok_pos"] = P_comms["comments"].apply(pos_comm)
P_comms.head()

Unnamed: 0,comments,tokenization,tok_pos
0,sofi 13m yolo earnings let,"[sofi, 13, m, yolo, earnings, let]","[(sofi, PROPN), (13, NUM), (m, NOUN), (yolo, A..."
1,update 35m 43m,"[update, 35, m, 43, m]","[(update, VERB), (35, NUM), (m, NOUN), (43, NU..."
2,mmm company “3m multinational conglomerate ope...,"[mmm, company, “, 3, m, multinational, conglom...","[(mmm, PROPN), (company, PROPN), (“, PUNCT), (..."
3,call mom regional bank contagion continues,"[call, mom, regional, bank, contagion, continues]","[(call, VERB), (mom, NOUN), (regional, ADJ), (..."
4,three year regardsbefore employed making money...,"[three, year, regardsbefore, employed, making,...","[(three, NUM), (year, NOUN), (regardsbefore, N..."


In [14]:
# Alternative, more detailed labelling based on the token's `tag_` attribute
def tag_comm(comment):
    """Pair every token of ``comment`` with its ``tag_`` label.

    Parameters
    ----------
    comment : str
        Text handed to the module-level ``nlp`` pipeline.

    Returns
    -------
    list of tuple
        ``(token text, tag_ label)`` pairs in document order,
        e.g. ``("mom", "NN")``.
    """
    labelled = []
    for tok in nlp(comment):
        labelled.append((tok.text, tok.tag_))
    return labelled

In [15]:
# Store the (token, tag) pairs of every comment in a new column
P_comms["tok_tag"] = P_comms["comments"].apply(tag_comm)
P_comms.head()

Unnamed: 0,comments,tokenization,tok_pos,tok_tag
0,sofi 13m yolo earnings let,"[sofi, 13, m, yolo, earnings, let]","[(sofi, PROPN), (13, NUM), (m, NOUN), (yolo, A...","[(sofi, NNP), (13, CD), (m, NN), (yolo, JJ), (..."
1,update 35m 43m,"[update, 35, m, 43, m]","[(update, VERB), (35, NUM), (m, NOUN), (43, NU...","[(update, VB), (35, CD), (m, NN), (43, CD), (m..."
2,mmm company “3m multinational conglomerate ope...,"[mmm, company, “, 3, m, multinational, conglom...","[(mmm, PROPN), (company, PROPN), (“, PUNCT), (...","[(mmm, NNP), (company, NNP), (“, ``), (3, CD),..."
3,call mom regional bank contagion continues,"[call, mom, regional, bank, contagion, continues]","[(call, VERB), (mom, NOUN), (regional, ADJ), (...","[(call, VB), (mom, NN), (regional, JJ), (bank,..."
4,three year regardsbefore employed making money...,"[three, year, regardsbefore, employed, making,...","[(three, NUM), (year, NOUN), (regardsbefore, N...","[(three, CD), (year, NN), (regardsbefore, NN),..."


## 3rd Step "Removing stop words" <p>
List all of spaCy's English stop words.

In [16]:
#recognize stopwords
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words:',list(spacy_stopwords)[:10])

Number of stop words: 326
First ten stop words: ['by', 'perhaps', 'he', 'just', 'eleven', 'every', 'whither', 'per', 'also', 'over']


Identify the stop words that appear in each comment (they are listed here, not yet removed from the text).

In [17]:
# Identify the stop words contained in a comment
def ident_stop_word(comment):
    """Collect the stop words that occur in ``comment``.

    The text itself is not modified: the function only reports the tokens
    whose ``is_stop`` flag is set.

    Parameters
    ----------
    comment : str
        Text handed to the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        Text of each stop-word token, in document order (duplicates kept).
    """
    return [tok.text for tok in nlp(comment) if tok.is_stop]

In [18]:
# Store the stop words found in every comment in a new column
P_comms["stop"] = P_comms["comments"].apply(ident_stop_word)
P_comms.head()

Unnamed: 0,comments,tokenization,tok_pos,tok_tag,stop
0,sofi 13m yolo earnings let,"[sofi, 13, m, yolo, earnings, let]","[(sofi, PROPN), (13, NUM), (m, NOUN), (yolo, A...","[(sofi, NNP), (13, CD), (m, NN), (yolo, JJ), (...",[]
1,update 35m 43m,"[update, 35, m, 43, m]","[(update, VERB), (35, NUM), (m, NOUN), (43, NU...","[(update, VB), (35, CD), (m, NN), (43, CD), (m...",[]
2,mmm company “3m multinational conglomerate ope...,"[mmm, company, “, 3, m, multinational, conglom...","[(mmm, PROPN), (company, PROPN), (“, PUNCT), (...","[(mmm, NNP), (company, NNP), (“, ``), (3, CD),...","[since, well, across, four, many, the, might, ..."
3,call mom regional bank contagion continues,"[call, mom, regional, bank, contagion, continues]","[(call, VERB), (mom, NOUN), (regional, ADJ), (...","[(call, VB), (mom, NN), (regional, JJ), (bank,...",[call]
4,three year regardsbefore employed making money...,"[three, year, regardsbefore, employed, making,...","[(three, NUM), (year, NOUN), (regardsbefore, N...","[(three, CD), (year, NN), (regardsbefore, NN),...","[three, others, well]"


## 4th Step "Lemmatization"
Lemmatization is similar to standardization: each word is reduced to a standard (base) form.

In [19]:
# Lemmatization: map inflected words to their standard form
def lemmatization(comment):
    """List the tokens of ``comment`` whose lemma differs from their text.

    Tokens that are already identical to their lemma are left out.

    Parameters
    ----------
    comment : str
        Text handed to the module-level ``nlp`` pipeline.

    Returns
    -------
    list of tuple
        ``(token text, lemma)`` pairs in document order,
        e.g. ``("continues", "continue")``.
    """
    return [
        (tok.text, tok.lemma_)
        for tok in nlp(comment)
        if tok.text != tok.lemma_
    ]

In [20]:
# Store the (word, lemma) pairs of every comment in a new column
P_comms["lemma"] = P_comms["comments"].apply(lemmatization)
P_comms.head()

Unnamed: 0,comments,tokenization,tok_pos,tok_tag,stop,lemma
0,sofi 13m yolo earnings let,"[sofi, 13, m, yolo, earnings, let]","[(sofi, PROPN), (13, NUM), (m, NOUN), (yolo, A...","[(sofi, NNP), (13, CD), (m, NN), (yolo, JJ), (...",[],"[(earnings, earning)]"
1,update 35m 43m,"[update, 35, m, 43, m]","[(update, VERB), (35, NUM), (m, NOUN), (43, NU...","[(update, VB), (35, CD), (m, NN), (43, CD), (m...",[],[]
2,mmm company “3m multinational conglomerate ope...,"[mmm, company, “, 3, m, multinational, conglom...","[(mmm, PROPN), (company, PROPN), (“, PUNCT), (...","[(mmm, NNP), (company, NNP), (“, ``), (3, CD),...","[since, well, across, four, many, the, might, ...","[(“, ""), (operated, operate), (known, know), (..."
3,call mom regional bank contagion continues,"[call, mom, regional, bank, contagion, continues]","[(call, VERB), (mom, NOUN), (regional, ADJ), (...","[(call, VB), (mom, NN), (regional, JJ), (bank,...",[call],"[(continues, continue)]"
4,three year regardsbefore employed making money...,"[three, year, regardsbefore, employed, making,...","[(three, NUM), (year, NOUN), (regardsbefore, N...","[(three, CD), (year, NN), (regardsbefore, NN),...","[three, others, well]","[(employed, employ), (making, make), (playing,..."


## 5th Step "Chunking"
Chunks are based on noun phrases, meaning that each chunk must contain a noun.

In [21]:
# Chunking: extract the base noun phrases of a comment
def noun(comment):
    """Return the text of every noun chunk found in ``comment``.

    Parameters
    ----------
    comment : str
        Text handed to the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        The ``text`` of each element of ``doc.noun_chunks``, in the order
        the pipeline yields them.
    """
    phrases = []
    for chunk in nlp(comment).noun_chunks:
        phrases.append(chunk.text)
    return phrases

In [22]:
# Store the noun chunks of every comment in a new column
P_comms["noun_chunks"] = P_comms["comments"].apply(noun)
P_comms.head()

Unnamed: 0,comments,tokenization,tok_pos,tok_tag,stop,lemma,noun_chunks
0,sofi 13m yolo earnings let,"[sofi, 13, m, yolo, earnings, let]","[(sofi, PROPN), (13, NUM), (m, NOUN), (yolo, A...","[(sofi, NNP), (13, CD), (m, NN), (yolo, JJ), (...",[],"[(earnings, earning)]",[m]
1,update 35m 43m,"[update, 35, m, 43, m]","[(update, VERB), (35, NUM), (m, NOUN), (43, NU...","[(update, VB), (35, CD), (m, NN), (43, CD), (m...",[],[],[]
2,mmm company “3m multinational conglomerate ope...,"[mmm, company, “, 3, m, multinational, conglom...","[(mmm, PROPN), (company, PROPN), (“, PUNCT), (...","[(mmm, NNP), (company, NNP), (“, ``), (3, CD),...","[since, well, across, four, many, the, might, ...","[(“, ""), (operated, operate), (known, know), (...","[mmm company, m multinational conglomerate, kn..."
3,call mom regional bank contagion continues,"[call, mom, regional, bank, contagion, continues]","[(call, VERB), (mom, NOUN), (regional, ADJ), (...","[(call, VB), (mom, NN), (regional, JJ), (bank,...",[call],"[(continues, continue)]",[mom regional bank contagion]
4,three year regardsbefore employed making money...,"[three, year, regardsbefore, employed, making,...","[(three, NUM), (year, NOUN), (regardsbefore, N...","[(three, CD), (year, NN), (regardsbefore, NN),...","[three, others, well]","[(employed, employ), (making, make), (playing,...","[three year regardsbefore, money life, amzn ot..."


## 8th Step "Dependency parsing"
The combination of a noun with an adjective is stronger than just the noun or the adjective alone.

## 9th Step "Counting word occurrences"<p>
It is important to have an idea about which words are the most popular


In [24]:
# Pull the comments out of the frame as a plain Python list
text = P_comms["comments"].tolist()

# Break every comment on whitespace and flatten the pieces into one list
words = [piece for entry in text for piece in entry.split()]

# Tally how many times each word appears
word_freq = Counter()
word_freq.update(words)

# Keep the five most frequent words together with their counts
common_words = word_freq.most_common(5)

print(common_words)

[('transaction', 41), ('market', 33), ('day', 31), ('today', 27), ('sale', 26)]
