## 641 - NLP
## Final Project

### Prepared By: Ricardo Zambrano
### Model 1: Bag of Words Encoding - Naive Bayes Classifier and Support Vector Machines

**Session Setup**

In [1]:
import os
import sys
import copy
import codecs
import gzip

import gensim
import smart_open
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import time
import datetime
from datetime import date
from datetime import datetime
from tqdm import tqdm

import collections
from typing import NamedTuple
import random
import json
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn import model_selection
from sklearn import naive_bayes
from sklearn import svm

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

**Loading Labeled Data and Stop Words**

In [2]:
# Setup the path where labeled text files are located 
txtFilesPath = r"C:\Users\rzamb\Desktop\Desktop\UMD\641_Natural_Language_Processing\finalProject\txtFiles"

In [3]:
def extractLalebedFile(filePath):
    """Assumes a file path for a labeled news article saved as a .txt file, a string.
    Returns a dictionary with the news article data"""
    
    # Define regular expression to recognize labels in the labeled text files
    re1 = re.compile(r"^source")
    re2 = re.compile(r"^date")
    re3 = re.compile(r"^section")
    re4 = re.compile(r"^byline")
    re5 = re.compile(r"^inflationPosition")
    re6 = re.compile(r"^title")
    re7 = re.compile(r"^subtitle")
    re8 = re.compile(r"body")
    re9 = re.compile(r"end")
    
    # Open the file in the given path and read the lines in the .txt file
    with open(filePath,'r',encoding="utf8") as f:
        lines = f.readlines()

        textLines = [] # Initializes a list to store the lines of the body of the news article

        for indx in range(len(lines)):
            if bool(re1.search(lines[indx])):
                source = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the source of the article
            if bool(re2.search(lines[indx])):
                dateTxt = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the date of the publication...
                try:
                    datePublish = datetime.strptime(dateTxt, '%Y-%m-%d')             # ... and saves it as a datetime
                except:
                    print("Cannot cast date at ",filePath," article as datetime")
                    datePublish = np.nan
            if bool(re3.search(lines[indx])):
                paperSection = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the section of the publication
            if bool(re4.search(lines[indx])):
                byline = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the author name(s)
            if bool(re5.search(lines[indx])):
                inflationLabel = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the label given to the article
            if bool(re6.search(lines[indx])):
                currTitle = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the title of the news piece
            if bool(re7.search(lines[indx])):
                currSubtitle = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the subtitle of the news piece
            if bool(re8.search(lines[indx])):
                bodyStart = indx+1 # Recoords the line number where the body of the article starts
            if bool(re9.search(lines[indx])):
                bodyEnd = indx # Recoords the line number where the body of the article ends
                
        for indx in range(bodyStart,bodyEnd): # Extracts and format the main text of the article
            currLine = lines[indx].lower() # .lower() replaces capital letters with lower case letters
            currLine = currLine.translate(str.maketrans('', '', string.punctuation)) # To remove punctuation
            currLine = currLine.replace("\n", "") # To remove new line \n

            textLines.append(currLine)
            
        #articleInfo = (source,datePublish,paperSection,currTitle,currSubtitle,byline,inflationLabel)
        mainText = ' '.join(textLines) # Joins the lines of the article in a single string

        # Final touches to pre-process the main text of the article
        # To eliminate trailing or leading space

        if mainText[0].isspace():
            mainText = mainText[1:]

        if mainText[len(mainText)-1].isspace():
            mainText = mainText[:(len(mainText)-1)]
        
        #Builds the dictionary with the news article and the metadata associated with the article
        articleData = {'authors':byline,
                       'date_publish':datePublish,
                       'section':paperSection,
                       'publisher':source,
                       'title':currTitle,
                       'subtitle':currSubtitle,
                       'label':inflationLabel,
                       'maintext':mainText
                      }

        f.close()
        
        return(articleData)

In [4]:
def readLabeledFiles(directory):
    """Assumes a path to a directory where labeled news articles are saved in .txt format, a strig
    Returns a dictionary with all article data, the article data is a dict"""
    
    txtFilesPath =  directory
    listTxtFiles = os.listdir(txtFilesPath)
    
    labeledArticles = {}
    
    for indx in range(len(listTxtFiles)):
        currFilePath = os.path.join(txtFilesPath,listTxtFiles[indx]) # Creates the path for a specific file
        articleMetadata = extractLalebedFile(currFilePath) # Extracts the data from the labeled news article
        labeledArticles[indx] = articleMetadata # Adds article to dict
    
    return(labeledArticles)

In [5]:
# Read a set of stoplist words from filename, assuming it contains one word per line
# Return a python Set data structure (https://www.w3schools.com/python/python_sets.asp)
def load_stopwords(filename):
    stopwords = []
    with codecs.open(filename, 'r', encoding='ascii', errors='ignore') as fp:
        stopwords = fp.read().split('\n')
    return set(stopwords)

# Loading stopwords in order to get smaller vectors
stopwords_file = 'mallet_en_stoplist.txt'
stop_words = load_stopwords(stopwords_file)
stop_words = list(stop_words)

In [6]:
labeledNews = readLabeledFiles(txtFilesPath)

Cannot cast date at  C:\Users\rzamb\Desktop\Desktop\UMD\641_Natural_Language_Processing\finalProject\txtFiles\file18.txt  article as datetime


In [7]:
# Check that the dict was loaded correctly
labeledNews[0]

{'authors': 'Ana Swanson',
 'date_publish': datetime.datetime(2022, 8, 25, 0, 0),
 'section': 'Section B; Column 0; Business/Financial Desk; Pg. 2',
 'publisher': 'The New York Times',
 'title': 'Consumer Demand Is Key To Rise in U.S.',
 'subtitle': '',
 'label': 'Expect Inflation',
 'maintext': 'research has found that americans spending during the pandemic accounted for about 60 of inflation from 201921  supply chain bottlenecks and labor shortages have been a major factor driving inflation in the united states though surging consumer demand ultimately did more to drive up prices in the last two years according to researchers at the federal reserve bank of new york the university of maryland and harvard university  in a blog post on wednesday julian di giovanni the head of climate risk studies in the new york feds research and statistics group summarized findings from a paper presented in june that found higher consumer demand for all types of products during the pandemic was respons

In [8]:
# I am creating this class in order to keep the labeled articls in an inmutable data structure
class labeledArticle(NamedTuple):
    """A class for news articles that have been labeled by a human"""
    articleID: int
    label: str
    body: str

In [9]:
# Creating instances of labeledArticle. The idea is to be able to shuffle and split the articles into 
# training set and test set while keeping track of the labels and article IDs
articlesSample = []
for key,value in labeledNews.items():
    articlesSample.append(labeledArticle(articleID=key,label=value.get('label'),body=value.get('maintext')))

In [10]:
articlesSample[:2]

[labeledArticle(articleID=0, label='Expect Inflation', body='research has found that americans spending during the pandemic accounted for about 60 of inflation from 201921  supply chain bottlenecks and labor shortages have been a major factor driving inflation in the united states though surging consumer demand ultimately did more to drive up prices in the last two years according to researchers at the federal reserve bank of new york the university of maryland and harvard university  in a blog post on wednesday julian di giovanni the head of climate risk studies in the new york feds research and statistics group summarized findings from a paper presented in june that found higher consumer demand for all types of products during the pandemic was responsible for roughly 60 percent of the inflation in the united states between 2019 and 2021  supply shocks  which include shortages of workers raw materials and shipping containers needed to produce and move goods globally  accounted for the

In [11]:
print('label: ',articlesSample[0][1])
print('body: ',articlesSample[0][2])

label:  Expect Inflation
body:  research has found that americans spending during the pandemic accounted for about 60 of inflation from 201921  supply chain bottlenecks and labor shortages have been a major factor driving inflation in the united states though surging consumer demand ultimately did more to drive up prices in the last two years according to researchers at the federal reserve bank of new york the university of maryland and harvard university  in a blog post on wednesday julian di giovanni the head of climate risk studies in the new york feds research and statistics group summarized findings from a paper presented in june that found higher consumer demand for all types of products during the pandemic was responsible for roughly 60 percent of the inflation in the united states between 2019 and 2021  supply shocks  which include shortages of workers raw materials and shipping containers needed to produce and move goods globally  accounted for the remaining 40 percent of infl

In [12]:
# Example of list of labels: [articlesSample[x][1] for x in range(len(articlesSample))]
# Example of list of article bodies: [articlesSample[x][2] for x in range(len(articlesSample))]

In [13]:
X_train, X_test, y_train, y_test = train_test_split([articlesSample[x][2] for x in range(len(articlesSample))], 
                                                    [articlesSample[x][1] for x in range(len(articlesSample))], 
                                                    test_size=0.2)

To make sure the train_test_split() function kept the order of body and labels

In [14]:
X_train[0]



In [15]:
y_train[0]

'Expect Inflation'

In [16]:
sampleBody = X_train[0]
for i in range(len(articlesSample)):
    if articlesSample[i][2] == sampleBody:
        print(articlesSample[i])



**Encoding with BOW model the training set and the test set**

In [17]:
# Sets up the BOW using 2-gram, 3-gram, 4-gram. It filters stop words
vectorizer = CountVectorizer(ngram_range=(2, 4),stop_words = stop_words)

In [18]:
# tokenize and build vocab
vectorizer.fit(X_train)

CountVectorizer(ngram_range=(2, 4),
                stop_words=['say', '', 'goes', 'it', 'look', 'changes', 'not',
                            'now', 'rd', 'well', 'after', 'different', 'r',
                            'hardly', 'down', 'thanks', 'need', 'except',
                            'novel', 'regards', 'thereupon', 'thereafter',
                            'him', 'despite', 'all', 'exactly', 'through', 'we',
                            'his', 'much', ...])

In [19]:
# summarize
print(len(vectorizer.vocabulary_))

42650


In [20]:
# This function encodes the categorical labels
Encoder = LabelEncoder()
y_train_encoded = Encoder.fit_transform(y_train)
y_test_encoded = Encoder.fit_transform(y_test)

In [21]:
y_train_encoded

array([0, 2, 0, 0, 1, 1, 0, 2, 0, 0, 2, 0, 1, 2, 0, 0, 1, 0, 0, 1, 1, 2,
       1, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 2, 1, 2, 2, 1], dtype=int64)

In [22]:
set(y_train)

{'Expect Inflation', 'Inflation will fade away', 'Neutral'}

In [23]:
set(y_test)

{'Expect Inflation', 'Inflation will fade away'}

In [24]:
X_train_BOW = vectorizer.transform(X_train)
X_test_BOW = vectorizer.transform(X_test)

In [25]:
X_train_BOW.shape

(40, 42650)

**Fitting a Naive Bayes Model**

In [26]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train_BOW,y_train_encoded)

MultinomialNB()

In [27]:
# predict the labels on validation dataset
predictions_NB = Naive.predict(X_test_BOW)

In [28]:
print(predictions_NB)
print(y_test_encoded)

[0 1 0 1 0 0 0 0 0 0]
[1 0 1 1 0 0 0 0 0 0]


In [29]:
# Use accuracy_score function to get the accuracy
accuracy_bow_nb = accuracy_score(y_test_encoded,predictions_NB)
print("Naive Bayes Accuracy Score -> ",accuracy_bow_nb)

Naive Bayes Accuracy Score ->  0.7


In [30]:
recall_bow_nb = recall_score(y_test_encoded,predictions_NB, average='micro')
print("Naive Bayes Recall Score -> ", recall_bow_nb)

Naive Bayes Recall Score ->  0.7


In [31]:
precision_bow_nb = precision_score(y_test_encoded,predictions_NB, average='micro')
print("Naive Bayes Precision Score -> ", precision_bow_nb)

Naive Bayes Precision Score ->  0.7


In [33]:
target_names = ['Expect inflation', 'Inflation will go away', 'Neutral']
clsf_rep_nb = classification_report(y_test_encoded,predictions_NB)
print(clsf_rep_nb)

              precision    recall  f1-score   support

           0       0.75      0.86      0.80         7
           1       0.50      0.33      0.40         3

    accuracy                           0.70        10
   macro avg       0.62      0.60      0.60        10
weighted avg       0.68      0.70      0.68        10



**Fitting an SVM Model**

In [34]:
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_BOW,y_train_encoded)

SVC(gamma='auto', kernel='linear')

In [35]:
# predict the labels on test dataset
predictions_SVM = SVM.predict(X_test_BOW)

In [36]:
print(predictions_SVM)
print(y_test_encoded)

[0 2 0 0 0 0 0 2 2 0]
[1 0 1 1 0 0 0 0 0 0]


In [37]:
# Use accuracy_score function to get the accuracy
accuracy_bow_svm = accuracy_score(y_test_encoded,predictions_SVM)
print("SVM Accuracy Score -> ",accuracy_bow_svm)

SVM Accuracy Score ->  0.4


In [38]:
recall_bow_svm = recall_score(y_test_encoded,predictions_SVM, average='micro')
print("SVM Recall Score -> ", recall_bow_svm)

SVM Recall Score ->  0.4


In [39]:
precision_bow_svm = precision_score(y_test_encoded,predictions_SVM, average='micro')
print("SVM Precision Score -> ", precision_bow_svm)

SVM Precision Score ->  0.4


In [40]:
target_names = ['Expect inflation', 'Inflation will go away', 'Neutral']
clsf_rep_svm = classification_report(y_test_encoded,predictions_SVM, target_names=target_names)
print(clsf_rep_svm)

                        precision    recall  f1-score   support

      Expect inflation       0.57      0.57      0.57         7
Inflation will go away       0.00      0.00      0.00         3
               Neutral       0.00      0.00      0.00         0

              accuracy                           0.40        10
             macro avg       0.19      0.19      0.19        10
          weighted avg       0.40      0.40      0.40        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Encoding with TfIdf model the training set and the test set**

In [41]:
# Sets up the BOW using 2-gram, 3-gram, 4-gram. It filters stop words
vectorizer2 = TfidfVectorizer(ngram_range=(2, 4),stop_words = stop_words)

In [42]:
# tokenize and build vocab
vectorizer2.fit(X_train)

TfidfVectorizer(ngram_range=(2, 4),
                stop_words=['say', '', 'goes', 'it', 'look', 'changes', 'not',
                            'now', 'rd', 'well', 'after', 'different', 'r',
                            'hardly', 'down', 'thanks', 'need', 'except',
                            'novel', 'regards', 'thereupon', 'thereafter',
                            'him', 'despite', 'all', 'exactly', 'through', 'we',
                            'his', 'much', ...])

In [43]:
# summarize
print(len(vectorizer2.vocabulary_))

42650


In [44]:
X_train_Tfidf = vectorizer2.transform(X_train)
X_test_Tfidf = vectorizer2.transform(X_test)

In [45]:
X_train_Tfidf.shape

(40, 42650)

**Fitting a Naive Bayes Model for Tf-idf encoding**

In [46]:
# fit the training dataset on the NB classifier
Naive2 = naive_bayes.MultinomialNB()
Naive2.fit(X_train_Tfidf,y_train_encoded)

MultinomialNB()

In [47]:
# predict the labels on validation dataset
predictions_NB2 = Naive2.predict(X_test_Tfidf)

In [48]:
print(predictions_NB2)
print(y_test_encoded)

[0 0 0 0 0 0 0 0 0 0]
[1 0 1 1 0 0 0 0 0 0]


In [49]:
# Use accuracy_score function to get the accuracy
accuracy_Tfidf_nb = accuracy_score(y_test_encoded,predictions_NB2)
print("Naive Bayes Accuracy Score -> ",accuracy_Tfidf_nb)

Naive Bayes Accuracy Score ->  0.7


In [50]:
recall_Tfidf_nb = recall_score(y_test_encoded,predictions_NB2, average='micro')
print("Naive Bayes Recall Score -> ", recall_Tfidf_nb)

Naive Bayes Recall Score ->  0.7


In [51]:
precision_Tfidf_nb = precision_score(y_test_encoded,predictions_NB2, average='micro')
print("Naive Bayes Precision Score -> ", precision_Tfidf_nb)

Naive Bayes Precision Score ->  0.7


In [53]:
target_names = ['Expect inflation', 'Inflation will go away', 'Neutral']
clsf_rep_Tfidf_nb = classification_report(y_test_encoded,predictions_NB2)
print(clsf_rep_Tfidf_nb)

              precision    recall  f1-score   support

           0       0.70      1.00      0.82         7
           1       0.00      0.00      0.00         3

    accuracy                           0.70        10
   macro avg       0.35      0.50      0.41        10
weighted avg       0.49      0.70      0.58        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Fitting an SVM Model for the TfIdf encoding**

In [54]:
# fit the training dataset on the classifier
SVM2 = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM2.fit(X_train_Tfidf,y_train_encoded)

SVC(gamma='auto', kernel='linear')

In [55]:
predictions_SVM2 = SVM2.predict(X_test_Tfidf)

In [56]:
print(predictions_SVM2)
print(y_test_encoded)

[0 0 0 0 0 0 0 0 0 0]
[1 0 1 1 0 0 0 0 0 0]


In [57]:
# Use accuracy_score function to get the accuracy
accuracy_Tfidf_svm = accuracy_score(y_test_encoded,predictions_SVM2)
print("Naive Bayes Accuracy Score -> ",accuracy_Tfidf_svm)

Naive Bayes Accuracy Score ->  0.7


In [58]:
recall_Tfidf_svm = recall_score(y_test_encoded,predictions_SVM2, average='micro')
print("Naive Bayes Recall Score -> ", recall_Tfidf_svm)

Naive Bayes Recall Score ->  0.7


In [59]:
precision_Tfidf_svm = precision_score(y_test_encoded,predictions_SVM2, average='micro')
print("Naive Bayes Precision Score -> ", precision_Tfidf_svm)

Naive Bayes Precision Score ->  0.7


In [61]:
target_names = ['Expect inflation', 'Inflation will go away', 'Neutral']
clsf_rep_Tfidf_svm = classification_report(y_test_encoded,predictions_SVM2)
print(clsf_rep_Tfidf_svm)

              precision    recall  f1-score   support

           0       0.70      1.00      0.82         7
           1       0.00      0.00      0.00         3

    accuracy                           0.70        10
   macro avg       0.35      0.50      0.41        10
weighted avg       0.49      0.70      0.58        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Encoding with gensim's Doc2Vec and predicting with cosine similarity**

Follows the Doc2Vec model tutorial at: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [62]:
def readNews(news,tokens_only=False):
    """Assumes a list with labeled news article data from a train-test split
    Returns doc2vec.TaggedDocument structure.
    Adapted from Doc2Vec Model Tutorial"""
    for i, article in enumerate(news):
        tokens = gensim.utils.simple_preprocess(article)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [63]:
train_corpus = list(readNews(X_train))
test_corpus = list(readNews(X_test, tokens_only=True))

In [64]:
# Creates the encoding model
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [65]:
# Creates a vocabulary from the feed data
model.build_vocab(train_corpus)

In [66]:
# Let's check how many times the word inflation apears on the labeled news dataset
print(f"Word 'inflation' appeared {model.wv.get_vecattr('inflation', 'count')} times in the training corpus.")

Word 'inflation' appeared 521 times in the training corpus.


In [67]:
# Let's check how many times the word transitory apears on the labeled news dataset
print(f"Word 'transitory' appeared {model.wv.get_vecattr('transitory', 'count')} times in the training corpus.")

Word 'transitory' appeared 4 times in the training corpus.


In [68]:
# This will load the gensim model
start_time = time.time()
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.8068060874938965 seconds ---


In [69]:
# Let's check the document embedings
vector = model.infer_vector("The unemployment rate is too low and this fuels inflation".split())
print(vector)

[-0.21959691 -0.16523121  0.03144318  0.31085932  0.1098144   0.04731079
  0.15995571 -0.00967111 -0.21459682  0.30741775 -0.20229432  0.16568355
  0.10838725  0.08592039 -0.16812086 -0.2548607   0.11954755  0.39605606
 -0.02387948  0.10477146  0.1258252   0.13008118 -0.22675449  0.10589869
 -0.0702756  -0.23844709 -0.3180193  -0.21744446 -0.16108482 -0.43238148
  0.12040122  0.11571792  0.11666974  0.09695289 -0.04072885  0.20876156
  0.02342697 -0.18610595  0.20022662  0.06176469  0.08886606 -0.15506977
  0.08286011 -0.09290649  0.16938923  0.21108295  0.02639672 -0.2749316
  0.11128721 -0.25525537]


**Assessing the Model**
To assess our new model, we’ll first infer new vectors for each document of the training corpus, compare the inferred vectors with the training corpus, and then returning the rank of the document based on self-similarity. Basically, we’re pretending as if the training corpus is some new unseen data and then seeing how they compare with the trained model. The expectation is that we’ve likely overfit our model (i.e., all of the ranks will be less than 2) and so we should be able to find similar documents very easily. Additionally, we’ll keep track of the second ranks for a comparison of less similar documents.

In [70]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [71]:
second_ranks[:5]

[(39, 0.5956165194511414),
 (7, 0.6362031102180481),
 (17, 0.5592270493507385),
 (3, 0.9748966693878174),
 (13, 0.6650760173797607)]

In [72]:
ranks[:5]

[0, 0, 0, 1, 0]

In [73]:
counter = collections.Counter(ranks)
print(counter)

Counter({0: 39, 1: 1})


Basically, greater than 100% of the inferred documents are found to be most similar to itself and about 0% of the time it is mistakenly most similar to another document. Checking the inferred-vector against a training-vector is a sort of ‘sanity check’ as to whether the model is behaving in a usefully consistent manner, though not a real ‘accuracy’ value.

In [74]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (39): «low inflation is indeed the problem of this era thus said john williams president of the federalreserve bank of new york in late espousing the dominant view at the time fast forward to the present and the problem is the exact opposite just about every country in the world has grappled with soaring prices in the situation is all but certain to improve in the coming year but at severe cost to economic growth what made so unusual was the breadth of price pressures the global rate of inflation will finish the year at roughly for many developing countries high inflation is recurrent challenge but the last time that inflation was so elevated in rich countries was the early in america consumer prices are on track to have risen by about in the highest in four decades in germany the rate will be closer to its first bout of doubledigit inflation since the common factors driving up inflation everywhere were soaring fuel and food costs prices for many consumer goods were already tr

We can see that document 39 is most similar to itself

In [75]:
# The label for document 39 is:
y_train[39]

'Inflation will fade away'

In [76]:
# The label for the second most similar document is:
y_train[12]

'Inflation will fade away'

In [77]:
# The label for the least similar document is
y_train[4]

'Inflation will fade away'

In [78]:
# Checking the labels still are alingned with the labels
sampleBody = X_train[39]
for i in range(len(articlesSample)):
    if articlesSample[i][2] == sampleBody:
        print(articlesSample[i])

labeledArticle(articleID=11, label='Inflation will fade away', body='low inflation is indeed the problem of this era thus said john williams president of the federalreserve bank of new york in late 2019 espousing the dominant view at the time fast forward to the present and the problem is the exact opposite just about every country in the world has grappled with soaring prices in 2022 the situation is all but certain to improve in the coming year but at a severe cost to economic growth  what made 2022 so unusual was the breadth of price pressures the global rate of inflation will finish the year at roughly 9 for many developing countries high inflation is a recurrent challenge but the last time that inflation was so elevated in rich countries was the early 1980s in america consumer prices are on track to have risen by about 7 in 2022 the highest in four decades in germany the rate will be closer to 10 its first bout of doubledigit inflation since 1951  the common factors driving up inf

In [79]:
sampleBody = X_train[12]
for i in range(len(articlesSample)):
    if articlesSample[i][2] == sampleBody:
        print(articlesSample[i])

labeledArticle(articleID=35, label='Inflation will fade away', body='the personal consumption expenditures index showed prices increased 55 percent last month as consumer spending pulled back  the federal reserves preferred inflation measure is showing signs of moderating after months of rapid price increases and a closely watched gauge of consumer spending slowed last month a sign that the economy may have less steam as it heads into 2023  the personal consumption expenditures price index climbed 55 percent in november from a year earlier a slowdown from 61 percent in the previous reading stripped of food and fuel costs which jump around a socalled core price measure climbed 47 percent down from 5 percent in the previous reading both figures were roughly in line with economist forecasts  although inflation is slowing it still has a long way to go to return to a more normal pace the fed has raised interest rates at the fastest clip in decades this year as it has tried to temper consume

In [80]:
# Infer Doc2Vec embedding from first article in pre-processed test_corpus
# X_test[0] == test_corpus[0]: Yes
inferredVect = model.infer_vector(test_corpus[0])

In [81]:
# Orderer list of articles from most similar to least similar in the train_corpus
sims = model.dv.most_similar([inferredVect], topn=len(model.dv))

In [82]:
# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(0, ' '.join(test_corpus[0])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (0): «stubbornly high inflation is finally easing as supply chain disruptions fade and interest rate sat year highs put the brakes on demand now federal reserve officials have voiced unease that prices could reaccelerate because labor markets are so tight at issue is what the right way to forecast inflation bottomup analysis of recent readings on prices and wages that puts more weight on pandemicdriven idiosyncrasies or traditional topdown analysis of how far the economy is operating above or below its normal capacity some inside the fed including its influential staff put more weight on the latter which would argue for tighter policy for longer others prefer the former which could argue for milder approach the fed is likely to raise interest rates on wednesday by quarter percentage point to range between and slowing increases for the second consecutive meeting that would give officials more time to study the effects of earlier rate rises they are likely to debate how lon

In [83]:
# The label of the article at test_corpus[0] is:
print(y_test[0])
print(y_test_encoded[0])

Inflation will fade away
1


In [84]:
# The label of the most similar article is:
print(y_train[15])
print(y_train_encoded[15])

Expect Inflation
0


In [85]:
y_pred_cosine = []
y_actual_cosine = []
cosine_similarity = []
most_sim = []
for i, article in enumerate(test_corpus):
    inferredVect = model.infer_vector(test_corpus[i])
    sims = model.dv.most_similar([inferredVect], topn=len(model.dv))
    y_actual_cosine.append(y_test_encoded[i])
    y_pred_cosine.append(y_train_encoded[sims[0][0]])
    cosine_similarity.append(sims[0][1])
    most_sim.append(sims[0][0])

In [86]:
print(y_pred_cosine)
print(y_actual_cosine)

[1, 0, 0, 0, 2, 0, 0, 0, 0, 0]
[1, 0, 1, 1, 0, 0, 0, 0, 0, 0]


In [87]:
most_sim

[31, 15, 17, 14, 21, 11, 8, 0, 2, 2]

In [88]:
cosine_similarity

[0.8179842829704285,
 0.8322340846061707,
 0.6593027114868164,
 0.8612664937973022,
 0.7898670434951782,
 0.7953059673309326,
 0.6782040596008301,
 0.7412326335906982,
 0.618730366230011,
 0.7647850513458252]

In [89]:
# Another check to see the labels are alligned
sampleBody = X_train[25]
for i in range(len(articlesSample)):
    if articlesSample[i][2] == sampleBody:
        print(articlesSample[i])

labeledArticle(articleID=8, label='Expect Inflation', body='investors have been celebrating recent us inflation data which showed both consumer and producer prices rising by less than expected in october but it wouldn’t be wise to get carried away by a single month’s numbers as markets ponder the federal reserve’s next move it’s too soon to be sure that the pace of monetary tightening can be safely scaled back  consumer prices rose 77 in the year through october down from 82 in september and the peak of 91 in june the new rate is the lowest since january before russia’s war on ukraine roiled global energy and commodity markets stripping out the cost of energy and food core cpi inflation is lower and also falling—to 63 from 66 in september the nov 10 report made investors more confident that the fed’s next hike in interest rates will be only 50 basis points ending its recent run of 75point increases  a similar story prevailed with data released on nov 15 the october producer price index

In [90]:
sampleBody = X_test[2]
for i in range(len(articlesSample)):
    if articlesSample[i][2] == sampleBody:
        print(articlesSample[i])

labeledArticle(articleID=15, label='Inflation will fade away', body='we’ve been told for years that inflation has been too low now that it’s finally reached and surpassed the federal reserve’s 2 target it looks as scary as the fastgrowing carnivorous plant in little shop of horrors  iron copper lumber cotton computer chips and gasoline are jumping in price the dollar has weakened making imports more costly employers are having to raise wages to fill record openings the federal government is spending heavily and consumers emerging from the pandemic are in the mood to light some money on fire on may 12 the us bureau of labor statistics reported that consumer prices rose 08 in april from march four times the median expectation and the most since 2009 excluding food and energy the increase was 09 the most since 1982  stock indexes have retreated since may 7 when the sp 500 hit a record but jitters about the economic equivalent of an outofcontrol maneating houseplant are more clearly eviden

In [91]:
# Another check but in a wrong prediction
sampleBody = X_train[18]
for i in range(len(articlesSample)):
    if articlesSample[i][2] == sampleBody:
        print(articlesSample[i])

labeledArticle(articleID=23, label='Expect Inflation', body='the days when worries over the danger of toohigh inflation were a regular feature of economic commentary are long gone that doesn’t mean the possibility of inflation running hotterthanexpected should be dismissed entirely  the labor department on tuesday reported that consumer prices rose 02 in december from november falling shy of the 03 economists surveyed by the wall street journal expected and putting them 23 above their yearearlier level prices excluding food and energy items—the socalled core economists watch to track inflation’s trend—rose 01 on the month putting them up 23 from a year earlier  since the federal reserve’s preferred inflation measure from the commerce department runs cooler than the labor department’s it is likely that inflation once again finished the year short of the central bank’s 2 target fed policy makers’ projections show that on balance they don’t expect inflation to reach 2 until next year  it 

In [92]:
sampleBody = X_test[6]
for i in range(len(articlesSample)):
    if articlesSample[i][2] == sampleBody:
        print(articlesSample[i])

labeledArticle(articleID=18, label='Expect Inflation', body='one of the most important signals of future inflation has begun to ease in the past month a development that should reassure the federal reserve in its prediction that the recent inflation surge will prove largely temporary  that signal is socalled inflation expectations what businesses consumers workers and investors expect inflation to be over the next one to 10 years because such expectations can be selffulfilling economists consider them key to where inflation is going  expectations are tracked through a range of surveys and marketbased measures and most are telling the same story after rising sharply from october through may they have now begun to ease  the median expectation of inflation during the next year for consumers surveyed by the university of michigan shot to 48 this month the highest since august 2008 however consumers’ oneyear expectations are strongly influenced by today’s inflation rate now a 13year high of

In [93]:
# Use accuracy_score function to get the accuracy
accuracy_Doc2Vec_cosine = accuracy_score(y_actual_cosine,y_pred_cosine)
print("Doc2Vec cosine similarity Accuracy Score -> ",accuracy_Doc2Vec_cosine)

Doc2Vec cosine similarity Accuracy Score ->  0.7


In [94]:
recall_Doc2vec_cosine = recall_score(y_actual_cosine,y_pred_cosine, average='micro')
print("Doc2Vec cosine similarity Recall Score -> ",recall_Doc2vec_cosine)

Doc2Vec cosine similarity Recall Score ->  0.7


In [95]:
precision_Doc2vec_cosine = precision_score(y_actual_cosine,y_pred_cosine, average='micro')
print("Doc2Vec cosine similarity Precision Score -> ", precision_Doc2vec_cosine)

Doc2Vec cosine similarity Precision Score ->  0.7


In [96]:
target_names = ['Expect inflation', 'Inflation will go away', 'Neutral']
clsf_rep_Doc2vec_cosine = classification_report(y_actual_cosine,y_pred_cosine, target_names=target_names)
print(clsf_rep_Doc2vec_cosine)

                        precision    recall  f1-score   support

      Expect inflation       0.75      0.86      0.80         7
Inflation will go away       1.00      0.33      0.50         3
               Neutral       0.00      0.00      0.00         0

              accuracy                           0.70        10
             macro avg       0.58      0.40      0.43        10
          weighted avg       0.82      0.70      0.71        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [97]:
print('BOW - Naive Bayes - Baseline')
print(clsf_rep_nb)
print('BOW - SVM')
print(clsf_rep_svm)
print('Tf-idf - Naive Bayes')
print(clsf_rep_Tfidf_nb)
print('Tf-idf - SVM')
print(clsf_rep_Tfidf_svm)
print('doc2vec - Cosine Similarity')
print(clsf_rep_Doc2vec_cosine)

BOW - Naive Bayes - Baseline
              precision    recall  f1-score   support

           0       0.75      0.86      0.80         7
           1       0.50      0.33      0.40         3

    accuracy                           0.70        10
   macro avg       0.62      0.60      0.60        10
weighted avg       0.68      0.70      0.68        10

BOW - SVM
                        precision    recall  f1-score   support

      Expect inflation       0.57      0.57      0.57         7
Inflation will go away       0.00      0.00      0.00         3
               Neutral       0.00      0.00      0.00         0

              accuracy                           0.40        10
             macro avg       0.19      0.19      0.19        10
          weighted avg       0.40      0.40      0.40        10

Tf-idf - Naive Bayes
              precision    recall  f1-score   support

           0       0.70      1.00      0.82         7
           1       0.00      0.00      0.00         3