In [1]:
import os
import sys
import copy
import codecs
import gzip

import gensim
import smart_open
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import time
import datetime
from datetime import date
from datetime import datetime
from tqdm import tqdm

import collections
from typing import NamedTuple
import random
import json
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn import model_selection
from sklearn import naive_bayes
from sklearn import svm

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
import spacy
from nltk.tokenize import word_tokenize

from tensorflow import keras
from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import GRU
from tensorflow.keras.utils import to_categorical

from sklearn.preprocessing import LabelBinarizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


**Loading Labeled Data and Stop Words**

In [3]:
# Setup the path where labeled text files are located 
txtFilesPath = r"C:\Users\rzamb\Desktop\Desktop\UMD\641_Natural_Language_Processing\finalProject\txtFiles"

In [4]:
def extractLalebedFile(filePath):
    """Assumes a file path for a labeled news article saved as a .txt file, a string.
    Returns a dictionary with the news article data"""
    
    # Define regular expression to recognize labels in the labeled text files
    re1 = re.compile(r"^source")
    re2 = re.compile(r"^date")
    re3 = re.compile(r"^section")
    re4 = re.compile(r"^byline")
    re5 = re.compile(r"^inflationPosition")
    re6 = re.compile(r"^title")
    re7 = re.compile(r"^subtitle")
    re8 = re.compile(r"body")
    re9 = re.compile(r"end")
    
    # Open the file in the given path and read the lines in the .txt file
    with open(filePath,'r',encoding="utf8") as f:
        lines = f.readlines()

        textLines = [] # Initializes a list to store the lines of the body of the news article

        for indx in range(len(lines)):
            if bool(re1.search(lines[indx])):
                source = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the source of the article
            if bool(re2.search(lines[indx])):
                dateTxt = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the date of the publication...
                try:
                    datePublish = datetime.strptime(dateTxt, '%Y-%m-%d')             # ... and saves it as a datetime
                except:
                    print("Cannot cast date at ",filePath," article as datetime")
                    datePublish = np.nan
            if bool(re3.search(lines[indx])):
                paperSection = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the section of the publication
            if bool(re4.search(lines[indx])):
                byline = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the author name(s)
            if bool(re5.search(lines[indx])):
                inflationLabel = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the label given to the article
            if bool(re6.search(lines[indx])):
                currTitle = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the title of the news piece
            if bool(re7.search(lines[indx])):
                currSubtitle = re.split(r":",lines[indx])[1].strip().replace('"', '') # Extracts the subtitle of the news piece
            if bool(re8.search(lines[indx])):
                bodyStart = indx+1 # Recoords the line number where the body of the article starts
            if bool(re9.search(lines[indx])):
                bodyEnd = indx # Recoords the line number where the body of the article ends
                
        for indx in range(bodyStart,bodyEnd): # Extracts and format the main text of the article
            currLine = lines[indx].lower() # .lower() replaces capital letters with lower case letters
            currLine = currLine.translate(str.maketrans('', '', string.punctuation)) # To remove punctuation
            currLine = currLine.replace("\n", "") # To remove new line \n

            textLines.append(currLine)
            
        #articleInfo = (source,datePublish,paperSection,currTitle,currSubtitle,byline,inflationLabel)
        mainText = ' '.join(textLines) # Joins the lines of the article in a single string

        # Final touches to pre-process the main text of the article
        # To eliminate trailing or leading space

        if mainText[0].isspace():
            mainText = mainText[1:]

        if mainText[len(mainText)-1].isspace():
            mainText = mainText[:(len(mainText)-1)]
        
        #Builds the dictionary with the news article and the metadata associated with the article
        articleData = {'authors':byline,
                       'date_publish':datePublish,
                       'section':paperSection,
                       'publisher':source,
                       'title':currTitle,
                       'subtitle':currSubtitle,
                       'label':inflationLabel,
                       'maintext':mainText
                      }

        f.close()
        
        return(articleData)

In [5]:
def readLabeledFiles(directory):
    """Assumes a path to a directory where labeled news articles are saved in .txt format, a strig
    Returns a dictionary with all article data, the article data is a dict"""
    
    txtFilesPath =  directory
    listTxtFiles = os.listdir(txtFilesPath)
    
    labeledArticles = {}
    
    for indx in range(len(listTxtFiles)):
        currFilePath = os.path.join(txtFilesPath,listTxtFiles[indx]) # Creates the path for a specific file
        articleMetadata = extractLalebedFile(currFilePath) # Extracts the data from the labeled news article
        labeledArticles[indx] = articleMetadata # Adds article to dict
    
    return(labeledArticles)

In [6]:
# Read a set of stoplist words from filename, assuming it contains one word per line
# Return a python Set data structure (https://www.w3schools.com/python/python_sets.asp)
def load_stopwords(filename):
    stopwords = []
    with codecs.open(filename, 'r', encoding='ascii', errors='ignore') as fp:
        stopwords = fp.read().split('\n')
    return set(stopwords)

# Loading stopwords in order to get smaller vectors
stopwords_file = 'mallet_en_stoplist.txt'
stop_words = load_stopwords(stopwords_file)
stop_words = list(stop_words)

In [7]:
labeledNews = readLabeledFiles(txtFilesPath)

Cannot cast date at  C:\Users\rzamb\Desktop\Desktop\UMD\641_Natural_Language_Processing\finalProject\txtFiles\file18.txt  article as datetime


In [8]:
# I am creating this class in order to keep the labeled articls in an inmutable data structure
class labeledArticle(NamedTuple):
    """A class for news articles that have been labeled by a human"""
    articleID: int
    label: str
    body: str

In [9]:
# Creating instances of labeledArticle. The idea is to be able to shuffle and split the articles into 
# training set and test set while keeping track of the labels and article IDs
articlesSample = []
for key,value in labeledNews.items():
    articlesSample.append(labeledArticle(articleID=key,label=value.get('label'),body=value.get('maintext')))

**Loading GloVe pre-trained embedings**

In [10]:
glove_input_file = '.\glove.6B\glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

  This is separate from the ipykernel package so we can avoid doing imports until


(400000, 100)

In [81]:
# load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
gloveModel = KeyedVectors.load_word2vec_format(filename, binary=False)

**Preprocessing data**

In [15]:
X_train, X_test, y_train, y_test = train_test_split([articlesSample[x][2] for x in range(len(articlesSample))], 
                                                    [articlesSample[x][1] for x in range(len(articlesSample))], 
                                                    test_size=0.0000001)

In [17]:
X_tr = [articlesSample[x][2] for x in range(len(articlesSample))]
y_tr = [articlesSample[x][1] for x in range(len(articlesSample))]

In [20]:
# This function encodes the categorical labels
Encoder = LabelEncoder()
y_train_encoded = Encoder.fit_transform(y_tr)

In [21]:
# Convert integers to dummy variables. Keras expects "one hot" encoding for labels in multiclass problems
y_train_labels = to_categorical(y_train_encoded)

In [22]:
def prepareArticles(article,stopWords):
    punct = string.punctuation+'“'+'”'+'’'
    rawTokens = word_tokenize(article)
    tokens = [token for token in rawTokens if token not in punct]
    return [word for word in tokens if not word in stopWords]

In [53]:
X_train_clean = [prepareArticles(item,stop_words) for item in X_tr]
#X_test_clean = [prepareArticles(item,stop_words) for item in X_test]

**Encoding with gensim's Doc2Vec and predicting with cosine similarity**

Follows the Doc2Vec model tutorial at: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [24]:
def readNews(news,tokens_only=False):
    """Assumes a list with labeled news article data from a train-test split
    Returns doc2vec.TaggedDocument structure.
    Adapted from Doc2Vec Model Tutorial"""
    for i, article in enumerate(news):
        tokens = gensim.utils.simple_preprocess(article)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [54]:
train_corpus = list(readNews(X_tr))

In [55]:
# Creates the encoding model
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [56]:
# Creates a vocabulary from the feed data
model.build_vocab(train_corpus)

In [57]:
# This will load the gensim model
start_time = time.time()
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
print("--- %s seconds ---" % (time.time() - start_time))

--- 1.1649763584136963 seconds ---


In [102]:
def cosineSim(article):
    """Assumes an article
    Returns predicted label based in cosine similarity"""
    process_corpus = list(readNews([article], tokens_only=True))
    inferredVect = model.infer_vector(process_corpus[0])
    sims = model.dv.most_similar([inferredVect], topn=len(model.dv))
    y_pred_cosine = y_train_encoded[sims[0][0]]
    cosine_similarity = sims[0][1]
    most_sim = sims[0][0]
    return y_pred_cosine,cosine_similarity #,most_sim,cosine_similarity

In [69]:
# Checking the labels still are alingned with the labels
#sampleBody = X_tr[13]
#for i in range(len(articlesSample)):
#    if articlesSample[i][2] == sampleBody:
#        print(articlesSample[i])

**Setting up GRU Model**

In [70]:
tokenizer = Tokenizer(num_words=5000)

In [71]:
X_train_prepros = [' '.join(item) for item in X_train_clean]
#X_test_prepros = [' '.join(item) for item in X_test_clean]

In [72]:
tokenizer.fit_on_texts(X_train_prepros)
words_to_index = tokenizer.word_index

In [82]:
# This snippet of code creates a dictionary that maps the words to their respective word embeddings.
word_to_vec_map = {}
wordsNotInGlove = 0
for word in words_to_index.keys():
    try:
        word_to_vec_map[word] = gloveModel[word]
    except:
        wordsNotInGlove = wordsNotInGlove + 1

wordsNotInGlove

498

In [83]:
len(words_to_index)

4984

In [92]:
maxLen = 150

vocab_len = len(words_to_index)+1#-wordsNotInGlove
embed_vector_len = 100

# Initialize the embeding matrix
emb_matrix = np.zeros((vocab_len, embed_vector_len))

# Filling the embeding matrix. Words which are not in the GloVe dictionary being assigned a zero vector.
for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        emb_matrix[index] = embedding_vector
        #emb_matrix[index, :] = embedding_vector

In [93]:
# An Embedding layer should be fed sequences of integers, i.e. a 2D input of shape (samples, indices). 
# These input sequences should be padded so that they all have the same length in a batch of input data 
# (although an Embedding layer is capable of processing sequence of heterogenous length, if you don't 
# pass an explicit input_length argument to the layer).

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, 
                            input_length=maxLen, weights = [emb_matrix], trainable=True)

**Model architecture**

In [94]:
def gruArchitecture(input_shape):

    X_indices = Input(input_shape)

    embeddings = embedding_layer(X_indices) 

    gruLayer = GRU(128,activation='relu')(embeddings) # Removed return_sequences=False kwarg

    dropoutLayer = Dropout(0.1)(gruLayer) # Experiment with 0.2. Originally I used 0.6 too much few data
    
    dense_1 = Dense(128, activation='relu')(dropoutLayer)

    dense_2 = Dense(3, activation='softmax')(dense_1)

    model = Model(inputs=X_indices, outputs=dense_2) 

    return model

In [95]:
X_train_indices = tokenizer.texts_to_sequences(X_train_prepros)

In [96]:
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')

In [97]:
#X_test_indices = tokenizer.texts_to_sequences(X_test_prepros)
#X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')
#preds = lstmGloVe_model.predict(X_test_indices)

In [98]:
gruGloVe_model = gruArchitecture((maxLen,))
gruGloVe_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 100)          498500    
_________________________________________________________________
gru (GRU)                    (None, 128)               87936     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 387       
Total params: 603,335
Trainable params: 603,335
Non-trainable params: 0
_________________________________________________________________


In [99]:
# Starting with smallest lr from LSTM. If in doubt I will test 0.00005
adam = keras.optimizers.Adam(lr = 0.00001) # 0.0001, 0.0002, 0.00001
gruGloVe_model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
gruGloVe_model.fit(X_train_indices, y_train_labels, batch_size=4, epochs=100)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x1cd6563b160>

**Loading CommonCrawl news articles**

In [100]:
articlesPath = r"C:\Users\rzamb\Desktop\Desktop\UMD\641_Natural_Language_Processing\finalProject\cc_download_articles"

In [156]:
folderNames = ['chicago.suntimes.com',
 'cnnpressroom.blogs.cnn.com',
 'edition.cnn.com',
 'finance.yahoo.com',
 'foreignpolicy.com',
 'fortune.com',
 'm.startribune.com',
 'mashable.com',
 'nymag.com',
 'nypost.com',
 'people.com',
 'slate.com',
 'time.com',
 'www.aljazeera.com',
 'www.azcentral.com',
 'www.bbc.co.uk',
 'www.bbc.com',
 'www.bloomberg.com',
 'www.bostonglobe.com',
 'www.breitbart.com',
 'www.cbsnews.com',
 'www.chicagotribune.com',
 'www.cnbc.com',
 'www.cnn.com',
 'www.denverpost.com',
 'www.economist.com',
 'www.entrepreneur.com',
 'www.forbes.com',
 'www.foxbusiness.com',
 'www.foxnews.com',
 'www.ft.com',
 'www.huffpost.com',
 'www.inquirer.com',
 'www.latimes.com',
 'www.nationalreview.com',
 'www.nbcnews.com',
 'www.newsmax.com',
 'www.newyorker.com',
 'www.npr.org',
 'www.nytimes.com',
 'www.pbs.org',
 'www.politico.com',
 'www.reddit.com',
 'www.reuters.com',
 'www.seattletimes.com',
 'www.sfchronicle.com',
 'www.startribune.com',
 'www.stlouisgametime.com',
 'www.theatlantic.com',
 'www.theguardian.com',
 'www.theonion.com',
 'www.usatoday.com',
 'www.vox.com',
 'www.washingtonpost.com',
 'www.wired.com',
 'www.wsj.com']

In [157]:
# To get the date range from the crawled articles
basePath = r"C:\Users\rzamb\Desktop\Desktop\UMD\641_Natural_Language_Processing\finalProject\cc_download_articles"
articlesDates = []

for folder in tqdm(folderNames):
    currFolder = os.path.join(basePath,folder)
    filesInFolder = os.listdir(currFolder) 
    for file in filesInFolder:
        
        # Sets the full path of the current json file
        fullPath = os.path.join(currFolder,file)
        
        # Opening JSON file
        f = open(fullPath, encoding="utf8")
  
        # returns JSON object as a dictionary
        data = json.load(f)
        
        if data['language']=='en':
            # Extracts date of article
            dateArticle = data['date_publish']
            #titleArticle = data['title']
            dt = datetime.strptime(dateArticle, '%Y-%m-%d %H:%M:%S') 

            articlesDates.append(dt)
        
        # Closing file
        f.close()

100%|██████████| 56/56 [00:05<00:00, 10.74it/s]


In [112]:
# Without the 'en' filter the total articles is 14,219
print('Number of articles: ',len(articlesDates))
print('Date range from: ',min(articlesDates))
print('Date range ends in: ',max(articlesDates))

Number of articles:  13096
Date range from:  2020-01-04 00:00:00
Date range ends in:  2022-10-22 00:00:00


**Running classifiers on CommonCrawl news articles**

In [120]:
def tellMeAboutInflationCosine(news):
    """Assumes a json object
    Returns predicted label and cosine similarity based on labeled articles
    Uses cosine similarity model"""
    
    predLabel,cosSim = cosineSim(news['maintext'])
    
    return predLabel,cosSim

In [138]:
def preditInflationPlease(news):
    """Assumes a json object
    Returns predicted label and label probability using GRU RNN model"""
    
    X_test_clean = prepareArticles(news['maintext'],stop_words)
    X_test_prepros = [' '.join(X_test_clean)]

    X_test_indices = tokenizer.texts_to_sequences(X_test_prepros)
    X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

    predLabel = gruGloVe_model.predict(X_test_indices)
    
    maxProb = max(predLabel[0])
    index = np.argmax(predLabel[0])
    
    return index,maxProb

In [143]:
# Quick test of labeling methods

jsonPath = r"C:\Users\rzamb\Desktop\Desktop\UMD\641_Natural_Language_Processing\finalProject\cc_download_articles\www.theatlantic.com\a577a6e4235c97b6fd53641f79867b7899846addb640c08dd22807774812a74d.json"
# Opening JSON file
f = open(jsonPath, encoding="utf8")
  
# returns JSON object as a dictionary
data = json.load(f)

labels = {0:'Expect Inflation',1:'Inflation will fade away',2:'Neutral'}

# Label using cosine similarity
print('Predicted Label:')
print('Cosine Similarity: ',tellMeAboutInflationCosine(data),labels.get(tellMeAboutInflationCosine(data)[0]))
print('GRU RNN: ',preditInflationPlease(data),labels.get(preditInflationPlease(data)[0]))

# Closing file
f.close()

Predicted Label:
Cosine Similarity:  (2, 0.6490380764007568) Neutral
GRU RNN:  (0, 0.47798577) Expect Inflation


In [205]:
# Location of articles in 
basePath = r"C:\Users\rzamb\Desktop\Desktop\UMD\641_Natural_Language_Processing\finalProject\cc_download_articles"

def classifyArticles(locHDD,fldrNames):
    """Assumes the location of articles in HDD and a list of strings with selected folder names. 
    Files must be downloaded from Common Crawl and in json format.
    Returns a pandas dataframe with labels for articles and the stored in location"""
    
    labels = {0:'Expect Inflation',1:'Inflation will fade away',2:'Neutral'}
    
    predLabelsCosine = [] # List of tuples
    predLabelsGRU = [] # List of tuples
    folderName = []
    fileName = []
    publishDates = []
    bylines = []
    titles = []
    inflationWords = []
    
    errCos = 0
    errGru = 0
    
    for folder in tqdm(fldrNames):
        
        currFolder = os.path.join(locHDD,folder)
        filesInFolder = os.listdir(currFolder) 
        
        for file in filesInFolder:
            
            # Sets the full path of the current json file
            fullPath = os.path.join('cc_download_articles',currFolder,file)
            
            # Opening JSON file
            f = open(fullPath, encoding="utf8")

            # returns JSON object as a dictionary
            data = json.load(f)

            if data['language']=='en':
                
                ## ARTICLE METADATA EXTRACTION ##
                
                # Extracts date of article
                dateArticle = data['date_publish']
                #titleArticle = data['title']
                dt = datetime.strptime(dateArticle, '%Y-%m-%d %H:%M:%S') 
                publishDates.append(dt)
                
                # Article location data to check if an article needs closer inspection
                
                currFolder = folder
                currFile = file
                authorNames = data['authors']
                currTitle = data['title']
                
                folderName.append(currFolder)
                fileName.append(currFile)
                bylines.append(authorNames)
                titles.append(currTitle)
                
                ## ARTICLE LABEL ASSIGNMENT ##
                
                # currArticleBody = data['maintext'] the labeling functions should read currArticleBody already
                
                try: 
                    cosLab = tellMeAboutInflationCosine(data)
                except:
                    cosLab = (np.nan,np.nan)
                    errCos = errCos + 1
                
                try:
                    gruLab = preditInflationPlease(data)
                except:
                    gruLab = (np.nan,np.nan)
                    errGru = errGru +1 
                
                predLabelsCosine.append(cosLab)
                predLabelsGRU.append(gruLab)
                
                ## TEST WHETHER INFLATION IN IN THE BODY OF THE TEXT ##
                
                try:
                    inflationBool = 'inflation' in data['maintext']
                    inflationWords.append(inflationBool)
                except:
                    inflationBool = False
                    inflationWords.append(inflationBool)

            # Closing file
            f.close()
    
    labels_cosine = list(map(list, zip(*predLabelsCosine)))
    labels_prob = list(map(list, zip(*predLabelsGRU)))
    
    news_df = pd.DataFrame(list(zip(titles,
                                    labels_cosine[0],labels_cosine[1],
                                   labels_prob[0],labels_prob[1],inflationWords,
                                   publishDates,bylines,fileName,folderName)),
                          columns = ['Title','cosLabel','cosSimilarity','gruLabel','gruProb','inflationKW','publishDate',
                                     'byline','fileName','folderName'])
    
    print('Number of failed cosine similarity: ',errCos)
    print('Number of failed GRU RNN: ',errGru)
    
    return news_df

In [206]:
commonCrawlLabels = classifyArticles(basePath,folderNames)

100%|██████████| 56/56 [10:43<00:00, 11.49s/it]


Number of failed cosine similarity:  477
Number of failed GRU RNN:  477


In [207]:
commonCrawlLabels

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName
0,Cecily Strong leaves ‘SNL’? Not just yet,2.0,0.854143,0.0,0.347478,False,2022-10-02 04:11:06,[],026686a614f171bc6c39285302e04e4bcbb40ed23422ba...,chicago.suntimes.com
1,"Bears WR Velus Jones active, will make NFL debut",2.0,0.783419,0.0,0.347478,False,2022-10-02 15:35:25,[],0b847e789cf155c73577c74bf6dd0d2bf9e84986f9059e...,chicago.suntimes.com
2,Jason Heyward gets another tribute from Cubs,2.0,0.649027,0.0,0.347478,False,2022-10-01 23:05:15,[],0ee0fb18ba8748cb6064f5769b3669bf7e62bb0739c301...,chicago.suntimes.com
3,Lane celebrates Fritz Pollard Day in honor of ...,2.0,0.633406,0.0,0.347478,False,2022-10-01 23:41:54,[],11482401f0686317522808fbeebcd763115dfb10b9dc09...,chicago.suntimes.com
4,"Bears’ offense wants wins, not stats, but gets...",0.0,0.750373,0.0,0.489643,False,2022-10-02 23:37:14,[],12b4aede0d0470fff347c2d82cb025a3650bad50660fbd...,chicago.suntimes.com
...,...,...,...,...,...,...,...,...,...,...
13091,Corrections & Amplifications,1.0,0.646779,0.0,0.347478,False,2022-10-01 00:12:00,[],fac0613d61cb56dc8f93c741a9bfa8ec4134f0422d28e3...,www.wsj.com
13092,Privately Insured Loss From Hurricane Ian Esti...,,,,,False,2022-09-29 00:00:00,[],fb80bf80d74fef1f0523cb9826030852ae563ae67a3b8c...,www.wsj.com
13093,Brazilian Election to Choose Between President...,2.0,0.643208,0.0,0.347478,False,2022-10-02 16:22:00,"[Samantha Pearson, Luciana Magalhaes]",fb82f681814ab681e1806d917beac77ccc9b9837ce65d9...,www.wsj.com
13094,The Next Big Battle Between Google and Apple I...,2.0,0.790457,0.0,0.347478,False,2022-10-01 04:00:00,[Christopher Mims],fde50bcd8047833411bc682dc8960d30d31e722159b4a0...,www.wsj.com


In [208]:
# Finding cases in which the cosine similarity classifier agreed with the GRU RNN classifier
commonCrawlLabels[commonCrawlLabels["cosLabel"] == commonCrawlLabels["gruLabel"]]

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName
4,"Bears’ offense wants wins, not stats, but gets...",0.0,0.750373,0.0,0.489643,False,2022-10-02 23:37:14,[],12b4aede0d0470fff347c2d82cb025a3650bad50660fbd...,chicago.suntimes.com
5,Man shot during carjacking on Near West Side,0.0,0.696861,0.0,0.347478,False,2022-10-03 11:27:48,[],14f1bccf92fa40caaf42262eff532f7ce7b2ee6f201698...,chicago.suntimes.com
6,We must support immigrant children in Chicago ...,0.0,0.561907,0.0,0.347478,False,2022-10-02 22:52:00,[],18b88c3f06dda0f7e9567bc235a4c2dd0d708500c2425f...,chicago.suntimes.com
9,Many possibilities to consider if manager Tony...,0.0,0.703263,0.0,0.617973,False,2022-10-01 01:21:25,[],20563edc6ceaf7607cb27edf6df374de37ea386daac2a6...,chicago.suntimes.com
10,Medinah Temple casino won’t create River North...,0.0,0.707101,0.0,0.523499,False,2022-10-01 15:28:33,[],20b2fd74816319fdae526501e5df636fbba6f69bbd906f...,chicago.suntimes.com
...,...,...,...,...,...,...,...,...,...,...
13063,"With Risks Rising at Home, Putin Takes Anti-We...",0.0,0.680633,0.0,0.347478,False,2022-10-02 13:30:00,[Alan Cullison],c0dbcd9e82dfc328db8455230062dd0e75578542b3cae1...,www.wsj.com
13067,Credit Suisse Seeks to Calm Market Jitters,0.0,0.566161,0.0,0.347478,False,2022-10-02 16:59:00,[Margot Patrick],c6240bac3c845ebdbfc1a3017bfeb273b310dca85f95c6...,www.wsj.com
13068,Corrections & Amplifications,0.0,0.779228,0.0,0.347478,False,2022-10-02 21:52:00,[],cb6beca519c175f86e58b880fea11175835776fe214861...,www.wsj.com
13082,Battered Investors Now Find Thrills in T-Bills,0.0,0.658605,0.0,0.347478,False,2022-10-02 11:00:00,[Matt Grossman],e4d207ee11fe8f4f2d506f7a3f70162d027044b188871a...,www.wsj.com


In [210]:
commonCrawlLabels[(commonCrawlLabels["inflationKW"]==True)]

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName
45,"Health care bills rising, here’s why",0.0,0.715340,0.0,0.454654,True,2022-10-01 13:00:00,[],66d0838df0062b2b323c6f556559842759d513ec092078...,chicago.suntimes.com
129,Liz Truss admits mistakes on controversial tax...,2.0,0.514537,0.0,0.606543,True,2022-10-02 11:34:32,"[Mia Alberti Kara Fox, Mia Alberti, Kara Fox]",7362d17e6d97bbf60a97669a4fe0ff758e2a9f1ff9cc68...,edition.cnn.com
133,Bosnian election: Polls open in a race between...,1.0,0.634953,0.0,0.347478,True,2022-10-02 08:10:02,[],9668ccd8b5f0a24f2bc900f574d68dee12a6224a99a818...,edition.cnn.com
135,Liz Truss faces her party faithful after a dis...,0.0,0.636608,0.0,0.547662,True,2022-10-01 04:24:42,[Luke Mcgee],bb36469ea8d3be0e5cd8bcb429d4b100a33459f9e8d8c0...,edition.cnn.com
150,30-Year Mortgage Rate Spike In 2022 - Animated...,0.0,0.698408,0.0,0.347478,True,2022-10-01 05:42:14,[Julian Hebron],030b53a2e299183de8acd2e7ed3b7fdbbb6f343bd260d5...,finance.yahoo.com
...,...,...,...,...,...,...,...,...,...,...
12998,Supermarket Discounts Are Harder to Find as Fo...,0.0,0.763362,0.0,0.347478,True,2022-10-02 09:30:00,[Jaewon Kang],551ffd12bc87e4e867b5da89c23ef3276a4c2876b4cd69...,www.wsj.com
13000,Cargo Shipowners Cancel Sailings as Global Tra...,0.0,0.717299,0.0,0.347478,True,2022-10-02 13:00:00,[Costas Paris],57871d1e82b612ac6ccde0f3f4e724ad90900b07658519...,www.wsj.com
13029,Strong New-Car Demand Collides With Rising Int...,0.0,0.757889,0.0,0.347478,True,2022-10-03 09:30:00,[Mike Colias],800c4c6cf908320335743eeae558850e58882ad25eca80...,www.wsj.com
13048,S&P Lowers Outlook on U.K. Debt,0.0,0.660143,0.0,0.347478,True,2022-10-01 13:32:00,[Julie Steinberg],9d32762a07a3808d03e8811c7e2b3cdcfaf36f20e91490...,www.wsj.com


In [209]:
# Finding cases in which the cosine similarity classifier agreed with the GRU RNN classifier
commonCrawlLabels[(commonCrawlLabels["inflationKW"]==True)&(commonCrawlLabels["cosLabel"] == commonCrawlLabels["gruLabel"])]

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName
45,"Health care bills rising, here’s why",0.0,0.715340,0.0,0.454654,True,2022-10-01 13:00:00,[],66d0838df0062b2b323c6f556559842759d513ec092078...,chicago.suntimes.com
135,Liz Truss faces her party faithful after a dis...,0.0,0.636608,0.0,0.547662,True,2022-10-01 04:24:42,[Luke Mcgee],bb36469ea8d3be0e5cd8bcb429d4b100a33459f9e8d8c0...,edition.cnn.com
150,30-Year Mortgage Rate Spike In 2022 - Animated...,0.0,0.698408,0.0,0.347478,True,2022-10-01 05:42:14,[Julian Hebron],030b53a2e299183de8acd2e7ed3b7fdbbb6f343bd260d5...,finance.yahoo.com
163,Bonduelle - 21-22 Annual Results: Business gro...,0.0,0.395049,0.0,0.573444,True,2022-10-03 05:00:00,[],0c96763546cde34f07629a618db457d38155662deeb6dc...,finance.yahoo.com
167,"Taichung Commercial Bank Co., Ltd. (""TCB""), Ta...",0.0,0.319875,0.0,0.558800,True,2022-10-01 03:18:00,[],11f8c0b2e4cdaf887ff8c109b4efa38433aaff8b9faff1...,finance.yahoo.com
...,...,...,...,...,...,...,...,...,...,...
12985,"U.K.’s Central Banker Faces Inflation, a Finan...",0.0,0.679745,0.0,0.347478,True,2022-10-02 22:35:00,"[David Luhnow, Max Colchester]",40936e63f0d6520c74a8baa4d045de70197cdba45b9e39...,www.wsj.com
12998,Supermarket Discounts Are Harder to Find as Fo...,0.0,0.763362,0.0,0.347478,True,2022-10-02 09:30:00,[Jaewon Kang],551ffd12bc87e4e867b5da89c23ef3276a4c2876b4cd69...,www.wsj.com
13000,Cargo Shipowners Cancel Sailings as Global Tra...,0.0,0.717299,0.0,0.347478,True,2022-10-02 13:00:00,[Costas Paris],57871d1e82b612ac6ccde0f3f4e724ad90900b07658519...,www.wsj.com
13029,Strong New-Car Demand Collides With Rising Int...,0.0,0.757889,0.0,0.347478,True,2022-10-03 09:30:00,[Mike Colias],800c4c6cf908320335743eeae558850e58882ad25eca80...,www.wsj.com


In [187]:
# Proportion of agreement between classifiers
4714/13096

0.3599572388515577

In [211]:
# Proportion of agreement between classifiers that have the word inflation in the body of the article
228/517

0.4410058027079304

In [212]:
set(commonCrawlLabels[(commonCrawlLabels["inflationKW"]==True)&(commonCrawlLabels["cosLabel"] == commonCrawlLabels["gruLabel"])].iloc[:,1])

{0.0}

In [213]:
set(commonCrawlLabels[(commonCrawlLabels["inflationKW"]==True)&(commonCrawlLabels["cosLabel"] == commonCrawlLabels["gruLabel"])].iloc[:,3])

{0.0}

In [227]:
commonCrawlLabels[(commonCrawlLabels["inflationKW"]==True) & 
                  (commonCrawlLabels["cosSimilarity"]>0.75)]

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName
221,Ontario Lottery and Gaming Corporation - EVENI...,1.0,0.789295,0.0,0.347478,True,2022-10-01 05:49:00,[],4df9bf922cc96368755085ca56f6cd5a5185b93158f478...,finance.yahoo.com
231,Charting for investing success in Q4 2022,0.0,0.785759,0.0,0.347478,True,2022-10-03 11:00:41,[],549bdec731366bc957a224a5d5bb3e57adc8138a4965f1...,finance.yahoo.com
489,Sallie Krawcheck: ‘Women are in worse financia...,2.0,0.764352,0.0,0.532474,True,2022-10-03 12:12:00,[],bb93cf6c4bc53b0708e1c9bb400625fd15f3038019bbec...,fortune.com
493,Europe's ‘unprecedented risk’ of a gas shortag...,0.0,0.754525,0.0,0.543092,True,2022-10-03 11:31:49,[],d1e0292fdbac3d6aeaeb99c09776b03d6a0516b65ae483...,fortune.com
616,401(k) Adviser: It's a good time to review som...,0.0,0.759169,0.0,0.568635,True,2022-10-01 13:00:00,"[Special To The Star Tribune, October, - Am]",c8cff11d79f05fa246f0bc510768a3a4a3850041114740...,m.startribune.com
761,Westchester doctor seeks to topple ‘Squad’ mem...,2.0,0.758427,0.0,0.489677,True,2022-10-02 20:53:31,"[Social Links For Carl Campanile, Contact The ...",1dc1042f1427e365f0952e97508b5a753e1acb194410c8...,nypost.com
859,Dems' tellingly desperate smear of Marc Molinaro,2.0,0.770933,0.0,0.347478,True,2022-09-30 22:57:03,[],5968207c656693e63bde3ab79d102e803311851c3c70a3...,nypost.com
950,Biden’s ‘woke’ green energy makes us dependent...,2.0,0.80501,0.0,0.547572,True,2022-10-01 16:01:30,[Social Links For Michael Shellenberger],95bc3fb6a0fbcd56c917f64003469614a6ee2d2ca8d96c...,nypost.com
1374,Kari Lake or Katie Hobbs? Arizona's governor r...,2.0,0.796118,0.0,0.515861,True,2022-09-08 00:00:00,[],f2f692e0861d1be8e6b78eab5be19260bb45f2e0f4e20d...,www.azcentral.com
1469,PM Liz Truss' mixed news on benefits and pensions,2.0,0.786012,0.0,0.347478,True,2022-10-02 08:29:57,[],1613600f34a5c156febfc3ca999706a42b293bde5c3195...,www.bbc.co.uk


In [226]:
set(commonCrawlLabels[(commonCrawlLabels["inflationKW"]==True) & 
                  (commonCrawlLabels["cosSimilarity"]>0.75)].iloc[:,1])

{0.0, 1.0, 2.0}

In [230]:
commonCrawlLabels[(commonCrawlLabels["inflationKW"]==True) & 
                  (commonCrawlLabels["gruProb"]>0.60)]

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName
129,Liz Truss admits mistakes on controversial tax...,2.0,0.514537,0.0,0.606543,True,2022-10-02 11:34:32,"[Mia Alberti Kara Fox, Mia Alberti, Kara Fox]",7362d17e6d97bbf60a97669a4fe0ff758e2a9f1ff9cc68...,edition.cnn.com
1240,Brazil Holds Historic Election With Lula Again...,2.0,0.608716,0.0,0.608409,True,2022-10-02 12:24:00,"[Diane Jeantet, Mauricio Savarese]",a8f424d1dfa68bb57be6974d328e7a8b3287411aacd0fb...,time.com
5312,Top Wall Street analysts see Alphabet as a buy...,0.0,0.573946,0.0,0.605792,True,2022-10-02 13:04:56,[Tipranks.Com Staff],2fc72280058e54a76cc7ddc2fe37d8012fb6bd7763a338...,www.cnbc.com
5350,Street investors' volatility rodeo,0.0,0.726431,0.0,0.637705,True,2022-10-02 12:00:01,[Yun Li],d2afe7ac34e53225da535caf174bc2f8020f750e64613a...,www.cnbc.com
5392,Liz Truss admits mistakes on controversial tax...,2.0,0.522887,0.0,0.606543,True,2022-10-02 11:34:32,"[Mia Alberti Kara Fox, Mia Alberti, Kara Fox]",325416b660cc506abf136d9db7e4a2a8ecdef9b03b6c1c...,www.cnn.com
7273,UK's Truss Sticks by Economic Plan as Her Part...,2.0,0.561108,0.0,0.603052,True,2022-10-02 06:00:34,[Jill Lawless],a610b1e13475fc5443b9021595922c6c976dcbbfe1e803...,www.newsmax.com
7314,Rep. Mike Turner: Expect 'Wild West' Spending ...,0.0,0.547273,0.0,0.60955,True,2022-10-02 16:28:05,[Sandy Fitzgerald],d127224cdbc230e62f9d2cdd386a2c96a74c012349a19f...,www.newsmax.com
7440,The hidden faces of hunger in America,0.0,0.589257,0.0,0.630872,True,2022-10-02 10:20:07,[Olivia Hampton],573b240a0ae597d80ceb2ced59d53701f72ebfa212b8fc...,www.npr.org
8691,UK’s Truss sticks by economic plan as her part...,2.0,0.567719,0.0,0.603051,True,2022-10-02 02:24:26,[The Associated Press],695fa99f695f8e59cc71eeadfc2b730f9661f4cc29574f...,www.seattletimes.com
8874,The great guinea pig giveaway has begun,0.0,0.593773,0.0,0.619077,True,2022-10-01 14:34:48,"[The New York Times, This Story Was Originally...",a693010b13493c1111c9d65422ebbfc7224516e53e01c3...,www.seattletimes.com


In [231]:
ccInflationArticles = commonCrawlLabels[(commonCrawlLabels["inflationKW"]==True)]
ccInflationArticles

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName
45,"Health care bills rising, here’s why",0.0,0.715340,0.0,0.454654,True,2022-10-01 13:00:00,[],66d0838df0062b2b323c6f556559842759d513ec092078...,chicago.suntimes.com
129,Liz Truss admits mistakes on controversial tax...,2.0,0.514537,0.0,0.606543,True,2022-10-02 11:34:32,"[Mia Alberti Kara Fox, Mia Alberti, Kara Fox]",7362d17e6d97bbf60a97669a4fe0ff758e2a9f1ff9cc68...,edition.cnn.com
133,Bosnian election: Polls open in a race between...,1.0,0.634953,0.0,0.347478,True,2022-10-02 08:10:02,[],9668ccd8b5f0a24f2bc900f574d68dee12a6224a99a818...,edition.cnn.com
135,Liz Truss faces her party faithful after a dis...,0.0,0.636608,0.0,0.547662,True,2022-10-01 04:24:42,[Luke Mcgee],bb36469ea8d3be0e5cd8bcb429d4b100a33459f9e8d8c0...,edition.cnn.com
150,30-Year Mortgage Rate Spike In 2022 - Animated...,0.0,0.698408,0.0,0.347478,True,2022-10-01 05:42:14,[Julian Hebron],030b53a2e299183de8acd2e7ed3b7fdbbb6f343bd260d5...,finance.yahoo.com
...,...,...,...,...,...,...,...,...,...,...
12998,Supermarket Discounts Are Harder to Find as Fo...,0.0,0.763362,0.0,0.347478,True,2022-10-02 09:30:00,[Jaewon Kang],551ffd12bc87e4e867b5da89c23ef3276a4c2876b4cd69...,www.wsj.com
13000,Cargo Shipowners Cancel Sailings as Global Tra...,0.0,0.717299,0.0,0.347478,True,2022-10-02 13:00:00,[Costas Paris],57871d1e82b612ac6ccde0f3f4e724ad90900b07658519...,www.wsj.com
13029,Strong New-Car Demand Collides With Rising Int...,0.0,0.757889,0.0,0.347478,True,2022-10-03 09:30:00,[Mike Colias],800c4c6cf908320335743eeae558850e58882ad25eca80...,www.wsj.com
13048,S&P Lowers Outlook on U.K. Debt,0.0,0.660143,0.0,0.347478,True,2022-10-01 13:32:00,[Julie Steinberg],9d32762a07a3808d03e8811c7e2b3cdcfaf36f20e91490...,www.wsj.com


In [236]:
ccInflationArticles.sort_values(by=['cosSimilarity'],ascending=False).head(25)

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName
4922,Analysis: First Colorado governor's debate foc...,2.0,0.845117,0.0,0.347478,True,2022-10-01 00:43:00,[],d38ca35182a9a247866b73cb5cd0ac3c1c1e25b1c7ed60...,www.cbsnews.com
8254,Malaysia's commodities minister says crude pal...,1.0,0.818895,0.0,0.347478,True,2022-10-03 03:58:22,[],cd660f6706459e8dae9d12701967068e84f238a5670693...,www.reuters.com
6653,A New Mexico Referendum Could Be A Model For I...,2.0,0.817504,0.0,0.522645,True,2022-10-01 16:53:02,[Senior Political Reporter],0f8717faf970713b6a734df96ae16c2537ff3b5caf71ad...,www.huffpost.com
3233,Mini-budget is supported by Welsh Secretary Ro...,2.0,0.815083,0.0,0.347478,True,2022-10-02 12:20:04,[],d49578870d079a83125abb4da8a4841e7327183ee88224...,www.bbc.com
4354,"Tyler Kistner says inflation, not abortion, ke...",2.0,0.811379,0.0,0.347478,True,2022-10-02 16:05:00,[],6c5a37ce9c6d54694cffb481d1fae0d5368fb83da6112a...,www.cbsnews.com
1912,Mini-budget is supported by Welsh Secretary Ro...,0.0,0.810128,0.0,0.347478,True,2022-10-02 12:20:04,[],804073cb9a28a475c45b705329b6edc5469b3db577f562...,www.bbc.co.uk
7584,"September 21, 2022 - PBS NewsHour full episode",1.0,0.808876,0.0,0.347478,True,2022-09-21 20:02:30,[],9f5db424209165030a8a8b8dbd468bd7753b7ff3c8a204...,www.pbs.org
8116,"Euro zone yields rise on inflation angst, gilt...",2.0,0.807292,0.0,0.495538,True,2022-10-03 08:50:39,[Stefano Rebaudo],704c7241c5e8200ba61b13397eb1a221bf752e3147d805...,www.reuters.com
2290,Wales' rail services hit as strike action resumes,2.0,0.805954,0.0,0.347478,True,2022-10-01 11:16:44,[],ce37cbb38d040c967a5d2b15f62e974aacf42cc3d3f0bb...,www.bbc.co.uk
11645,The US ultra-rich justify their low tax rates ...,2.0,0.805336,0.0,0.451006,True,2022-10-02 10:15:34,[Robert Reich],7690c198b7d5df903fc7f79f9312151899987233e2db4f...,www.theguardian.com


In [235]:
ccInflationArticles.sort_values(by=['gruProb'],ascending=False).head(25)

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName
11817,Liz Truss and Kwasi Kwarteng’s foolish dash fo...,0.0,0.534552,0.0,0.638612,True,2022-10-02 06:00:30,[],c264c40d470f3f88b5709e109ff03ce938a9f68f23821b...,www.theguardian.com
5350,Street investors' volatility rodeo,0.0,0.726431,0.0,0.637705,True,2022-10-02 12:00:01,[Yun Li],d2afe7ac34e53225da535caf174bc2f8020f750e64613a...,www.cnbc.com
7440,The hidden faces of hunger in America,0.0,0.589257,0.0,0.630872,True,2022-10-02 10:20:07,[Olivia Hampton],573b240a0ae597d80ceb2ced59d53701f72ebfa212b8fc...,www.npr.org
8962,The great guinea pig giveaway has begun,0.0,0.596494,0.0,0.619077,True,2022-10-01 14:34:48,"[The New York Times, This Story Was Originally...",bfd6bde9f0fd1791c3f5eecdc17e66b8cf6486e1871d2f...,www.seattletimes.com
8874,The great guinea pig giveaway has begun,0.0,0.593773,0.0,0.619077,True,2022-10-01 14:34:48,"[The New York Times, This Story Was Originally...",a693010b13493c1111c9d65422ebbfc7224516e53e01c3...,www.seattletimes.com
7314,Rep. Mike Turner: Expect 'Wild West' Spending ...,0.0,0.547273,0.0,0.60955,True,2022-10-02 16:28:05,[Sandy Fitzgerald],d127224cdbc230e62f9d2cdd386a2c96a74c012349a19f...,www.newsmax.com
1240,Brazil Holds Historic Election With Lula Again...,2.0,0.608716,0.0,0.608409,True,2022-10-02 12:24:00,"[Diane Jeantet, Mauricio Savarese]",a8f424d1dfa68bb57be6974d328e7a8b3287411aacd0fb...,time.com
12059,Streaming is king: The state of TV with Kelly ...,0.0,0.587742,0.0,0.607227,True,2022-10-02 12:35:26,[],74ae04bb355925c1cf2f0b25454eb2040bbf81042b73e7...,www.usatoday.com
129,Liz Truss admits mistakes on controversial tax...,2.0,0.514537,0.0,0.606543,True,2022-10-02 11:34:32,"[Mia Alberti Kara Fox, Mia Alberti, Kara Fox]",7362d17e6d97bbf60a97669a4fe0ff758e2a9f1ff9cc68...,edition.cnn.com
5392,Liz Truss admits mistakes on controversial tax...,2.0,0.522887,0.0,0.606543,True,2022-10-02 11:34:32,"[Mia Alberti Kara Fox, Mia Alberti, Kara Fox]",325416b660cc506abf136d9db7e4a2a8ecdef9b03b6c1c...,www.cnn.com


In [265]:
ccInflationArticles[ccInflationArticles['cosLabel']==ccInflationArticles['gruLabel']].head(50)

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName
45,"Health care bills rising, here’s why",0.0,0.71534,0.0,0.454654,True,2022-10-01 13:00:00,[],66d0838df0062b2b323c6f556559842759d513ec092078...,chicago.suntimes.com
135,Liz Truss faces her party faithful after a dis...,0.0,0.636608,0.0,0.547662,True,2022-10-01 04:24:42,[Luke Mcgee],bb36469ea8d3be0e5cd8bcb429d4b100a33459f9e8d8c0...,edition.cnn.com
150,30-Year Mortgage Rate Spike In 2022 - Animated...,0.0,0.698408,0.0,0.347478,True,2022-10-01 05:42:14,[Julian Hebron],030b53a2e299183de8acd2e7ed3b7fdbbb6f343bd260d5...,finance.yahoo.com
163,Bonduelle - 21-22 Annual Results: Business gro...,0.0,0.395049,0.0,0.573444,True,2022-10-03 05:00:00,[],0c96763546cde34f07629a618db457d38155662deeb6dc...,finance.yahoo.com
167,"Taichung Commercial Bank Co., Ltd. (""TCB""), Ta...",0.0,0.319875,0.0,0.5588,True,2022-10-01 03:18:00,[],11f8c0b2e4cdaf887ff8c109b4efa38433aaff8b9faff1...,finance.yahoo.com
181,Ontario Lottery and Gaming Corporation - EVENI...,0.0,0.730802,0.0,0.347478,True,2022-10-03 03:56:00,[],23aec37c234df828c65347eb9cbdbd656fde5bf4f958d9...,finance.yahoo.com
189,"Global food insecurity driven by war, climate,...",0.0,0.534618,0.0,0.487759,True,2022-10-01 12:25:06,[],2b9c77b89ed8e7a0f13f7fa0e2b8ae21c7a4621938e2c9...,finance.yahoo.com
231,Charting for investing success in Q4 2022,0.0,0.785759,0.0,0.347478,True,2022-10-03 11:00:41,[],549bdec731366bc957a224a5d5bb3e57adc8138a4965f1...,finance.yahoo.com
240,Wishpond Provides Corporate Update on the Comp...,0.0,0.52427,0.0,0.525907,True,2022-10-03 11:30:00,[],57e247f893ea8149bfb317379d809f4d10e25844ce75e5...,finance.yahoo.com
302,Cryptocurrency Investment Multiplatform from I...,0.0,0.540642,0.0,0.54872,True,2022-10-02 06:00:00,[Intelfin Global Group Ltd],84c200bd68a12d9c76d70a98fb6f3b090ddaf94934ca2f...,finance.yahoo.com


In [266]:
ccInflationArticles[(ccInflationArticles['cosLabel']==1)&(ccInflationArticles['gruLabel']==1)].head(50)

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName


In [268]:
set(ccInflationArticles[ccInflationArticles['cosLabel']==ccInflationArticles['gruLabel']].iloc[:,1])

{0.0}

In [245]:
# Review articles classified by cosine similarity
print(ccInflationArticles.loc[4922][0],'\n',
ccInflationArticles.loc[8254][0],'\n',
ccInflationArticles.loc[3233][0],'\n',
ccInflationArticles.loc[6635][0],'\n',
ccInflationArticles.loc[3698][0],'\n',
ccInflationArticles.loc[7265][0]     )

Analysis: First Colorado governor's debate focuses on crime & inflation, raises fashion issue 
 Malaysia's commodities minister says crude palm oil price weakness is temporary 
 Mini-budget is supported by Welsh Secretary Robert Buckland 
 Bond funds recover appeal after painful falls 
 Economy Looks Worse than Fed Would Like, Markets Will Struggle until Inflation's Controlled 
 Investors Expect No Peace in Stocks Until Bond Gyrations Subside


In [246]:
# Review articles classified by gru rnn
print(ccInflationArticles.loc[5350][0],'\n',
ccInflationArticles.loc[7440][0],'\n',
ccInflationArticles.loc[8962][0],'\n',
ccInflationArticles.loc[5312][0],'\n',
ccInflationArticles.loc[6134][0],'\n',
ccInflationArticles.loc[5830][0]     )

Street investors' volatility rodeo 
 The hidden faces of hunger in America 
 The great guinea pig giveaway has begun 
 Top Wall Street analysts see Alphabet as a buying opportunity 
 STOCK MARKET NEWS: Stocks mixed, oil, gas, cryptocurrency higher, diesel lower 
 My 2023 Forecast For High-Yield CEFs


In [269]:
classifiedHighScore = ccInflationArticles.loc[[4922,8254,3233,6635,3698,7265,5350,7440,8962,5312,6134,5830,
                                               45,189,327,417,423,467,3414]]
classifiedHighScore

Unnamed: 0,Title,cosLabel,cosSimilarity,gruLabel,gruProb,inflationKW,publishDate,byline,fileName,folderName
4922,Analysis: First Colorado governor's debate foc...,2.0,0.845117,0.0,0.347478,True,2022-10-01 00:43:00,[],d38ca35182a9a247866b73cb5cd0ac3c1c1e25b1c7ed60...,www.cbsnews.com
8254,Malaysia's commodities minister says crude pal...,1.0,0.818895,0.0,0.347478,True,2022-10-03 03:58:22,[],cd660f6706459e8dae9d12701967068e84f238a5670693...,www.reuters.com
3233,Mini-budget is supported by Welsh Secretary Ro...,2.0,0.815083,0.0,0.347478,True,2022-10-02 12:20:04,[],d49578870d079a83125abb4da8a4841e7327183ee88224...,www.bbc.com
6635,Bond funds recover appeal after painful falls,0.0,0.799856,1.0,0.430732,True,2022-10-03 04:00:55,[Chris Flood],e3c53772f52ea8d4460e47c28e50a5357e51acec0426fa...,www.ft.com
3698,"Economy Looks Worse than Fed Would Like, Marke...",1.0,0.791364,0.0,0.347478,True,2022-10-01 15:44:30,[Ian Hanchett],51758f45fac92dcc612eb4a059e1e1adcf38eadcef43e6...,www.breitbart.com
7265,Investors Expect No Peace in Stocks Until Bond...,2.0,0.779404,0.0,0.487817,True,2022-10-02 09:35:46,[David Randall],9ea26685ce90e7a800fab40bcdccf962f6dee4c772c77f...,www.newsmax.com
5350,Street investors' volatility rodeo,0.0,0.726431,0.0,0.637705,True,2022-10-02 12:00:01,[Yun Li],d2afe7ac34e53225da535caf174bc2f8020f750e64613a...,www.cnbc.com
7440,The hidden faces of hunger in America,0.0,0.589257,0.0,0.630872,True,2022-10-02 10:20:07,[Olivia Hampton],573b240a0ae597d80ceb2ced59d53701f72ebfa212b8fc...,www.npr.org
8962,The great guinea pig giveaway has begun,0.0,0.596494,0.0,0.619077,True,2022-10-01 14:34:48,"[The New York Times, This Story Was Originally...",bfd6bde9f0fd1791c3f5eecdc17e66b8cf6486e1871d2f...,www.seattletimes.com
5312,Top Wall Street analysts see Alphabet as a buy...,0.0,0.573946,0.0,0.605792,True,2022-10-02 13:04:56,[Tipranks.Com Staff],2fc72280058e54a76cc7ddc2fe37d8012fb6bd7763a338...,www.cnbc.com


In [270]:
# To get the date range from the crawled articles
basePath = r"C:\Users\rzamb\Desktop\Desktop\UMD\641_Natural_Language_Processing\finalProject\cc_download_articles"
labels = {0:'Expect Inflation',1:'Inflation will fade away',2:'Neutral'}

for i in range(len(classifiedHighScore)):
    currFolder = os.path.join(basePath,classifiedHighScore.iloc[i,9])
    
    # Sets the full path of the current json file
    fullPath = os.path.join(currFolder,classifiedHighScore.iloc[i,8])
    
    # Opening JSON file
    f = open(fullPath, encoding="utf8")
    
    # returns JSON object as a dictionary
    data = json.load(f)
    
    print('|','#'*38,'|')
    print(f'| Cosine Label: {labels.get(classifiedHighScore.iloc[i,1]):<25}|')
    print(f'| GRU Label   : {labels.get(classifiedHighScore.iloc[i,3]):<25}|')
    print('|','#'*38,'|')
    print(f'| Article Body:',' '*24,'|')
    print('|','#'*38,'|')
    print(classifiedHighScore.iloc[i,0])
    print(data['maintext'])
    print('|','#'*38,'|')
    print('\n')
    
    # Closing file
    f.close()

| ###################################### |
| Cosine Label: Neutral                  |
| GRU Label   : Expect Inflation         |
| ###################################### |
| Article Body:                          |
| ###################################### |
Analysis: First Colorado governor's debate focuses on crime & inflation, raises fashion issue
Analysis: First Colorado governor's debate focuses on crime & inflation, raises fashion issue CBS News Colorado Political Specialist Shaun Boyd debuted her new political show, "Left, Right, Center" with a discussion about the first gubernatorial debate for the November election.
| ###################################### |


| ###################################### |
| Cosine Label: Inflation will fade away |
| GRU Label   : Expect Inflation         |
| ###################################### |
| Article Body:                          |
| ###################################### |
Malaysia's commodities minister says crude palm oil price weaknes