# Bing News API Real Time Data Collection

In [1]:
import json
import os
import requests
import time
from datetime import datetime

In [2]:
#pulls 10 articles from each of our 4 news categories from the bing news api
subscriptionKey = os.environ['BING_SEARCH_V7_SUBSCRIPTION_KEY']
endpoint = "https://api.bing.microsoft.com/v7.0/news"

# Construct a request
query = ""
categories = ["World", "Business", "Sports", "Science"]
count = 100
freshness = "Day"
mkt = 'en-US'
data_file_path = os.path.abspath(os.path.join(os.pardir,'data','bing_api_json'))

for category in categories:
    params = {'q': query, 'mkt': mkt, 'category': category, 'count': count, 'freshness': freshness}
    headers = {'Ocp-Apim-Subscription-Key': subscriptionKey}
    file_name = datetime.today().strftime('%Y%m%d') + "_" + category
    
    # Call the API
    try:
        response = requests.get(endpoint, headers=headers, params=params)
        response.raise_for_status()
        #write json string to file
        with open(os.path.join(data_file_path,file_name+'.json'), 'w') as json_file:
          json.dump(response.json(), json_file)
    except Exception as ex:
        raise ex

    time.sleep(1)
    #free account offers only 3 requests per second

# Real Time Data Cleaning

In [3]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import string
from num2word import word
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
import os

In [4]:
data_file_path = os.path.abspath(os.path.join(os.pardir,'data','bing_api_json',''))
mac = '/'
# windows = '\\'

#dict of json files from bing news api
dict = {'sports':[datetime.today().strftime('%Y%m%d')+'_Sports.json'],
        'world':[datetime.today().strftime('%Y%m%d')+'_World.json'],
        'business':[datetime.today().strftime('%Y%m%d')+'_Business.json'],
        'science_and_technology':[datetime.today().strftime('%Y%m%d')+'_Science.json']}

df_list = []

#iterates through each json file and stores as a dataframe in a list
for k, v in dict.items():
    for i in v:
        init_df = pd.read_json(data_file_path+mac+i)
        df = json_normalize(init_df['value'])
        df = df[['name','description']]
        df.insert(0, 'News Category', k)
        df_list.append(df)
        df.head()

#concatenates list of dataframes into one dataframe
data = pd.concat(df_list,axis=0)
#renames column title to match other data
data = data.rename(columns={'name':'Title', 'description': 'Description'})
data.shape
data.head()

#Below is Fengling's code unedited except adding comments
#Remove Punctuation and Stopwords
data['Title'] = data['Title'].str.translate(str.maketrans('','',string.punctuation)).str.lower()
data['Description'] = data['Description'].str.translate(str.maketrans('','',string.punctuation)).str.lower()


def convert_num_to_word(words):
    result = []
    for w in words:
        if w.isnumeric():
            result.extend(map(lambda x: x.lower(),word(w).split()))
        else:
            result.append(w)
    return result

data['Title'] = data['Title'].str.split().apply(convert_num_to_word)
data['Description'] = data['Description'].str.split().apply(convert_num_to_word)


def remove_stopword(words):
    result = []
    for word in words:
        if word not in STOPWORDS:
            result.append(word)
    return result

data['Title'] = data['Title'].apply(remove_stopword)
data['Description'] = data['Description'].apply(remove_stopword)


def remove_single_character(words):
    result = []
    for word in words:
        if len(word) > 1:
            result.append(word)
    return result

data['Title'] = data['Title'].apply(remove_single_character)
data['Description'] = data['Description'].apply(remove_single_character)



#Lemmatization
#this groups words based on their lemma ex: walk v walked v walking

def lemmatization(words):
    lemmatizer = WordNetLemmatizer()
    result = []
    for word in words:
        result.append(lemmatizer.lemmatize(word))
    return result

data['Title'] = data['Title'].apply(lemmatization)
data['Description'] = data['Description'].apply(lemmatization)

data.head()

Unnamed: 0,News Category,Title,Description
0,sports,"[wizard, complete, turnaround, rout, pacer, cl...","[six, week, falling, one, thousand, seven, hun..."
1,sports,"[falcon, rookie, kyle, pitt, invited, tight, e...","[falcon, rookie, tight, end, kyle, pitt, alrea..."
2,sports,"[albert, pujols, go, deep, first, home, run, d...","[solid, start, dodger, blue, 10time, allstar, ..."
3,sports,"[felipe, vazquez, former, pittsburgh, pirate, ...","[former, pittsburgh, pirate, pitcher, felipe, ..."
4,sports,"[epc, softball, champion, finally, it’s, freedom]","[thursday, night, freedom, played, first, soft..."


# Generating Textual Features

In [5]:
import pandas as pd
import re
import collections
import nltk
nltk.download()
from textblob import TextBlob
import os

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [6]:
data['Documents'] = data['Title'] + data['Description']
data.drop(['Title','Description'],axis=1,inplace=True)
data['Documents'] = data['Documents'].apply(lambda x: ' '.join(x))
data['Documents'] = data['Documents'].apply(lambda x: x.replace("\'","").replace(',','').replace(']','').replace('[',''))
data['Word Count'] = data['Documents'].apply(lambda x: len(re.findall(r'\w+', x)))

data['Noun Phrases'] = data['Documents'].apply(lambda x: len(TextBlob(x).noun_phrases))
data['Tags'] = data['Documents'].apply(lambda t: collections.Counter(tag for word,tag in TextBlob(t).tags))

# pos tag list https://pythonprogramming.net/part-of-speech-tagging-nltk-tutorial/

data['Noun Count'] = data['Tags'].apply(lambda d: d.get('NN',0)+d.get('NNS',0)+d.get('NNP',0)+d.get('NNPS',0))
data['Adjective Count'] = data['Tags'].apply(lambda d: d.get('JJ',0)+d.get('JJR',0)+d.get('JJS',0))
data['Verb Count'] = data['Tags'].apply(lambda d: d.get('VB',0)+d.get('VBD',0)+d.get('VBG',0)+d.get('VBN',0)+d.get('VBP',0)+d.get('VBZ',0))
data.drop(['Tags'],axis=1,inplace=True)

In [7]:
data['Documents'] = data['Documents'].str.split(" ")
data.head()

Unnamed: 0,News Category,Documents,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count
0,sports,"[wizard, complete, turnaround, rout, pacer, cl...",28,3,12,5,3
1,sports,"[falcon, rookie, kyle, pitt, invited, tight, e...",30,7,19,5,4
2,sports,"[albert, pujols, go, deep, first, home, run, d...",24,5,12,4,5
3,sports,"[felipe, vazquez, former, pittsburgh, pirate, ...",27,6,14,7,4
4,sports,"[epc, softball, champion, finally, it’s, freed...",29,5,12,5,7


# Word2vec Feature Extraction

In [8]:
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from time import time 
import multiprocessing
import logging
#logger helped identify issues and troubleshoot errors
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import warnings
warnings.filterwarnings('ignore')

In [9]:
cores = multiprocessing.cpu_count()

# initializing word2vec model
#shallow neural network model
model = Word2Vec(min_count=2, #reduced min count for real time data
                     window=2, # window size for context 
                     vector_size=100,  # no of features 
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

INFO - 22:21:39: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.03)', 'datetime': '2021-05-20T22:21:39.165692', 'gensim': '4.0.1', 'python': '3.7.5 (default, Oct 25 2019, 10:52:18) \n[Clang 4.0.1 (tags/RELEASE_401/final)]', 'platform': 'Darwin-19.6.0-x86_64-i386-64bit', 'event': 'created'}


In [10]:
t = time()

# build vocabulary  and learns word associations in text
model.build_vocab(data['Documents'], progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 22:21:39: collecting all words and their counts
INFO - 22:21:39: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 22:21:39: collected 864 word types from a corpus of 1320 raw words and 48 sentences
INFO - 22:21:39: Creating a fresh vocabulary
INFO - 22:21:39: Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 260 unique words (30.09259259259259%% of original 864, drops 604)', 'datetime': '2021-05-20T22:21:39.200238', 'gensim': '4.0.1', 'python': '3.7.5 (default, Oct 25 2019, 10:52:18) \n[Clang 4.0.1 (tags/RELEASE_401/final)]', 'platform': 'Darwin-19.6.0-x86_64-i386-64bit', 'event': 'prepare_vocab'}
INFO - 22:21:39: Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 716 word corpus (54.24242424242424%% of original 1320, drops 604)', 'datetime': '2021-05-20T22:21:39.200899', 'gensim': '4.0.1', 'python': '3.7.5 (default, Oct 25 2019, 10:52:18) \n[Clang 4.0.1 (tags/RELEASE_401/final)]', 'platform': 'Darwin-19.6.0-x86_64-i386-64bit', '

Time to build vocab: 0.0 mins


In [11]:
t = time()

#train word2vec model 
model.train(data['Documents'], total_examples=model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 22:21:39: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 260 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2', 'datetime': '2021-05-20T22:21:39.218344', 'gensim': '4.0.1', 'python': '3.7.5 (default, Oct 25 2019, 10:52:18) \n[Clang 4.0.1 (tags/RELEASE_401/final)]', 'platform': 'Darwin-19.6.0-x86_64-i386-64bit', 'event': 'train'}
INFO - 22:21:39: worker thread finished; awaiting finish of 6 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 5 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 4 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 3 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 2 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 1 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 0 more threads
INFO - 22:21:39: EPOCH - 1 : training on 1320 raw words (92 effective words) took 0.0s,

INFO - 22:21:39: worker thread finished; awaiting finish of 3 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 2 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 1 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 0 more threads
INFO - 22:21:39: EPOCH - 13 : training on 1320 raw words (108 effective words) took 0.0s, 16434 effective words/s
INFO - 22:21:39: worker thread finished; awaiting finish of 6 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 5 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 4 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 3 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 2 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 1 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 0 more threads
INFO - 22:21:39: EPOCH - 14 : training on 1320 raw words (93 

INFO - 22:21:39: worker thread finished; awaiting finish of 4 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 3 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 2 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 1 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 0 more threads
INFO - 22:21:39: EPOCH - 26 : training on 1320 raw words (104 effective words) took 0.0s, 13242 effective words/s
INFO - 22:21:39: worker thread finished; awaiting finish of 6 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 5 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 4 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 3 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 2 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 1 more threads
INFO - 22:21:39: worker thread finished; awaiting finish of 0

Time to train the model: 0.01 mins


In [12]:
def make_feature_vec(words, model, num_features):
    """
    Average the word vectors for a set of words
    """
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0.
    index2word_set = set(model.wv.index_to_key)  # words known to the model

    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            feature_vec = np.add(feature_vec,model.wv[word])
    
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec


def get_avg_feature_vecs(words, model, num_features):
    """
    Calculate average feature vectors for all headlines 
    """
    counter = 0
    feature_vecs = np.zeros((len(words),num_features), dtype='float32')  # pre-initialize (for speed)
    
    for word in words:
        feature_vecs[counter] = make_feature_vec(word, model, num_features)
        counter = counter + 1
    return feature_vecs

In [13]:
#creates the features by calculating average of the word vectors
word2vec = get_avg_feature_vecs(data['Documents'], model, 100)

In [14]:
w2v = pd.DataFrame(word2vec)
data.reset_index(drop=True, inplace=True)
w2v.reset_index(drop=True, inplace=True)
#df = pd.concat([df1, df2], axis=1)
w2v = pd.concat([data[['News Category','Word Count','Noun Phrases','Noun Count',
                                         'Adjective Count','Verb Count']],w2v],axis=1)

# remove instances in test set that could not be represented as feature vectors
w2v.dropna(inplace=True)
w2v.head()

Unnamed: 0,News Category,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
0,sports,28,3,12,5,3,-0.0009297017,-0.001585,-0.000862,0.002884,...,0.003431,0.004484,0.000868,0.000514,0.003661,0.001792,0.002909,0.000749,0.002014,0.002302
1,sports,30,7,19,5,4,-0.000227105,-0.0012,-0.001793,0.001442,...,0.005584,0.002658,-0.000201,0.001161,0.00484,0.005009,0.001653,-0.006047,0.001283,0.001741
2,sports,24,5,12,4,5,-0.002129143,0.001486,-0.000751,-0.00034,...,0.003299,0.002329,0.000945,-0.002247,0.002309,0.000604,0.003786,-0.003625,0.000372,0.000647
3,sports,27,6,14,7,4,0.001183281,0.001479,0.000521,-0.001686,...,0.003005,0.001856,0.001275,0.000857,0.003049,0.002238,3.5e-05,-0.001867,0.000474,0.003454
4,sports,29,5,12,5,7,2.657529e-07,0.002532,0.002676,0.003965,...,0.002085,0.00096,0.00143,0.000224,0.006805,0.002485,0.000762,0.000269,0.001942,0.000379


# Import Logistic Regression Model

In [15]:
import pickle

In [16]:
Pkl_Filename = "Pickle_RL_Model.pkl"  

with open(Pkl_Filename, 'rb') as file:  
    Pickled_LR_Model = pickle.load(file)

Pickled_LR_Model

LogisticRegression(solver='newton-cg')

# Run LR Model on Real-Time Data with W2V Features

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder

In [18]:
# X dataframe 
X_test = w2v.drop(['News Category'],axis=1) 
# y series
y_test = w2v['News Category']

In [19]:
y_test = y_test.to_numpy()
y_pred = Pickled_LR_Model.predict(X_test)

lr_acc = accuracy_score(y_test,y_pred)
lr_recall = recall_score(y_test,y_pred,average='macro')
lr_precision = precision_score(y_test,y_pred,average='macro')
lr_f1 = f1_score(y_test,y_pred,average='macro')

y_pred_roc = OneHotEncoder().fit(y_test.reshape(-1, 1)).transform(y_pred.reshape(-1,1)).toarray()
y_test_roc = OneHotEncoder().fit_transform(y_test.reshape(-1, 1)).toarray()
lr_roc = roc_auc_score(y_test_roc,y_pred_roc,multi_class='ovo')

print("Logistic Regression: ",lr_acc,lr_recall,lr_precision,lr_f1,lr_roc)

Logistic Regression:  0.25 0.25 0.0625 0.1 0.5
