# Bing News API Real Time Data Collection

In [1]:
import json
import os
import requests
import time
from datetime import datetime

In [2]:
#pulls 10 articles from each of our 4 news categories from the bing news api
subscriptionKey = os.environ['BING_SEARCH_V7_SUBSCRIPTION_KEY']
endpoint = "https://api.bing.microsoft.com/v7.0/news"

# Construct a request
query = ""
categories = ["World", "Business", "Sports", "Science"]
count = 100
freshness = "Day"
mkt = 'en-US'
data_file_path = os.path.abspath(os.path.join(os.pardir,'data','bing_api_json'))

for category in categories:
    params = {'q': query, 'mkt': mkt, 'category': category, 'count': count, 'freshness': freshness}
    headers = {'Ocp-Apim-Subscription-Key': subscriptionKey}
    file_name = datetime.today().strftime('%Y%m%d') + "_" + category
    
    # Call the API
    try:
        response = requests.get(endpoint, headers=headers, params=params)
        response.raise_for_status()
        #write json string to file
        with open(os.path.join(data_file_path,file_name+'.json'), 'w') as json_file:
          json.dump(response.json(), json_file)
    except Exception as ex:
        raise ex

    time.sleep(1)
    #free account offers only 3 requests per second

# Real Time Data Cleaning

In [3]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import string
from num2word import word
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
import os

In [4]:
data_file_path = os.path.abspath(os.path.join(os.pardir,'data','bing_api_json',''))
mac = '/'
# windows = '\\'

#dict of json files from bing news api
dict = {'sports':[datetime.today().strftime('%Y%m%d')+'_Sports.json'],
        'world':[datetime.today().strftime('%Y%m%d')+'_World.json'],
        'business':[datetime.today().strftime('%Y%m%d')+'_Business.json'],
        'science_and_technology':[datetime.today().strftime('%Y%m%d')+'_Science.json']}

df_list = []

#iterates through each json file and stores as a dataframe in a list
for k, v in dict.items():
    for i in v:
        init_df = pd.read_json(data_file_path+mac+i)
        df = json_normalize(init_df['value'])
        df = df[['name','description']]
        df.insert(0, 'News Category', k)
        df_list.append(df)
        df.head()

#concatenates list of dataframes into one dataframe
data = pd.concat(df_list,axis=0)
#renames column title to match other data
data = data.rename(columns={'name':'Title', 'description': 'Description'})
data.shape
data.head()

#Below is Fengling's code unedited except adding comments
#Remove Punctuation and Stopwords
data['Title'] = data['Title'].str.translate(str.maketrans('','',string.punctuation)).str.lower()
data['Description'] = data['Description'].str.translate(str.maketrans('','',string.punctuation)).str.lower()


def convert_num_to_word(words):
    result = []
    for w in words:
        if w.isnumeric():
            result.extend(map(lambda x: x.lower(),word(w).split()))
        else:
            result.append(w)
    return result

data['Title'] = data['Title'].str.split().apply(convert_num_to_word)
data['Description'] = data['Description'].str.split().apply(convert_num_to_word)


def remove_stopword(words):
    result = []
    for word in words:
        if word not in STOPWORDS:
            result.append(word)
    return result

data['Title'] = data['Title'].apply(remove_stopword)
data['Description'] = data['Description'].apply(remove_stopword)


def remove_single_character(words):
    result = []
    for word in words:
        if len(word) > 1:
            result.append(word)
    return result

data['Title'] = data['Title'].apply(remove_single_character)
data['Description'] = data['Description'].apply(remove_single_character)



#Lemmatization
#this groups words based on their lemma ex: walk v walked v walking

def lemmatization(words):
    lemmatizer = WordNetLemmatizer()
    result = []
    for word in words:
        result.append(lemmatizer.lemmatize(word))
    return result

data['Title'] = data['Title'].apply(lemmatization)
data['Description'] = data['Description'].apply(lemmatization)

data.head()

Unnamed: 0,News Category,Title,Description
0,sports,"[dana, white, doesnt, care, jon, jones, fight,...","[ufc, president, dana, white, continues, stand..."
1,sports,"[tennessee, titan, offseason, roster, among, o...","[tennessee, titan, offseason, roster, will, se..."
2,sports,"[giant, open, offense, rookie, receiver, kadar...","[east, rutherford, new, jersey, usa, new, york..."
3,sports,"[grizzly, v, warrior, betting, pick, predictio...","[sunday, golden, state, warrior, memphis, griz..."
4,sports,"[chris, webber, elected, naismith, memorial, h...","[saturday, first, wolverine, enshrined, spring..."


# Generating Textual Features

In [5]:
import pandas as pd
import re
import collections
import nltk
nltk.download()
from textblob import TextBlob
import os

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [6]:
data['Documents'] = data['Title'] + data['Description']
data.drop(['Title','Description'],axis=1,inplace=True)
data['Documents'] = data['Documents'].apply(lambda x: ' '.join(x))
data['Documents'] = data['Documents'].apply(lambda x: x.replace("\'","").replace(',','').replace(']','').replace('[',''))
data['Word Count'] = data['Documents'].apply(lambda x: len(re.findall(r'\w+', x)))

data['Noun Phrases'] = data['Documents'].apply(lambda x: len(TextBlob(x).noun_phrases))
data['Tags'] = data['Documents'].apply(lambda t: collections.Counter(tag for word,tag in TextBlob(t).tags))

# pos tag list https://pythonprogramming.net/part-of-speech-tagging-nltk-tutorial/

data['Noun Count'] = data['Tags'].apply(lambda d: d.get('NN',0)+d.get('NNS',0)+d.get('NNP',0)+d.get('NNPS',0))
data['Adjective Count'] = data['Tags'].apply(lambda d: d.get('JJ',0)+d.get('JJR',0)+d.get('JJS',0))
data['Verb Count'] = data['Tags'].apply(lambda d: d.get('VB',0)+d.get('VBD',0)+d.get('VBG',0)+d.get('VBN',0)+d.get('VBP',0)+d.get('VBZ',0))
data.drop(['Tags'],axis=1,inplace=True)

data.head()

Unnamed: 0,News Category,Documents,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count
0,sports,dana white doesnt care jon jones fight ufc tha...,30,4,13,11,5
1,sports,tennessee titan offseason roster among oldest ...,30,4,14,5,5
2,sports,giant open offense rookie receiver kadarius to...,28,5,18,7,1
3,sports,grizzly v warrior betting pick prediction poin...,36,7,18,8,7
4,sports,chris webber elected naismith memorial hall fa...,25,4,12,8,3


# Word2vec Feature Extraction

In [7]:
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from time import time 
import multiprocessing
import logging  # logger
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import warnings
warnings.filterwarnings('ignore')

In [8]:
cores = multiprocessing.cpu_count()

# initializing word2vec model
#shallow neural network model
model = Word2Vec(min_count=20,
                     window=2, # window size for context 
                     vector_size=100,  # no of features 
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

INFO - 11:42:20: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.03)', 'datetime': '2021-05-16T11:42:20.799873', 'gensim': '4.0.1', 'python': '3.7.5 (default, Oct 25 2019, 10:52:18) \n[Clang 4.0.1 (tags/RELEASE_401/final)]', 'platform': 'Darwin-19.6.0-x86_64-i386-64bit', 'event': 'created'}


In [9]:
t = time()

# build vocabulary  and learns word associations in text
model.build_vocab(data['Documents'], progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 11:42:20: collecting all words and their counts
INFO - 11:42:20: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:42:20: collected 36 word types from a corpus of 9281 raw words and 48 sentences
INFO - 11:42:20: Creating a fresh vocabulary
INFO - 11:42:20: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 25 unique words (69.44444444444444%% of original 36, drops 11)', 'datetime': '2021-05-16T11:42:20.838528', 'gensim': '4.0.1', 'python': '3.7.5 (default, Oct 25 2019, 10:52:18) \n[Clang 4.0.1 (tags/RELEASE_401/final)]', 'platform': 'Darwin-19.6.0-x86_64-i386-64bit', 'event': 'prepare_vocab'}
INFO - 11:42:20: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 9213 word corpus (99.26732033186079%% of original 9281, drops 68)', 'datetime': '2021-05-16T11:42:20.839077', 'gensim': '4.0.1', 'python': '3.7.5 (default, Oct 25 2019, 10:52:18) \n[Clang 4.0.1 (tags/RELEASE_401/final)]', 'platform': 'Darwin-19.6.0-x86_64-i386-64bit', 'ev

Time to build vocab: 0.0 mins


In [10]:
t = time()

#train word2vec model 
model.train(data['Documents'], total_examples=model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 11:42:20: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 25 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2', 'datetime': '2021-05-16T11:42:20.851930', 'gensim': '4.0.1', 'python': '3.7.5 (default, Oct 25 2019, 10:52:18) \n[Clang 4.0.1 (tags/RELEASE_401/final)]', 'platform': 'Darwin-19.6.0-x86_64-i386-64bit', 'event': 'train'}
INFO - 11:42:20: worker thread finished; awaiting finish of 6 more threads
INFO - 11:42:20: worker thread finished; awaiting finish of 5 more threads
INFO - 11:42:20: worker thread finished; awaiting finish of 4 more threads
INFO - 11:42:20: worker thread finished; awaiting finish of 3 more threads
INFO - 11:42:20: worker thread finished; awaiting finish of 2 more threads
INFO - 11:42:20: worker thread finished; awaiting finish of 1 more threads
INFO - 11:42:20: worker thread finished; awaiting finish of 0 more threads
INFO - 11:42:20: EPOCH - 1 : training on 9281 raw words (313 effective words) took 0.0s,

INFO - 11:42:20: worker thread finished; awaiting finish of 4 more threads
INFO - 11:42:20: worker thread finished; awaiting finish of 3 more threads
INFO - 11:42:20: worker thread finished; awaiting finish of 2 more threads
INFO - 11:42:20: worker thread finished; awaiting finish of 1 more threads
INFO - 11:42:20: worker thread finished; awaiting finish of 0 more threads
INFO - 11:42:20: EPOCH - 13 : training on 9281 raw words (343 effective words) took 0.0s, 64242 effective words/s
INFO - 11:42:21: worker thread finished; awaiting finish of 6 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 5 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 4 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 3 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 2 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 1 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 0

INFO - 11:42:21: worker thread finished; awaiting finish of 5 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 4 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 3 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 2 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 1 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 0 more threads
INFO - 11:42:21: EPOCH - 26 : training on 9281 raw words (337 effective words) took 0.0s, 59269 effective words/s
INFO - 11:42:21: worker thread finished; awaiting finish of 6 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 5 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 4 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 3 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 2 more threads
INFO - 11:42:21: worker thread finished; awaiting finish of 1

Time to train the model: 0.01 mins


In [11]:
def make_feature_vec(words, model, num_features):
    """
    Average the word vectors for a set of words
    """
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0.
    index2word_set = set(model.wv.index_to_key)  # words known to the model

    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            feature_vec = np.add(feature_vec,model.wv[word])
    
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec


def get_avg_feature_vecs(words, model, num_features):
    """
    Calculate average feature vectors for all headlines 
    """
    counter = 0
    feature_vecs = np.zeros((len(words),num_features), dtype='float32')  # pre-initialize (for speed)
    
    for word in words:
        feature_vecs[counter] = make_feature_vec(word, model, num_features)
        counter = counter + 1
    return feature_vecs

In [12]:
word2vec = get_avg_feature_vecs(data['Documents'], model, 100)

In [13]:
# remove instances in test set that could not be represented as feature vectors
nan_indices = list({x for x,y in np.argwhere(np.isnan(word2vec))})
if len(nan_indices) > 0:
    print('Removing {:d} instances from test set.'.format(len(nan_indices)))
    word2vec = np.delete(word2vec, nan_indices, axis=0)
    word2vec.drop(data.iloc[nan_indices, :].index, axis=0, inplace=True)
    assert word2vec.shape[0] == len(data)

In [14]:
w2v = pd.DataFrame(word2vec)

data.reset_index(drop=True, inplace=True)
w2v.reset_index(drop=True, inplace=True)
#df = pd.concat([df1, df2], axis=1)
w2v = pd.concat([data[['News Category','Word Count','Noun Phrases','Noun Count',
                                         'Adjective Count','Verb Count']],w2v],axis=1)

w2v.head()

Unnamed: 0,News Category,Word Count,Noun Phrases,Noun Count,Adjective Count,Verb Count,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
0,sports,30,4,13,11,5,-0.077073,0.249286,0.141428,0.232694,...,0.229867,0.186848,0.002071,0.142191,0.508744,0.057262,-0.016012,-0.255578,0.072951,0.111253
1,sports,30,4,14,5,5,-0.077828,0.249213,0.14138,0.235164,...,0.229121,0.186928,0.001202,0.142918,0.509141,0.057069,-0.017085,-0.255837,0.073274,0.11101
2,sports,28,5,18,7,1,-0.077331,0.249805,0.141737,0.234225,...,0.229294,0.186665,0.001828,0.141685,0.50907,0.057436,-0.015737,-0.255995,0.073331,0.110955
3,sports,36,7,18,8,7,-0.077276,0.249513,0.141506,0.234059,...,0.229165,0.18744,0.001823,0.142213,0.509231,0.057501,-0.016305,-0.255746,0.073567,0.110787
4,sports,25,4,12,8,3,-0.077441,0.250204,0.140386,0.233805,...,0.228812,0.187703,0.002199,0.142848,0.509732,0.056591,-0.015792,-0.25574,0.072936,0.111116


# Import Logistic Regression Model

In [15]:
import pickle

In [16]:
Pkl_Filename = "Pickle_RL_Model.pkl"  

with open(Pkl_Filename, 'rb') as file:  
    Pickled_LR_Model = pickle.load(file)

Pickled_LR_Model

LogisticRegression(solver='newton-cg')

# Run LR Model on Real-Time Data with W2V Features

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder

In [18]:
# X dataframe 
X_test = w2v.drop(['News Category'],axis=1) 
# y series
y_test = w2v['News Category']

In [19]:
y_test = y_test.to_numpy()
y_pred = Pickled_LR_Model.predict(X_test)

lr_acc = accuracy_score(y_test,y_pred)
lr_recall = recall_score(y_test,y_pred,average='macro')
lr_precision = precision_score(y_test,y_pred,average='macro')
lr_f1 = f1_score(y_test,y_pred,average='macro')

y_pred_roc = OneHotEncoder().fit(y_test.reshape(-1, 1)).transform(y_pred.reshape(-1,1)).toarray()
y_test_roc = OneHotEncoder().fit_transform(y_test.reshape(-1, 1)).toarray()
lr_roc = roc_auc_score(y_test_roc,y_pred_roc,multi_class='ovo')

print("Logistic Regression: ",lr_acc,lr_recall,lr_precision,lr_f1,lr_roc)

Logistic Regression:  0.25 0.25 0.0625 0.1 0.5
