# News API call

### Using News API to get news articles

In [2]:
from eodhd import APIClient
import requests 
import pandas as pd
import time
from tqdm import tqdm
import os

# The EODHD API token is a credential, so it is read from the environment
# instead of being hardcoded in the notebook (a token stored in a shared
# notebook is readable by everyone with access to the file and should be revoked).
# Set it before starting Jupyter, e.g.:  export EODHD_API_TOKEN=<your token>
api = os.environ.get("EODHD_API_TOKEN", "")
if not api:
    print("WARNING: EODHD_API_TOKEN is not set; requests to the EODHD news API will fail.")

In [3]:
# Load the ticker table

# Folder where the ticker names are stored: the current working directory
# (the working directory itself is not changed here)
Path_tickers = os.getcwd()

# Read the CSV file into a DataFrame. pd.read_csv already returns a DataFrame,
# so no further conversion is needed. The 'Symbol' and 'Name' columns of this
# table are used further down in the notebook.
df_companies = pd.read_csv(os.path.join(Path_tickers, 'TickerNames_2022.csv'))

In [4]:
# Log function
def log_News(logfile, date, title, path=None):
    """Append one ``Timestamp;Date;Title;Path`` record to a ';'-separated log file.

    The file is created, with a header row, the first time it is written to.

    Parameters
    ----------
    logfile : str
        Path of the log file to create or append to.
    date, title : object
        Values stored in the 'Date' and 'Title' columns.
    path : str, optional
        Value stored in the 'Path' column. Defaults to the current working
        directory, resolved at call time.
    """
    import csv  # local import: keeps this cell independent of the notebook's import cell

    if path is None:
        # A `path=os.getcwd()` default would be evaluated once, when the function
        # is defined, and would go stale if the working directory changes later.
        path = os.getcwd()

    # Decide on the header before opening: mode 'a' creates the file if it is missing.
    needs_header = not os.path.isfile(logfile)

    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # Local time

    # A single handle that the context manager always closes. The csv writer quotes
    # fields containing ';', quotes or line breaks so each record stays parseable.
    with open(logfile, 'a', newline='', encoding='utf-8') as log:
        writer = csv.writer(log, delimiter=';', lineterminator='\n')
        if needs_header:
            writer.writerow(['Timestamp', 'Date', 'Title', 'Path'])
        writer.writerow([timestamp, date, title, path])



In [5]:
##### THIS TAKES A LONG TIME TO RUN #####

# Date window handed to the download function
start_date, end_date = '2022-06-01', '2023-06-01'

# Articles requested per stock; 1000 is noted here as the maximum
n_news = 1_000

# Company names and ticker symbols, both taken from the ticker table
company_names = df_companies['Name']
stock_list = df_companies['Symbol']

# Function to retrieve all the news data from the EODHD API
def get_all_news(stocks, start_date, end_date, n_news, api_key, company, offset = 0):
    """Download news for every ticker in `stocks` and return it as one DataFrame.

    Parameters
    ----------
    stocks : iterable of str
        Ticker symbols; one API request is made per ticker.
    start_date, end_date : str
        Sent as the `from` / `to` query parameters.
    n_news : int
        Sent as the `limit` query parameter.
    api_key : str
        Sent as the `api_token` query parameter.
    company : str or list-like of str
        Either a single name used for every ticker, or one name per ticker in
        the same order as `stocks`.
    offset : int, default 0
        Sent as the `offset` query parameter (the same value for every request).

    Returns
    -------
    pandas.DataFrame
        Columns: date, title, symbols, sentiment, ticker, company.

    Raises
    ------
    ValueError
        If `company` is list-like and its length differs from `stocks`.
    requests.HTTPError
        If the API answers with an error status.

    Side effects
    ------------
    Writes 'df_all_news_from_API.csv' (';'-separated), extended after every
    ticker so an interrupted run keeps what was already downloaded, and logs
    every downloaded article to 'news_data_log.csv' via `log_News`.
    """
    logfile = 'news_data_log.csv'
    output_file = 'df_all_news_from_API.csv'
    endpoint = 'https://eodhistoricaldata.com/api/news'
    columns = ['date','title','symbols','sentiment','ticker','company']

    # Pair every ticker with its own company name. Assigning a whole Series of
    # names to a response frame would align on the row index and label the
    # articles with the names of unrelated companies.
    stocks = list(stocks)
    if pd.api.types.is_list_like(company):
        companies = list(company)
        if len(companies) != len(stocks):
            raise ValueError(
                f'company has {len(companies)} entries but stocks has {len(stocks)}'
            )
    else:
        companies = [company] * len(stocks)

    # Start the output file with only the header; each ticker's rows are appended
    # below instead of rewriting the whole accumulated frame on every iteration.
    pd.DataFrame(columns=columns).to_csv(output_file, sep=';', index=False)

    frames = []
    progress = tqdm(zip(stocks, companies), total=len(stocks), desc="Fetching News", unit="stock")
    for stock, company_name in progress:
        # `params` lets requests URL-encode the values
        response = requests.get(
            endpoint,
            params={
                'api_token': api_key,
                's': stock,
                'limit': n_news,
                'offset': offset,
                'from': start_date,
                'to': end_date,
            },
            timeout=60,  # seconds; prevents one stalled request from hanging the run
        )
        # Fail with a clear HTTP error instead of parsing an error payload as news
        response.raise_for_status()
        news_json = response.json()

        # Nothing returned for this ticker
        if not news_json:
            continue

        # reindex guarantees the expected columns even if a field is absent
        df_news = pd.DataFrame.from_dict(news_json).reindex(columns=columns)

        # Label the articles with the ticker and its company name
        df_news['ticker'] = stock
        df_news['company'] = company_name

        # Log each article of this response (scalar date/title per log record)
        for article_date, article_title in zip(df_news['date'], df_news['title']):
            log_News(logfile, article_date, article_title)

        # Checkpoint: append this ticker's rows to the output file
        df_news.to_csv(output_file, sep=';', index=False, header=False, mode='a')
        frames.append(df_news)

    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once at the end instead of growing a frame inside the loop
    return pd.concat(frames, ignore_index=True)

# Run the download for every ticker in stock_list
df_all_news_final = get_all_news(
    stocks=stock_list,
    start_date=start_date,
    end_date=end_date,
    n_news=n_news,
    api_key=api,
    company=company_names,
)

Fetching News:   0%|          | 0/2022 [00:00<?, ?stock/s]

Fetching News:   0%|          | 1/2022 [00:07<4:20:25,  7.73s/stock]


KeyboardInterrupt: 

In [9]:
# Load the saved articles (';'-separated) into df_all_news_final
# NOTE(review): get_all_news writes 'df_all_news_from_API.csv' relative to the working
# directory, while this cell reads it from the 'News API' subfolder — confirm the location.
news_csv_path = f"{os.getcwd()}/News API/df_all_news_from_API.csv"
df_all_news_final = pd.read_csv(news_csv_path, sep=';')

In [10]:
# Work on a copy so the frame loaded from disk stays untouched
df_all_news_copy = df_all_news_final.copy()

# Drop rows with a missing value in any column (dropna already returns a new frame)
df_all_news_copy_na = df_all_news_copy.dropna()

# Drop duplicate news articles: keep the first row per (date, title, ticker)
df_all_news_copy_duplicates = df_all_news_copy_na.drop_duplicates(subset=['date', 'title','ticker'], keep='first')

# Reset the index and remove the 'company' column that came from the CSV;
# it is rebuilt from df_companies below
df_all_news_duplicates = (
    df_all_news_copy_duplicates
    .reset_index(drop=True)
    .drop(columns=['company'])
)

# Symbol -> Name lookup, taking the first Name for each Symbol
symbol_to_name = (
    df_companies
    .drop_duplicates(subset='Symbol', keep='first')
    .set_index('Symbol')['Name']
)

# Fail with a readable message if an article's ticker is missing from df_companies
unknown_tickers = df_all_news_duplicates.loc[
    ~df_all_news_duplicates['ticker'].isin(symbol_to_name.index), 'ticker'
].unique()
if len(unknown_tickers) > 0:
    raise KeyError(f'Tickers not found in df_companies: {list(unknown_tickers)}')

# One lookup per row through the mapping instead of filtering df_companies once per article
df_all_news_duplicates['company'] = df_all_news_duplicates['ticker'].map(symbol_to_name)

In [11]:
# Write the cleaned articles to 'news_API_final.csv', ';'-separated and without the index column
df_all_news_duplicates.to_csv(
    'news_API_final.csv',
    sep=';',
    index=False,
)

In [15]:
# Unique tickers found in df_all_news_final
# NOTE(review): news_API_final.csv is written from df_all_news_duplicates, not from
# this frame — confirm which frame the ticker list should be built from.
unique_tickers = df_all_news_final['ticker'].unique()
tickers_list = unique_tickers.tolist()

# Store the ticker list as a one-column CSV
df_tickers_list = pd.DataFrame(tickers_list)
df_tickers_list.to_csv('TickerNames_1956.csv', index=False)

# Number of distinct tickers left (1956 in the recorded run)
len(tickers_list)

1956

In [13]:
# Count the words, commas and periods in each 'title' to get an idea of the average number of tokens.
# The vectorised .str methods leave missing titles as NaN (a plain x.split() would fail on them),
# and .mean() skips NaN.
title_text = df_all_news_copy['title']
df_all_news_copy['title_words'] = title_text.str.split().str.len()
df_all_news_copy['title_commas'] = title_text.str.count(',')
# str.count takes a regular expression, so the period has to be escaped
df_all_news_copy['title_periods'] = title_text.str.count(r'\.')

# Average number of words, commas and periods per title
average_words_title = df_all_news_copy['title_words'].mean()
average_commas_title = df_all_news_copy['title_commas'].mean()
average_periods_title = df_all_news_copy['title_periods'].mean()

# Print the results
print('Average number of words in title: ', average_words_title)
print('Average number of commas in title: ', average_commas_title)
print('Average number of periods in title: ', average_periods_title)
print('Average number of tokens in title: ', average_words_title + average_commas_title + average_periods_title)

Average number of words in title:  11.526065667404191
Average number of commas in title:  0.39598964138565873
Average number of periods in title:  0.25742376603283923
Average number of tokens in title:  12.17947907482269
