In [1]:
from eodhd import APIClient
import requests 
import pandas as pd
import time
from tqdm import tqdm
import os

# API token for the EODHD news API. Prefer supplying it through the
# EODHD_API_TOKEN environment variable so the secret does not have to live in
# the notebook; the literal is only a fallback that keeps existing runs working.
# FIXME: this token is stored in plain text - rotate it and remove the fallback.
api = os.environ.get("EODHD_API_TOKEN", "64d77f6d3a60a5.24835840")

In [4]:
# Log function
def log_News(logfile,date,title,path=None):
    """Append one ';'-separated entry to a CSV log, creating the file with a header if needed.

    Parameters
    ----------
    logfile : str
        Path of the log file. If it does not exist yet, it is created and a
        header row is written before the entry.
    date, title : object
        Values for the ``Date`` and ``Title`` columns; they are rendered with
        their string form through an f-string.
    path : str, optional
        Value for the ``Path`` column. Defaults to the working directory at the
        moment of the call.

    Notes
    -----
    Fields are joined with ';' without quoting, so a value that contains ';'
    or a line break will distort the column layout of its row.
    """
    # Resolve the default per call: os.getcwd() as a signature default would be
    # evaluated once, when the function is defined, and go stale after os.chdir().
    if path is None:
        path = os.getcwd()

    # Decide about the header before opening, because opening creates the file.
    is_new_file = not os.path.isfile(logfile)

    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #Local time

    # A single handle serves header and entry; the with-block guarantees it is closed.
    with open(logfile,'a') as log:
        if is_new_file:
            # The header names every field that is written per entry below.
            header = ['Timestamp','Date','Title','Path']
            log.write(';'.join(header) + "\n") #Make the headers and jump to new line
        log.write(f'{timestamp};{date};{title};{path}' + "\n") #Append the information and jump to new line



In [5]:
# Change the current working directory to the data-collection folder; the
# relative file names used in this notebook are resolved against it.
# NOTE(review): absolute, machine-specific path - adapt it before running elsewhere.
os.chdir('C:/Users/Soren/Documents/GitHub/IntroSocial23/Exam final/Data collection')

# Get the current working directory
current_directory = os.getcwd()

# Construct the path to the CSV file
csv_file_path = os.path.join(current_directory,'TickerNames_final.csv')

# Read the CSV file. pd.read_csv already returns a DataFrame, so no further
# conversion is needed.
df_companies = pd.read_csv(csv_file_path)

In [6]:
# Show the ticker table as plain text (pandas abbreviates long frames when printing)
print(df_companies)

     Symbol                                               Name
0      AAPL                                          Apple Inc
1      MSFT                 Microsoft Corporation Common Stock
2      GOOG                                       Alphabet Inc
3     GOOGL                                       Alphabet Inc
4      AMZN                                             Amazon
...     ...                                                ...
2017   ICCH                                   ICC Holdings Inc
2018   ESOA  Energy Services of America Corporation Common ...
2019   CURI                                CuriosityStream Inc
2020   NSTS                                   NSTS Bancorp Inc
2021   STIM                                    Neuronetics Inc

[2022 rows x 2 columns]


In [7]:
# Parameters for the news download

# Date range handed to the download function as start and end date
start_date, end_date = '2022-06-01', '2023-06-01'

# Number of articles requested per stock
n_news = 1000

# Tickers to query, taken from the ticker table
# (for a quick trial run a short list such as ['AAPL','TSLA'] can be used instead)
stock_list = df_companies['Symbol']

# Company names from the same table, in the same row order as the tickers
# (matching trial value: ['Apple', 'Tesla'])
company_names = df_companies['Name']



def get_all_news(stocks, start_date, end_date, n_news, api_key, company, offset = 0):
    """Fetch news for every ticker in ``stocks`` and return all articles in one DataFrame.

    One request per ticker is sent to the eodhistoricaldata.com news endpoint.
    The articles of each response are tagged with the ticker and its company
    name and appended to 'df_all_news_final.csv' (';'-separated), so the file
    always holds everything fetched so far. One summary line per ticker is
    written to 'log_News.csv' through ``log_News``.

    Parameters
    ----------
    stocks : iterable of str
        Ticker symbols to query.
    start_date, end_date : str
        Sent as the ``from`` / ``to`` query parameters.
    n_news : int
        Sent as the ``limit`` query parameter.
    api_key : str
        Sent as the ``api_token`` query parameter.
    company : str or iterable of str
        Either one name that is applied to every ticker, or one name per
        ticker in the same order as ``stocks``.
    offset : int, default 0
        Sent as the ``offset`` query parameter.

    Returns
    -------
    pandas.DataFrame
        Columns 'date', 'title', 'symbols', 'sentiment', 'ticker', 'company';
        columns absent from a response are filled with missing values.

    Raises
    ------
    ValueError
        If ``company`` is an iterable whose length differs from ``stocks``.
    requests.HTTPError
        If the API answers with an error status code.
    """
    logfile = 'log_News.csv'
    output_file = 'df_all_news_final.csv'
    columns = ['date','title','symbols','sentiment','ticker','company']

    # Materialise the tickers so they can be counted and paired with their names.
    stocks = list(stocks)

    # Pair each ticker with ONE company name. Assigning the whole collection of
    # names to a column would align it by row position and label the articles
    # with unrelated companies.
    if company is None or isinstance(company, str):
        companies = [company] * len(stocks)
    else:
        companies = list(company)
        if len(companies) != len(stocks):
            raise ValueError(
                f'company holds {len(companies)} names but stocks holds {len(stocks)} tickers'
            )

    frames = []             # per-ticker frames, concatenated once at the end
    header_written = False  # the first write creates the file, later writes append

    for stock, company_name in tqdm(zip(stocks, companies), total=len(stocks),
                                    desc="Fetching News", unit="stock"):
        # Let requests build and encode the query string; the timeout keeps a
        # stalled connection from blocking the whole run.
        response = requests.get(
            'https://eodhistoricaldata.com/api/news',
            params={
                'api_token': api_key,
                's': stock,
                'limit': n_news,
                'offset': offset,
                'from': start_date,
                'to': end_date,
            },
            timeout=60,
        )
        # Fail with a clear HTTP error rather than a confusing parsing error.
        response.raise_for_status()
        news_json = response.json()

        # create dataframe from json (an empty response gives an empty frame)
        df_news = pd.DataFrame(news_json)

        # tag every article with the ticker it was fetched for and its company name
        df_news['ticker'] = stock
        df_news['company'] = company_name

        # Keep a fixed column layout; reindex adds missing columns instead of raising.
        df_news = df_news.reindex(columns=columns)

        # Log one line per ticker: the date of the first article in the response
        # (empty when there is none) and the number of articles received.
        first_date = df_news['date'].iloc[0] if len(df_news) else ''
        log_News(logfile, first_date, f'{stock}: {len(df_news)} articles')

        if not df_news.empty:
            frames.append(df_news)
            # Append only the new rows instead of rewriting the whole file each time.
            df_news.to_csv(
                output_file,
                sep=';',
                index=False,
                mode='a' if header_written else 'w',
                header=not header_written,
            )
            header_written = True

    if not frames:
        # Nothing was fetched: return (and save) an empty frame with the expected columns.
        all_news = pd.DataFrame(columns=columns)
        all_news.to_csv(output_file, sep=';', index=False)
        return all_news

    return pd.concat(frames, ignore_index=True)

# Run the download for every ticker in the table
df_all_news_final = get_all_news(
    stocks=stock_list,
    start_date=start_date,
    end_date=end_date,
    n_news=n_news,
    api_key=api,
    company=company_names,
)

Fetching News:   0%|          | 0/2022 [00:00<?, ?stock/s]

Fetching News: 100%|██████████| 2022/2022 [1:43:56<00:00,  3.08s/stock]  


In [54]:
# see the shape (a notebook only renders the last expression of a cell, so this
# line produces no output where it stands)
df_all_news_final.shape

# copy of df_all_news_final
df_all_news_copy = df_all_news_final.copy()

# drop rows with a missing value in any column (dropna returns a new frame,
# so no extra copy is needed)
df_all_news_copy_na = df_all_news_copy.dropna()

# drop duplicates: keep the first row of each (date, title, ticker) combination
df_all_news_copy_duplicates = df_all_news_copy_na.drop_duplicates(subset=['date', 'title','ticker'], keep='first')

# reset index
df_all_news_duplicates = df_all_news_copy_duplicates.reset_index(drop=True)

# remove the column company from the df_all_news_duplicates
df_all_news_duplicates = df_all_news_duplicates.drop(columns=['company'])

# Rebuild the company column by looking each ticker up in df_companies.
# A Symbol -> Name lookup is built once and applied with map, instead of
# filtering df_companies again for every single row. As before, the first Name
# is used when a Symbol occurs more than once; a ticker that is not in
# df_companies now yields a missing value instead of raising IndexError.
symbol_to_name = df_companies.drop_duplicates(subset='Symbol', keep='first').set_index('Symbol')['Name']
df_all_news_duplicates['company'] = df_all_news_duplicates['ticker'].map(symbol_to_name)

Unnamed: 0,date,title,symbols,sentiment,ticker,company
0,2023-06-01T22:45:24+00:00,Apple denies hacking thousands of iPhones in R...,"[AAPL.MX, AAPL.US, AAPL34.SA, APC.F, APC.XETRA]","{'polarity': 0.94, 'neg': 0.055, 'neu': 0.848,...",AAPL,Apple Inc
1,2023-06-01T22:02:25+00:00,Russia Accuses US Intelligence of Hacking Thou...,"[AAPL.MX, AAPL.US, AAPL34.SA, APC.F, APC.XETRA]","{'polarity': -0.942, 'neg': 0.091, 'neu': 0.83...",AAPL,Apple Inc
2,2023-06-01T21:43:00+00:00,Broadcom CEO Sees Rising AI Chip Demand. Earni...,"[AAPL.MX, AAPL.US, AAPL34.SA, APC.F, APC.XETRA...","{'polarity': 0.273, 'neg': 0, 'neu': 0.9, 'pos...",AAPL,Apple Inc
3,2023-06-01T20:55:29+00:00,Dow Jones Today: Debt Vote Drives Index Higher,"[AAPL.MX, AAPL.US, AAPL34.SA, AEC1.F, AEC1.XET...","{'polarity': 0.572, 'neg': 0, 'neu': 0.893, 'p...",AAPL,Apple Inc
4,2023-06-01T20:24:46+00:00,Trillion-dollar companies: How wealthy are the...,"[AAPL.MX, AAPL.US, AAPL34.SA, ABEA.F, ABEA.XET...","{'polarity': 0.44, 'neg': 0.061, 'neu': 0.847,...",AAPL,Apple Inc
...,...,...,...,...,...,...
180328,2022-07-13T12:31:00+00:00,Neuronetics Recognized as Top Non-Invasive Dev...,[STIM.US],"{'polarity': 0.929, 'neg': 0.079, 'neu': 0.795...",STIM,Neuronetics Inc
180329,2022-07-11T20:46:00+00:00,Neuronetics Announces Positive TMS Coverage Po...,[STIM.US],"{'polarity': 0.927, 'neg': 0.079, 'neu': 0.814...",STIM,Neuronetics Inc
180330,2022-06-27T18:55:25+00:00,Activist investor takes 13% stake in Malvern-b...,"[NRC.F, STIM.US]","{'polarity': 0.527, 'neg': 0, 'neu': 0.947, 'p...",STIM,Neuronetics Inc
180331,2022-06-27T11:43:11+00:00,"ClearPoint Neuro, Inc. (CLPT) Surges 7.6%: Is ...","[NRC.F, STIM.US]","{'polarity': 0.984, 'neg': 0.011, 'neu': 0.884...",STIM,Neuronetics Inc


In [55]:
# Write the de-duplicated news to disk: ';' as field separator, no index column
df_all_news_duplicates.to_csv(
    'news_final_no_duplicates_3.csv',
    index=False,
    sep=';',
)

In [46]:
# collect the unique tickers
# unique_tickers = df_all_news_duplicates['ticker'].unique()

# # save unique tickers as csv file
# unique_tickers_df = pd.DataFrame(unique_tickers)
# unique_tickers_df.to_csv('unique_tickers.csv', index=False)

# TODO: read the file news_final_no_duplicates_3.csv back in from disk


In [None]:
# Distinct tickers found in the downloaded news, as a plain Python list
# (used for the master dataframe)
tickers_list = pd.unique(df_all_news_final['ticker']).tolist()
tickers_list
# Store the list on disk as a CSV without an index column
df_tickers_list = pd.DataFrame(data=tickers_list)
df_tickers_list.to_csv(
    'tickers_list_news.csv',
    index=False,
)


In [None]:
# Backup of the full download as CSV (pandas' default separator, no index column)
df_all_news_final.to_csv(
    'df_all_news_final_BACKUP.csv',
    index=False,
)



In [18]:
# Column dtypes before the conversion. A notebook only renders the last
# expression of a cell, so this line produces no output.
df_all_news_copy.dtypes

# change the columns title, ticker and company to string
# NOTE(review): astype(str) turns missing values into the literal text 'nan';
# presumably df_all_news_copy can still contain missing values at this point -
# confirm before using string lengths computed from these columns.
df_all_news_copy[['title','ticker','company']] = df_all_news_copy[['title','ticker','company']].astype(str)
df_all_news_copy.dtypes


date         object
title        object
symbols      object
sentiment    object
ticker       object
company      object
dtype: object

In [19]:
# Mean number of characters per title. The vectorised .str.len() replaces
# apply(len); a missing title yields NaN, which mean() skips, instead of raising.
average_length_title = df_all_news_copy['title'].str.len().mean()

# average_length_article = df_all_news_final['content'].apply(len).mean()

print('Average length of title: ', average_length_title)
# print('Average length of article: ', average_length_article)

Average length of title:  46.682490303078126


In [20]:
import pandas as pd

# Per-title counts of words, commas and periods, stored as new columns of
# df_all_news_copy. Vectorised .str methods replace the per-row lambdas.

# words: split on whitespace, then count the pieces
df_all_news_copy['title_words'] = df_all_news_copy['title'].str.split().str.len()
df_all_news_copy['title_commas'] = df_all_news_copy['title'].str.count(',')
# str.count takes a regular expression, so the period has to be escaped
df_all_news_copy['title_periods'] = df_all_news_copy['title'].str.count(r'\.')

# df_all_news_final['content_words'] = df_all_news_final['content'].apply(lambda x: len(x.split()))
# df_all_news_final['content_commas'] = df_all_news_final['content'].apply(lambda x: x.count(','))
# df_all_news_final['content_periods'] = df_all_news_final['content'].apply(lambda x: x.count('.'))

# Calculate the average values for words, commas, and periods in the title
average_words_title = df_all_news_copy['title_words'].mean()
average_commas_title = df_all_news_copy['title_commas'].mean()
average_periods_title = df_all_news_copy['title_periods'].mean()

# average_words_content = df_all_news_final['content_words'].mean()
# average_commas_content = df_all_news_final['content_commas'].mean()
# average_periods_content = df_all_news_final['content_periods'].mean()

# Print the results
print('Average number of words in title: ', average_words_title)
print('Average number of commas in title: ', average_commas_title)
print('Average number of periods in title: ', average_periods_title)

# print('Average number of words in content: ', average_words_content)
# print('Average number of commas in content: ', average_commas_content)
# print('Average number of periods in content: ', average_periods_content)

Average number of words in title:  7.321816404945886
Average number of commas in title:  0.23402511791092792
Average number of periods in title:  0.15606125905197793
