In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import utils as utils
from datetime import datetime as dt
from newspaper import Article
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Getting the Trending Stocks
We will scrape investing.com for trending stocks

In [28]:
# Page that lists the currently trending equities.
url = 'https://www.investing.com/equities/trending-stocks'
# Site root; relative hrefs scraped from the site are appended to it to form absolute links.
BASE_URL = 'https://www.investing.com'

# Bound the request and fail on an HTTP error status, so a blocked or failed
# fetch is reported as such instead of being parsed as if it were the real page.
response = requests.get(url, timeout=30)
response.raise_for_status()
request = response.text  # despite the name, this is the response body (HTML text)
raw_text = BeautifulSoup(request, 'html.parser')

# Container holding the trending-stocks table.
trending_stocks = raw_text.find('div', {'id': 'trendingInnerContent'})
if trending_stocks is None:
    # Without this guard the next line fails with an opaque AttributeError on None.
    raise RuntimeError(
        f"Could not find the 'trendingInnerContent' container in {url}; "
        'the page layout may have changed or the request was blocked.'
    )
# One <td> per trending stock; each is expected to wrap the anchor with the stock's name and link.
stock_elements = trending_stocks.find_all('td', class_='left bold plusIconTd elp')

Extract the info for each trending stock: the company's full name, the stock name, and the link to the stock's page on investing.com (where its news and other information can be found).

In [29]:
def extract_stock_info(stock_elements):
    """Collect company name, stock name and absolute link for each table cell.

    Args:
        stock_elements: Iterable of parsed elements; each must expose
            ``find('a')`` returning an anchor that has ``title`` and ``href``
            attributes and a ``text`` value.

    Returns:
        tuple: ``(stock_dict, df)`` where ``stock_dict`` maps ``'company'``,
        ``'stock'`` and ``'link'`` to parallel lists, and ``df`` is the
        DataFrame built from that dict.
    """
    rows = []
    for cell in stock_elements:
        anchor = cell.find('a')
        # The title attribute can contain non-breaking spaces; normalise them.
        company = anchor['title'].replace('\xa0', ' ')
        label = anchor.text
        href = anchor['href']
        rows.append((company, label, f'{BASE_URL}{href}'))

    stock_dict = {
        'company': [row[0] for row in rows],
        'stock': [row[1] for row in rows],
        'link': [row[2] for row in rows],
    }
    return stock_dict, pd.DataFrame(stock_dict)
# Build both the column-oriented dict and the DataFrame from the scraped table cells.
stock_dict, df = extract_stock_info(stock_elements)

Use the static JSON file `ticker.json` to add each stock's ticker symbol to the dataframe.

In [40]:
def get_ticker_names(json_path, df, output_csv='../datasets/stocks.csv'):
    """Add a ``ticker`` column to ``df`` from a JSON name-to-ticker mapping and save it.

    Args:
        json_path: Path to a JSON file mapping stock names (as found in
            ``df['stock']``) to ticker symbols.
        df: DataFrame with a ``stock`` column. It is modified in place.
        output_csv: Where the resulting table is written. Defaults to the
            location this notebook has always used.

    Returns:
        tuple: ``(df, stock_ticker_dict)`` - the same DataFrame object with the
        new ``ticker`` column, and the mapping loaded from ``json_path``.

    Stocks missing from the mapping get ``None`` as their ticker and a message
    is printed. Every row always receives a value, so the column assignment
    can no longer fail with a length mismatch when a name is unknown.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        stock_ticker_dict = json.load(f)

    ticker_list = []
    for stock_name in df['stock']:
        ticker = stock_ticker_dict.get(stock_name)
        if ticker is None:
            print(f'{stock_name} not found in ticker list, you need to update the json file')
        # Append unconditionally so ticker_list stays aligned with df's rows.
        ticker_list.append(ticker)
    df['ticker'] = ticker_list

    output_dir = os.path.dirname(output_csv)
    if output_dir:
        # assumes utils.create_path creates the directory if needed - TODO confirm
        utils.create_path(output_dir)
    df.to_csv(output_csv, index=False)
    return df, stock_ticker_dict

# JSON file mapping the scraped stock names to their ticker symbols.
json_path = '../datasets/ticker.json'
# Adds the `ticker` column to `df` and, as a side effect, writes ../datasets/stocks.csv.
df, stock_ticker_dict = get_ticker_names(json_path, df)

Extract the news links for each stock and save them in a file named after the stock's ticker.
<br>
You can increase the number of pages to scrape more news articles; by default, each page has 10 news items.
<br>
Some news items are tagged as Pro and require an investing.com subscription to read, so we ignore them.

In [38]:
# Preview the stocks table, which now includes the ticker column.
df.head()

Unnamed: 0,company,stock,link,ticker
0,Tesla Inc,Tesla,https://www.investing.com/equities/tesla-motors,TSLA
1,Apple Inc,Apple,https://www.investing.com/equities/apple-compu...,AAPL
2,Beyond Meat Inc,Beyond Meat,https://www.investing.com/equities/beyond-meat...,BYND
3,Coinbase Global Inc,Coinbase Global,https://www.investing.com/equities/coinbase-gl...,COIN
4,Salesforce Inc,Salesforce Inc,https://www.investing.com/equities/salesforce-com,CRM


In [41]:
# Directory that will hold one <ticker>.txt file of news links per stock.
# assumes utils.create_path creates the directory and returns its path - TODO confirm
news_list_path = utils.create_path('../datasets/news_links')
def extract_news_links(stock_dict, news_list_path, stock_ticker_dict,  max_num_pages=1, timeout=30):
    """Scrape news-article links for every stock and save them per ticker.

    For each stock, pages ``1..max_num_pages`` of ``<link>-news/<page>`` are
    fetched and the links of all articles found are written, one per line, to
    ``<news_list_path>/<ticker>.txt``.

    Args:
        stock_dict: Dict with parallel ``'stock'`` (names) and ``'link'``
            (stock page URLs) lists.
        news_list_path: Existing directory that receives the link files.
        stock_ticker_dict: Mapping from stock name to ticker symbol.
        max_num_pages: Number of news pages to scrape per stock.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        None. Errors are reported with ``print`` and do not stop the run.

    Links from all pages are collected first and the file is written once per
    stock. Previously the file was reopened in write mode for every page, so
    only the last page's links survived when ``max_num_pages > 1``.
    """
    for stock_name, link in zip(stock_dict['stock'], stock_dict['link']):
        stock_ticker = stock_ticker_dict.get(stock_name)
        if stock_ticker is None:
            # An unmapped stock used to raise KeyError and abort every remaining stock.
            print(f'Error for stock {stock_name}: no ticker found, skipping')
            continue

        news_links = []
        pages_parsed = 0
        try:
            for page in range(1, max_num_pages + 1):
                page_url = f'{link}-news/{page}'
                response = requests.get(page_url, timeout=timeout)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                news_table = soup.find('ul', {'data-test': 'news-list'})
                news_list = news_table.find_all('article', {'data-test': 'article-item'})
                for news_data in news_list:
                    # Articles whose markup contains 'mt-2.5' are skipped;
                    # presumably that marks the subscriber-only ("pro") items - confirm.
                    if 'mt-2.5' in str(news_data):
                        continue
                    anchors = news_data.find_all('a')
                    if len(anchors) < 2:
                        # The article link is read from the second anchor; skip
                        # items that lack one instead of aborting the stock.
                        continue
                    news_link = anchors[1]['href']
                    news_links.append(f'{BASE_URL}{news_link}')
                pages_parsed += 1
        except Exception as e:
            print(f'Error for stock {stock_name}: {e}')

        # Keep whatever was collected before an error, but do not create a file
        # when not even the first page could be parsed.
        if pages_parsed:
            with open(f'{news_list_path}/{stock_ticker}.txt', 'w') as file:
                file.writelines(f'{news_link}\n' for news_link in news_links)
# Scrape one page of news links per stock; raise max_num_pages to collect more.
extract_news_links(stock_dict, news_list_path, stock_ticker_dict, max_num_pages=1)

Create a dictionary of the available news links from the per-stock link files created above.

In [42]:
def create_dict_of_links(news_list_path):
    """Load the saved link files into a ``{name: [links]}`` dictionary.

    Args:
        news_list_path: Directory containing ``<name>.txt`` files with one
            link per line.

    Returns:
        dict: Maps each file's base name (without ``.txt``) to its unique
        links in the order they first appear in the file. Files that contain
        no links are omitted.

    Compared with de-duplicating through ``set``, the result is deterministic
    from run to run. Files are visited in sorted order, blank lines are
    ignored, and directory entries that are not ``.txt`` files are skipped.
    """
    news_dict = {}
    # os.listdir order is arbitrary; sort it so the key order is reproducible.
    for file_name in sorted(os.listdir(news_list_path)):
        stock_name, extension = os.path.splitext(file_name)
        if extension != '.txt':
            continue
        with open(os.path.join(news_list_path, file_name), 'r') as file:
            stripped = (line.strip() for line in file)
            # dict.fromkeys removes duplicates while keeping first-seen order.
            links = list(dict.fromkeys(link for link in stripped if link))
        if links:
            news_dict[stock_name] = links
    return news_dict
# Load the saved link files back into a {ticker: [links]} mapping.
news_dict = create_dict_of_links(news_list_path)

In [73]:
def _parse_title(soup):
    """Return the headline text of the ``h1`` element with class ``articleHeader``."""
    return soup.find('h1', {'class': 'articleHeader'}).text


def _parse_published(soup):
    """Return ``(date, time, am_pm)`` strings parsed from the article's details block."""
    raw = soup.find_all('div', {'class': 'contentSectionDetails'})[1].find('span').text
    # Drop the 'Published ' prefix and the last three characters
    # (presumably a trailing time-zone tag such as ' ET' - confirm) before parsing.
    published = dt.strptime(raw.replace('Published ', '')[:-3], '%b %d, %Y %I:%M%p')
    return (
        published.strftime('%Y-%m-%d'),
        published.strftime('%H:%M'),
        published.strftime('%p'),
    )


def _parse_body(soup):
    """Return the concatenated paragraph text of the article body, cleaned up."""
    container = soup.find('div', {'class': 'WYSIWYG articlePage'})
    text = ''.join(paragraph.text for paragraph in container.find_all('p'))
    text = text.replace('Position added successfully to:', '')
    return text.replace('\n', ' ')


def extract_news(news_dict, max_stocks=6, timeout=30):
    """Download every linked article and return the parsed fields as a DataFrame.

    Args:
        news_dict: Mapping from stock identifier to a list of article URLs.
        max_stocks: Only the first ``max_stocks`` keys of ``news_dict`` are
            processed. The default of 6 matches the cap that was previously
            hard-coded; pass ``None`` to process every stock.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        pandas.DataFrame: One row per article link with the columns ``stock``,
        ``title``, ``text``, ``date`` (``YYYY-MM-DD``), ``time`` (24-hour
        ``HH:MM``) and ``am_pm``. A field that could not be fetched or parsed
        is ``None``; a message is printed for each failure.

    Each link gets exactly one record, so the columns can never end up with
    different lengths. Previously a failed body parse (or an empty body, which
    hit an unbound variable) left the text list shorter than the others and
    building the DataFrame failed.
    """
    columns = ['stock', 'title', 'text', 'date', 'time', 'am_pm']
    records = []
    for inx, stock_name in enumerate(news_dict):
        if max_stocks is not None and inx >= max_stocks:
            break
        for link in news_dict[stock_name]:
            # Start from an all-None record so every failure path stays aligned.
            record = dict.fromkeys(columns)
            record['stock'] = stock_name
            records.append(record)

            try:
                response = requests.get(link, timeout=timeout)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                print(f'Error in fetching ""article"" in stock: {stock_name} is: {e}')
                continue

            # parsing the title of the article
            try:
                record['title'] = _parse_title(soup)
            except Exception as e:
                print(f'Error in parsing ""Title(header)"" in stock: {stock_name} is: {e}')

            # parsing the date and time of the article
            try:
                record['date'], record['time'], record['am_pm'] = _parse_published(soup)
            except Exception as e:
                print(f'Error in parsing ""datetime"" in stock: {stock_name} is: {e}')

            # parsing the body of the article
            try:
                text = _parse_body(soup)
                if not text.strip():
                    print(f'Error in parsing ""article body"" in stock: {stock_name} is: empty article body')
                record['text'] = text
            except Exception as e:
                print(f'Error in parsing ""article body"" in stock: {stock_name} is: {e}')

    return pd.DataFrame(records, columns=columns)
# NOTE(review): this rebinds `df`, replacing the stocks table built earlier with the news table.
df = extract_news(news_dict)

In [74]:
# Preview the first scraped articles.
df.head()

Unnamed: 0,stock,title,text,date,time,am_pm
0,AAPL,BofA expects 'strong refresh cycle' for iPhone...,Citing findings from their global smartphone...,2024-02-28,07:49,AM
1,AAPL,Apple cancels decade-long electric car project...,By Stephen Nellis and Shivansh Tiwary(Reuter...,2024-02-27,15:47,PM
2,AAPL,Marketmind: Calm prevails before inflation dat...,A look at the day ahead in European an...,2024-02-28,00:41,AM
3,AAPL,"Apple Halts Electric Car Project Titan, Shifts...",Quiver Quantitative - In a surprising ...,2024-02-27,15:44,PM
4,AAPL,Marketmind: US tracking 3%+ growth; Apple down...,A look at the day ahead in U.S. and global...,2024-02-28,06:02,AM


In [75]:
# Persist the scraped articles; index=False keeps the row index out of the CSV.
df.to_csv('../datasets/stock_news.csv', index=False)