In [197]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import utils as utils
from datetime import datetime as dt
from newspaper import Article
import os

In [64]:
BASE_URL = 'https://www.investing.com'
url = f'{BASE_URL}/equities/trending-stocks'
# A timeout keeps a stalled connection from hanging the notebook indefinitely,
# and raise_for_status() reports an HTTP error (4xx/5xx) right here instead of
# letting it show up later as a confusing parse failure.
response = requests.get(url, timeout=30)
response.raise_for_status()
request = response.text  # raw HTML of the trending-stocks page
raw_text = BeautifulSoup(request, 'html.parser')
# The rows of interest are looked up inside <div id="trendingInnerContent">.
trending_stocks = raw_text.find('div', {'id': 'trendingInnerContent'})
if trending_stocks is None:
    # find() returns None when the container is absent; fail with a clear message.
    raise RuntimeError(f'Could not find the "trendingInnerContent" container in {url}')
# One <td> per stock; each is expected to contain the stock's <a> element.
stock_elements = trending_stocks.find_all('td', class_='left bold plusIconTd elp')

In [67]:
def extract_stock_info(stock_elements):
    """Collect company name, ticker text and absolute link from each table cell.

    Args:
        stock_elements: iterable of parsed elements; the first ``<a>`` inside
            each one supplies the ``title`` attribute, the text and the ``href``.

    Returns:
        A ``(stock_dict, df)`` tuple. ``stock_dict`` maps ``'company'``,
        ``'stock'`` and ``'link'`` to parallel lists; ``df`` is the same data
        as a DataFrame.

    Side effects:
        Calls ``utils.create_path('../datasets')`` and writes the DataFrame to
        ``../datasets/stocks.csv`` without the index column.
    """
    companies, tickers, links = [], [], []
    for cell in stock_elements:
        anchor = cell.find('a')
        # Titles may contain non-breaking spaces; turn them into plain spaces.
        title = anchor['title'].replace('\xa0', ' ')
        ticker = anchor.text
        href = anchor['href']
        companies.append(title)
        tickers.append(ticker)
        # hrefs are site-relative, so prefix them with the site root.
        links.append(f'{BASE_URL}{href}')

    stock_dict = {'company': companies, 'stock': tickers, 'link': links}
    df = pd.DataFrame(stock_dict)
    utils.create_path('../datasets')
    df.to_csv('../datasets/stocks.csv', index=False)
    return stock_dict, df
# Build the stock table from the scraped cells; the call also writes ../datasets/stocks.csv.
stock_dict, df = extract_stock_info(stock_elements)

In [140]:
# Directory that holds one "<stock>.txt" file of news links per stock.
# create_path is a project helper; its return value is used below as that
# directory's path (presumably it also creates the directory — TODO confirm).
news_list_path = utils.create_path('../datasets/news_links')
def extract_news_links(stock_dict, news_list_path, max_num_pages=1, timeout=30):
    """Scrape news-article links for every stock and save them one per line.

    For each stock the pages ``{link}-news/1`` .. ``{link}-news/{max_num_pages}``
    are fetched, and the links found on all of them are written together to
    ``{news_list_path}/{stock_name}.txt`` (the file is overwritten on each run).

    Args:
        stock_dict: mapping with parallel ``'stock'`` and ``'link'`` lists.
        news_list_path: directory that receives the ``.txt`` files.
        max_num_pages: number of news pages to fetch per stock.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot hang the run.

    Returns:
        None. Errors for a stock are printed and that stock's remaining pages
        are skipped; links gathered before the error are still written.
    """
    for stock_name, link in zip(stock_dict['stock'], stock_dict['link']):
        news_links = []
        # True once at least one news listing was located, i.e. there is
        # something meaningful (possibly an empty list) to write for this stock.
        listing_found = False
        try:
            for page in range(1, max_num_pages + 1):
                page_url = f'{link}-news/{page}'
                html = requests.get(page_url, timeout=timeout).text
                soup = BeautifulSoup(html, 'html.parser')
                news_table = soup.find('ul', {'data-test': 'news-list'})
                news_list = news_table.find_all('article', {'data-test': 'article-item'})
                listing_found = True
                for news_data in news_list:
                    # Items whose markup contains 'mt-2.5' are skipped
                    # (presumably promoted/non-news entries — TODO confirm).
                    if str(news_data).find('mt-2.5') == -1:
                        # The second anchor of the item carries the article href.
                        news_link = news_data.findAll('a')[1]['href']
                        news_links.append(f'{BASE_URL}{news_link}')
        except Exception as e:
            print(f'Error for stock {stock_name}: {e}')

        if listing_found:
            # Write once per stock, after all pages were collected, so that a
            # later page no longer overwrites the links of earlier pages.
            try:
                with open(f'{news_list_path}/{stock_name}.txt', 'w') as file:
                    file.writelines(f'{news_link}\n' for news_link in news_links)
            except OSError as e:
                print(f'Error for stock {stock_name}: {e}')

In [141]:
def create_dict_of_links(news_list_path):
    """Load the saved link files into a ``{stock_name: [links]}`` dictionary.

    Every file in ``news_list_path`` is read as one link per line. The stock
    name is the file name with ``'.txt'`` removed.

    Args:
        news_list_path: directory containing the per-stock link files.

    Returns:
        dict mapping each stock name to its list of unique links, in the order
        they first appear in the file. Files with no links produce no entry.
    """
    news_dict = {}
    for file_name in os.listdir(news_list_path):
        with open(os.path.join(news_list_path, file_name), 'r') as file:
            # Strip the newline BEFORE de-duplicating: a last line without a
            # trailing '\n' must compare equal to the same link followed by one.
            stripped = [line.rstrip('\n') for line in file]
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the result is deterministic (a set would shuffle the links);
        # blank lines are not links and are dropped.
        links = list(dict.fromkeys(link for link in stripped if link))
        if not links:
            continue
        stock_name = file_name.replace('.txt', '')
        news_dict.setdefault(stock_name, []).extend(links)
    return news_dict
# Read every file in news_list_path back into a {stock_name: [links]} dict.
news_dict = create_dict_of_links(news_list_path)

In [223]:
def _article_title(soup):
    """Return the text of the page's ``<h1 class="articleHeader">`` element."""
    return soup.find('h1', {'class': 'articleHeader'}).text


def _article_timestamp(soup):
    """Return ``(date, time, am_pm)`` strings parsed from the page's publish line."""
    # The publish line is read from the span of the second 'contentSectionDetails' div.
    published = soup.findAll('div', {'class': 'contentSectionDetails'})[1].find('span').text
    # Remove the label and the last three characters before parsing
    # (presumably a timezone suffix such as ' ET' — TODO confirm).
    published = published.replace('Published ', '')[:-3]
    parsed = dt.strptime(published, '%b %d, %Y %I:%M%p')
    return parsed.strftime('%Y-%m-%d'), parsed.strftime('%H:%M'), parsed.strftime('%p')


def _strip_published_line(text):
    """Drop a leading 'Published ...' line, and the newlines after it, from ``text``."""
    if not text.startswith('Published'):
        return text
    _, newline, remainder = text.partition('\n')
    if not newline:
        # Nothing follows the first line; leave the text untouched rather than
        # chopping characters off it.
        return text
    return remainder.lstrip('\n')


def extract_news(news_dict, max_stocks=6, timeout=30):
    """Download and parse every linked article into a DataFrame.

    Args:
        news_dict: mapping of stock name to a list of article URLs.
        max_stocks: process at most this many stocks, in ``news_dict`` order;
            ``None`` processes all of them. The default keeps the previous
            hard-coded limit of six.
        timeout: seconds passed to ``requests.get`` for each article page.

    Returns:
        DataFrame with one row per link and the columns ``stock``, ``title``,
        ``text``, ``date`` (``YYYY-MM-DD``), ``time`` (24-hour ``HH:MM``) and
        ``am_pm``. A field that could not be parsed is ``None``; the failure is
        printed and processing continues.
    """
    columns = ['stock', 'title', 'text', 'date', 'time', 'am_pm']
    # One dict per article keeps all fields of a row together, so a failure in
    # one parser can never shift the columns against each other.
    records = []
    for inx, stock_name in enumerate(news_dict):
        if max_stocks is not None and inx >= max_stocks:
            break
        for link in news_dict[stock_name]:
            record = dict.fromkeys(columns)
            record['stock'] = stock_name
            records.append(record)

            soup = None
            try:
                html = requests.get(link, timeout=timeout).text
                soup = BeautifulSoup(html, 'html.parser')
            except Exception as e:
                print(f'Error in downloading {link} in stock: {stock_name} is: {e}')

            if soup is not None:
                # parsing the title of the article
                try:
                    record['title'] = _article_title(soup)
                except Exception as e:
                    print(f'Error in parsing ""Title(header)"" in stock: {stock_name} is: {e}')
                # parsing the date and time of the article
                try:
                    record['date'], record['time'], record['am_pm'] = _article_timestamp(soup)
                except Exception as e:
                    print(f'Error in parsing ""datetime"" in stock: {stock_name} is: {e}')

            # parsing the body of the article
            try:
                article = Article(link)
                article.download()
                article.parse()
                record['text'] = _strip_published_line(article.text)
            except Exception as e:
                print(f'Error in parsing ""article body"" in stock: {stock_name} is: {e}')
    return pd.DataFrame(records, columns=columns)
# NOTE(review): this rebinds `df`, which earlier held the stocks table returned
# by extract_stock_info; from here on `df` is the news-articles table.
df = extract_news(news_dict)

In [210]:
# Save the current `df` to ../datasets/news.csv, omitting the index column.
df.to_csv('../datasets/news.csv', index=False)

In [224]:
# Display the 'text' column of `df` to inspect what was scraped.
df['text']

0     © Reuters Adobe (ADBE) Brings Conversational A...
1     © Reuters. Figurines with computers and smartp...
2     © Reuters. FILE PHOTO: Volunteers watch for vo...
3     © Reuters. FILE PHOTO: European Union flags fl...
4     © Reuters. FILE PHOTO: People walk behind a lo...
5     © Reuters. Amazon Withdraws From iRobot Deal A...
6     © Reuters.\n\nCRM -0.30% Add to/Remove from Wa...
7     © Reuters.\n\nADBE +2.95% Add to/Remove from W...
8     © Reuters. The exterior of the Warner Bros. Di...
9     © Reuters. Alibaba (BABA) cut at Macquarie as ...
10    © Reuters. FILE PHOTO: A Squishmallow depictin...
11    2/2 © Reuters. FILE PHOTO: A keyboard and a sh...
12    © Reuters. FILE PHOTO: A passerby walks past a...
13    © Reuters\n\nBABA -0.21% Add to/Remove from Wa...
14    © Reuters.\n\nUS500 +0.03% Add to/Remove from ...
15    © Reuters. FILE PHOTO: A smartphone with a dis...
16                            Please try another search
17    © Reuters. FILE PHOTO: A self-driving GM B