<a href="https://colab.research.google.com/github/skywalker290/Financial-News-Analyser/blob/main/Finance_News_Article_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.2-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.6/97.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (

In [None]:
import pandas as pd
import requests
from newspaper import Article
import lxml.html.clean
import nltk
nltk.download('punkt')

import yfinance as yf
from tqdm import tqdm



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def get_company_symbols():
  """
  Return the ticker symbols listed in the Wikipedia S&P 500 table.

  The data is read (over the network) from the first HTML table of
  https://en.wikipedia.org/wiki/List_of_S%26P_500_companies.

  Returns:
    list: values of the table's ``Symbol`` column, in table order.
  """
  wikipedia_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
  # Only the first table (the constituents list) is needed. Indexing further
  # tables would raise IndexError if the page layout ever shrinks to one table.
  snp_company_list = wikipedia_tables[0]
  return snp_company_list['Symbol'].tolist()


def get_company_names():
  """
  Return the company names listed in the Wikipedia S&P 500 table.

  The data is read (over the network) from the first HTML table of
  https://en.wikipedia.org/wiki/List_of_S%26P_500_companies.

  Returns:
    list: values of the table's ``Security`` column, in table order.
  """
  wikipedia_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
  # Only the first table (the constituents list) is needed. Indexing further
  # tables would raise IndexError if the page layout ever shrinks to one table.
  snp_company_list = wikipedia_tables[0]
  return snp_company_list['Security'].tolist()



def get_article_links(company, size=10):
  """
  Search the CNBC (queryly) JSON endpoint for articles about a company.

  company -> search term (company name or symbol)
  size    -> maximum number of links to return

  return:
  list of article URLs (at most ``size``); results whose ``cn:type`` is
  ``cnbcvideo`` are skipped. Any request/parsing failure is printed and an
  empty (or partial) list is returned instead of raising.
  """
  from urllib.parse import quote  # local import keeps this block self-contained

  # Percent-encode the search term so names such as "AT&T" do not inject an
  # extra query parameter into the URL.
  query = quote(str(company), safe='')
  url = f"https://api.queryly.com/cnbc/json.aspx?queryly_key=31a35d40a9a64ab3&query={query}&endindex=40&batchsize={size*3}&callback=&showfaceted=false&timezoneoffset=-330&facetedfields=formats&facetedkey=formats%7C&facetedvalue=!Press%20Release%7C&additionalindexes=4cd6f71fbf22424d,937d600b0d0d4e23,3bfbe40caee7443e,626fdfcd96444f28"

  links = []
  try:
    # A timeout stops one stalled request from hanging a run over hundreds of
    # companies; network errors are reported like any other failure below.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
      data = response.json()
      for item in data['results']:
        # Some results carry no 'cn:type' key; .get() keeps one such entry
        # from aborting the whole loop with a KeyError.
        if item.get('cn:type') == 'cnbcvideo':
          continue
        link = item.get('url')
        if link:
          links.append(link)
    else:
      print("Error:", response.status_code)
  except Exception as e:
    print(company,':',e)

  # Three times ``size`` results are requested, so trim after filtering.
  return links[:size]


  #####

# Smoke test: runs when the cell executes and sends a live request through
# get_article_links, printing the returned list of links.
print(get_article_links('AES Corporation'))




def get_article(url):
  """
  Download and parse one article with newspaper's ``Article``.

  input-> URL of article

  Return-> [article.title, article.text, publish_date]
           ``publish_date`` is the first 10 characters of the parsed date
           (``YYYY-MM-DD`` for a datetime) or None when the article has no
           publish date. On any download/parse failure the error is printed
           and an empty list is returned.
  """

  try:
    article = Article(url, language="en")
    article.download()
    article.parse()
    # article.nlp() is intentionally not called: none of the returned fields
    # depend on it, and it only adds work and another way to fail.

    published = article.publish_date
    # A missing date used to become the literal string "None"; return a real
    # None so downstream code can detect it.
    publish_date = str(published)[:10] if published else None

    return [article.title, article.text, publish_date]

  except Exception as e:
    print(url,':',e)
    return []



#1
def get_links_DataFrame():
    """
    Collect article links for every company returned by get_company_names().

    Each company name is passed to get_article_links(); every returned link
    becomes one row tagged with that company's symbol and name.

    Returns:
        DataFrame['links','symbol','company'] (empty, with those columns, when
        no links were found). The index is not reset: it restarts at 0 for each
        company, as before.
    """
    names = get_company_names()
    symbols = get_company_symbols()
    columns = ['links', 'symbol', 'company']

    # Collect per-company frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all earlier rows on
    # every iteration.
    frames = []
    progress = tqdm(zip(names, symbols), total=len(names),
                    desc="Processing Companies", unit="company")
    for name, symbol in progress:
        links = get_article_links(name)
        if not links:
            # Nothing to add; also avoids concatenating empty frames.
            continue
        company_df = pd.DataFrame(links, columns=['links'])
        company_df['symbol'] = symbol
        company_df['company'] = name
        frames.append(company_df)

    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, axis=0)



#2
def get_article_data(df):
    """
    Fetch title, text and publish date for every article link in ``df``.

    df['links'] -> links for all the articles (if there is no 'links' column,
                   the first column is used, as before)

    returns:
    a new DataFrame: ``df`` with a fresh 0..n-1 index plus the columns
    ['Title', 'Text', 'Publishdate']. Rows whose article could not be fetched
    get missing values in those three columns. The input frame is not modified.
    """
    article_columns = ['Title', 'Text', 'Publishdate']
    links = df['links'] if 'links' in df.columns else df.iloc[:, 0]

    rows = []
    progress = tqdm(links, total=len(df), desc="Fetching Article Data", unit="article")
    for url in progress:
        fields = get_article(url) or []
        if len(fields) != len(article_columns):
            # get_article returns [] on failure; pad so every row has the same
            # width (an all-failure run would otherwise raise in DataFrame()).
            fields = [None] * len(article_columns)
        rows.append(fields)

    articles = pd.DataFrame(rows, columns=article_columns)
    # reset_index on a copy: rows are aligned by position, and the caller's
    # frame keeps its original index.
    return pd.concat([df.reset_index(drop=True), articles], axis=1)



def price_change(symbol, date, limit=10):
    '''
    Return Close - Open for ``symbol`` on the first day, starting at ``date``,
    for which yfinance returns data.

    symbol -> company symbol
    date   -> start date (anything pd.to_datetime can parse)
    limit  -> maximum number of consecutive calendar days to try, so that
              weekends / inactive market days are skipped

    Returns the price difference, or None when the date is missing or
    unparseable, when no data is found within ``limit`` days, or when the
    download raises (the error is printed).
    '''
    day = pd.to_datetime(date, errors='coerce')
    if pd.isna(day):
        # Articles without a publish date: nothing to look up, skip the network.
        return None

    one_day = pd.Timedelta(days=1)
    while limit > 0:
        try:
            stock_data = yf.download(
                symbol,
                start=day.strftime('%Y-%m-%d'),
                end=(day + one_day).strftime('%Y-%m-%d'),
                progress=False,
            )
            if not stock_data.empty:
                opens = stock_data['Open']
                closes = stock_data['Close']
                # Some yfinance versions return MultiIndex (field, ticker)
                # columns even for one ticker, making these DataFrames; take
                # the single ticker column so the result stays a scalar.
                if isinstance(opens, pd.DataFrame):
                    opens = opens.iloc[:, 0]
                if isinstance(closes, pd.DataFrame):
                    closes = closes.iloc[:, 0]
                return closes.iloc[0] - opens.iloc[0]
        except Exception as e:
            # Best effort: report the failure instead of hiding it, then give up
            # on this row.
            print(symbol, ':', e)
            return None

        # No data for this day (e.g. market closed): try the next calendar day.
        day = day + one_day
        limit -= 1

    return None





def add_stock_change_column(df):
    """
    Calculate stock changes and add a 'Change' column to the DataFrame.

    price_change() is called once per row with that row's symbol and publish
    date; the results are stored, in row order, in a new 'Change' column.

    Args:
    df (DataFrame): DataFrame containing 'symbol' and 'Publishdate' columns.

    Returns:
    DataFrame: the same DataFrame object, with the 'Change' column added in place.
    """
    # Iterate over the column values positionally. Label-based df['symbol'][i]
    # fails (or returns a Series) when the index is not a clean 0..n-1 range.
    rows = tqdm(zip(df['symbol'], df['Publishdate']), total=len(df),
                desc="Calculating Changes", unit="article")
    changes = [price_change(symbol, publish_date) for symbol, publish_date in rows]

    df['Change'] = changes
    return df




# def get_articles_data(urls, df,symbol):
#     """
#     Inputs:
#       Urls -> list of links for the articles
#       df -> df to which the extracted data is to be concatenated
#       symbol-> stock symbol of the company
#     """
#     data = []
#     for url in urls:
#         article_data = get_article(url)
#         if article_data:
#             data.append(article_data)

#     new_df = pd.DataFrame(data, columns=['Title', 'Text', 'PublishDate','symbol'])
#     new_df['symbol'] = symbol
#     df = pd.concat([df, new_df], axis=1)
#     return df


def Main(articles_csv='/content/drive/MyDrive/stock_data_articles.csv',
         output_csv='/content/drive/MyDrive/stock_data_Complete.csv'):
  """
  Run the final pipeline stage: add the 'Change' column to the saved articles.

  Args:
    articles_csv: path of the CSV of article data to read. Defaults to the
      Google Drive location used originally.
    output_csv: path the completed CSV is written to (without the index).
      Defaults to the Google Drive location used originally.

  Returns:
    DataFrame: the article data with the 'Change' column added.
  """
  # Earlier stages, kept for reference; they are disabled here and the
  # articles CSV is read from disk instead:
  # df = get_links_DataFrame()
  # df.to_csv('/content/drive/MyDrive/stock_data_links.csv', index=False)
  # df = get_article_data(df)
  # df.to_csv('/content/drive/MyDrive/stock_data_articles.csv', index=False)
  df = pd.read_csv(articles_csv)
  df = add_stock_change_column(df)

  df.to_csv(output_csv, index=False)

  return df














AES Corporation : 'cn:type'
['https://www.cnbc.com/select/best-big-bank-checking-accounts-2023/', 'https://www.cnbc.com/2023/08/26/amazon-biometric-payments-privacy-concerns.html']


In [None]:
# Company names from get_company_names() (one network fetch of the Wikipedia page).
# NOTE(review): not used again in this cell — confirm whether it is still needed.
companies_names = get_company_names()

# Ticker symbols from get_company_symbols() (a second fetch of the same page).
companies = get_company_symbols()

# Run the pipeline: reads the saved articles CSV, adds 'Change', writes the result.
df = Main()

# Bare last expression: the resulting frame is displayed as the cell output.
df








[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Calculating Changes:  27%|██▋       | 1237/4589 [04:14<10:26,  5.35article/s][A[A

Calculating Changes:  27%|██▋       | 1238/4589 [04:14<10:53,  5.13article/s][A[A

Calculating Changes:  27%|██▋       | 1239/4589 [04:14<11:06,  5.03article/s][A[A

Calculating Changes:  27%|██▋       | 1240/4589 [04:15<10:49,  5.16article/s][A[A

Calculating Changes:  27%|██▋       | 1241/4589 [04:15<10:10,  5.48article/s][A[A

Calculating Changes:  27%|██▋       | 1242/4589 [04:15<10:24,  5.36article/s][A[A

Calculating Changes:  27%|██▋       | 1243/4589 [04:15<09:44,  5.72article/s][A[A

Calculating Changes:  27%|██▋       | 1244/4589 [04:15<09:45,  5.71article/s][A[A

Calculating Changes:  27%|██▋       | 1245/4589 [04:15<09:54,  5.63article/s][A[A

Calculating Changes:  27%|██▋       | 1246/4589 [04:16<09:21,  5.96article/s][A[A

Calculating Changes:  27%|██▋       | 1247/4589 [04:16<09:47,  5.68article/s][A[A



In [None]:
# ERROR:yfinance:['CDW']: Exception("%ticker%: Data doesn't exist for startDate = 1362632400, endDate = 1362718800")
# ERROR:yfinance:['CEG']: (rest of the message was truncated in the captured log)
# ERROR:yfinance:['FANG']: Exception("%ticker%: Data doesn't exist for startDate = 1349150400, endDate = 1349236800")
# ERROR:yfinance:['FOXA']: Exception("%ticker%: Data doesn't exist for startDate = 1311912000, endDate = 1311998400")


