In [1]:
#Importing necessary libraries
from dotenv import load_dotenv
from datetime import datetime, timedelta
import requests
import os
import time
import pandas as pd 
from SML.news_preprocess import process_news_articles    #Importing everything from 'news_preprocessing'
from SML.news_preprocess import exponential_moving_average
load_dotenv()

True

In [None]:
#Defining a function for fetching news

def fetch_news(api_key, ticker, start_date, end_date):
    base_url = os.environ.get("endpointnewsp")
    headers = {"Authorization": f"Bearer {api_key}"}
    all_news = []
    
    current_date = start_date

    while current_date <= end_date:
        batch_end_date = current_date + timedelta(days=50)
        if batch_end_date > end_date:
            batch_end_date = end_date

        params = {
            "ticker": ticker,
            "published_utc.gte": current_date.strftime('%Y-%m-%d'),
            "published_utc.lte": batch_end_date.strftime('%Y-%m-%d'),
            "limit": 50,
            "sort": "published_utc"
        }

        try:
            response = requests.get(base_url, headers=headers, params=params)
            if response.status_code == 200:
                data = response.json()
                articles = data.get('results', [])
                
                # Creating a DataFrame from articles
                df = pd.DataFrame(articles)
                
                # Adding primary_key column if ticker is found
                df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)
                
                all_news.append(df)  # Append DataFrame to the list
                print(f"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}")
                current_date = batch_end_date + timedelta(days=1)
            elif response.status_code == 429:
                print("Rate limit reached. Waiting to retry...")
                time.sleep(60)  # Wait for 60 seconds or as recommended by the API
                continue  # Retry the current request
            else:
                print(f"Failed to fetch data: {response.status_code}, {response.text}")
                break
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    return pd.concat(all_news, ignore_index=True)

#Usage
api_key = os.environ.get('API_NEWS')
ticker = 'NVDA'
end_date = datetime.now() - timedelta(days=1)  # Yesterday's date
start_date = end_date - timedelta(days=365 * 2)
news_articles = fetch_news(api_key, ticker, start_date, end_date)
print(f"Total articles fetched: {len(news_articles)}")

In [2]:
import os
import requests
from datetime import datetime, timedelta
import pandas as pd
import time

def fetch_news(api_key, ticker, start_date, end_date):
    base_url = "https://api.polygon.io/v2/reference/news"
    all_news = []
    
    current_date = start_date

    while current_date <= end_date:
        batch_end_date = current_date + timedelta(days=50)
        if batch_end_date > end_date:
            batch_end_date = end_date

        params = {
            "ticker": ticker,
            "published_utc.gte": current_date.strftime('%Y-%m-%d'),
            "published_utc.lte": batch_end_date.strftime('%Y-%m-%d'),
            "limit": 50,
            "sort": "published_utc",
            "apiKey": api_key  # Include the API key as a query parameter
        }

        try:
            response = requests.get(base_url, params=params)
            if response.status_code == 200:
                data = response.json()
                articles = data.get('results', [])
                
                if articles:
                    # Creating a DataFrame from articles
                    df = pd.DataFrame(articles)
                    
                    # Adding ticker column
                    df['ticker'] = ticker
                    
                    all_news.append(df)  # Append DataFrame to the list
                    print(f"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}")
                else:
                    print(f"No articles found from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}")
                    
                current_date = batch_end_date + timedelta(days=1)
            elif response.status_code == 429:
                print("Rate limit reached. Waiting to retry...")
                time.sleep(60)  # Wait for 60 seconds or as recommended by the API
                continue  # Retry the current request
            else:
                print(f"Failed to fetch data: {response.status_code}, {response.text}")
                break
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    if all_news:
        return pd.concat(all_news, ignore_index=True)
    else:
        print("No news articles were fetched.")
        return pd.DataFrame()

# Usage
api_key = os.environ.get('API_NEWS')
ticker = 'NVDA'
end_date = datetime.now() - timedelta(days=1)  # Yesterday's date
start_date = end_date - timedelta(days=365 * 2)
news_articles = fetch_news(api_key, ticker, start_date, end_date)
print(f"Total articles fetched: {len(news_articles)}")


Fetched 50 articles from 2022-08-05 to 2022-09-24
Fetched 50 articles from 2022-09-25 to 2022-11-14
Fetched 50 articles from 2022-11-15 to 2023-01-04
Fetched 50 articles from 2023-01-05 to 2023-02-24
Fetched 50 articles from 2023-02-25 to 2023-04-16
Rate limit reached. Waiting to retry...
Fetched 50 articles from 2023-04-17 to 2023-06-06
Fetched 50 articles from 2023-06-07 to 2023-07-27
Fetched 50 articles from 2023-07-28 to 2023-09-16
Fetched 50 articles from 2023-09-17 to 2023-11-06
Fetched 50 articles from 2023-11-07 to 2023-12-27
Rate limit reached. Waiting to retry...
Fetched 50 articles from 2023-12-28 to 2024-02-16
Fetched 50 articles from 2024-02-17 to 2024-04-07
Fetched 50 articles from 2024-04-08 to 2024-05-28
Fetched 50 articles from 2024-05-29 to 2024-07-18
Fetched 50 articles from 2024-07-19 to 2024-08-04
Total articles fetched: 750


In [3]:
# Process the news articles
df = process_news_articles(news_articles)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       66 non-null     object 
 1   ticker     66 non-null     object 
 2   sentiment  66 non-null     float64
dtypes: float64(1), object(2)
memory usage: 1.7+ KB


In [5]:
df.head()

Unnamed: 0,date,ticker,sentiment
0,2022-09-20,NVDA,0.126767
1,2022-09-21,NVDA,0.162648
2,2022-09-22,NVDA,0.262075
3,2022-09-23,NVDA,0.130787
4,2022-09-24,NVDA,0.33941


In [6]:
df= df.sort_index(ascending=False)

In [7]:
print(df.head())
print(df.info())


          date ticker  sentiment
65  2024-08-04   NVDA   0.022477
64  2024-08-03   NVDA  -0.001464
63  2024-08-02   NVDA   0.061445
62  2024-08-01   NVDA   0.165074
61  2024-07-31   NVDA   0.162192
<class 'pandas.core.frame.DataFrame'>
Index: 66 entries, 65 to 0
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       66 non-null     object 
 1   ticker     66 non-null     object 
 2   sentiment  66 non-null     float64
dtypes: float64(1), object(2)
memory usage: 2.1+ KB
None


In [8]:
#Putting the news articles into a csv
df.to_csv('news_articles.csv', index=False)

In [9]:
df_processed = exponential_moving_average(df, window=7)

In [10]:
df_processed.to_csv('news_articles_ema.csv', index=False)

In [11]:
df_processed.head()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
65,2024-08-04,NVDA,0.022477,0.022477
64,2024-08-03,NVDA,-0.001464,0.016492
63,2024-08-02,NVDA,0.061445,0.02773
62,2024-08-01,NVDA,0.165074,0.062066
61,2024-07-31,NVDA,0.162192,0.087098


In [12]:
df_processed.tail()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
4,2022-09-24,NVDA,0.33941,0.20033
3,2022-09-23,NVDA,0.130787,0.182944
2,2022-09-22,NVDA,0.262075,0.202727
1,2022-09-21,NVDA,0.162648,0.192707
0,2022-09-20,NVDA,0.126767,0.176222


In [13]:
print(df_processed['date'].min())
print(df_processed['date'].max())

2022-09-20
2024-08-04


In [14]:
print(df_processed['date'].max() - df_processed['date'].min()) 

684 days, 0:00:00


In [15]:
df_processed.shape

(66, 4)

In [16]:
duplicates = df_processed[df_processed.duplicated('date')]

In [17]:
duplicates.shape

(0, 4)

In [18]:
df_processed.head()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
65,2024-08-04,NVDA,0.022477,0.022477
64,2024-08-03,NVDA,-0.001464,0.016492
63,2024-08-02,NVDA,0.061445,0.02773
62,2024-08-01,NVDA,0.165074,0.062066
61,2024-07-31,NVDA,0.162192,0.087098
