In [22]:
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import yfinance as yf
from newsapi import NewsApiClient
from newspaper import Article
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# ---------- SETTINGS ----------
# The NewsAPI key is read from the environment so the secret never lives in the
# notebook or its version history. Export NEWS_API_KEY before starting the
# kernel. A key that was ever committed in plain text should be considered
# leaked and rotated.
NEWS_API_KEY = os.environ.get('NEWS_API_KEY', '')
if not NEWS_API_KEY:
    print("NEWS_API_KEY is not set; export it before fetching news.")
COMPANY_NAME = 'Apple'  # search term passed to the news query
TICKER = 'AAPL'         # symbol passed to the stock-data download
DAYS = 28               # size of the news look-back window, in days
# -------------------------------

# ========== 1. Fetch News ==========
newsapi = NewsApiClient(api_key=NEWS_API_KEY)

# Take a single clock reading so both ends of the search window are derived
# from the same instant (two separate now() calls could straddle midnight).
_window_end = datetime.now()
to_date = _window_end.strftime('%Y-%m-%d')
from_date = (_window_end - timedelta(days=DAYS)).strftime('%Y-%m-%d')

def fetch_articles(company_name):
    """Query the news client for English articles matching ``company_name``.

    The search window is taken from the module-level ``from_date`` and
    ``to_date`` strings, and one page of up to 100 results sorted by relevancy
    is requested from the module-level ``newsapi`` client.

    Returns:
        The list stored under the response's ``'articles'`` key. If anything
        raises, the error is printed and an empty list is returned instead.
    """
    try:
        query = dict(
            q=company_name,
            from_param=from_date,
            to=to_date,
            language='en',
            page_size=100,
            sort_by='relevancy',
        )
        response = newsapi.get_everything(**query)
        return response['articles']
    except Exception as exc:
        print(f"An error occurred: {exc}")
        return []

def scrape_full_content(url):
    """Download and parse the page at ``url`` and return its extracted text.

    Best-effort: any exception raised while downloading or parsing is printed
    together with the URL, and ``None`` is returned so the caller can skip
    the article.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        body_text = page.text
    except Exception as exc:
        print(f"Failed to fetch article at {url}: {exc}")
        return None
    return body_text

def collect_news_data(company_name):
    """Fetch articles for ``company_name`` and return them as a DataFrame.

    Every article returned by ``fetch_articles`` is scraped with
    ``scrape_full_content``; articles whose scrape yields nothing (``None`` or
    an empty string) are skipped.

    Returns:
        A DataFrame with the columns ``company``, ``title``, ``description``,
        ``url``, ``published_at`` and ``content``. The columns are present even
        when no article survives, so downstream column lookups such as
        ``df['published_at']`` keep working on an empty result.
    """
    columns = ['company', 'title', 'description', 'url', 'published_at', 'content']
    processed = []

    for article in fetch_articles(company_name):
        full_content = scrape_full_content(article['url'])
        if not full_content:
            continue
        processed.append({
            'company': company_name,
            'title': article['title'],
            'description': article['description'],
            'url': article['url'],
            'published_at': article['publishedAt'],
            'content': full_content,
        })

    # An explicit schema keeps the frame's shape stable when `processed` is empty.
    return pd.DataFrame(processed, columns=columns)

# Collect the scraped articles for the configured company, then reduce each
# publication timestamp to a plain calendar date so rows can be grouped by day.
# NOTE(review): the date is taken in whatever timezone the timestamp carries
# (presumably UTC) — confirm it lines up with exchange-local trading dates.
articles_df = collect_news_data(COMPANY_NAME)
articles_df = articles_df.assign(
    published_at=pd.to_datetime(articles_df['published_at']).dt.date
)

Failed to fetch article at https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_2a6879d4-7515-47ba-a2fa-b07e2fb46f9b: Article `download()` failed with 401 Client Error: Unauthorized for url: https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_2a6879d4-7515-47ba-a2fa-b07e2fb46f9b on URL https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_2a6879d4-7515-47ba-a2fa-b07e2fb46f9b


In [23]:
# Single shared analyzer instance; analyze_sentiment() calls its
# polarity_scores() method for every article.
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Score ``text`` with the shared analyzer and return the result as a Series.

    The returned Series is indexed by ``sentiment_polarity`` (the analyzer's
    ``compound`` score), ``sentiment_neg``, ``sentiment_neu`` and
    ``sentiment_pos``. Anything that is not a ``str`` yields 0.0 for all four
    fields.
    """
    # Output field -> key in the dict returned by polarity_scores().
    field_to_key = {
        'sentiment_polarity': 'compound',
        'sentiment_neg': 'neg',
        'sentiment_neu': 'neu',
        'sentiment_pos': 'pos',
    }
    if not isinstance(text, str):
        return pd.Series({field: 0.0 for field in field_to_key})

    raw = analyzer.polarity_scores(text)
    return pd.Series({field: raw[key] for field, key in field_to_key.items()})

# Score every article body; applying a Series-returning function produces one
# column per sentiment field.
sentiment_scores = articles_df['content'].apply(analyze_sentiment)

# Drop sentiment columns left over from a previous run of this cell so that
# re-executing it replaces them instead of appending duplicate columns.
articles_df = pd.concat(
    [
        articles_df.drop(
            columns=['sentiment_polarity', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos'],
            errors='ignore',
        ),
        sentiment_scores,
    ],
    axis=1,
)

In [None]:
# Sort by published date (descending)
articles_df = articles_df.sort_values(by='published_at', ascending=False)

# Check how many unique days are covered
unique_dates = articles_df['published_at'].nunique()
print(f"  News covers {unique_dates} unique days.")

# Show a breakdown of article count per day
print("\n   Articles per day:")
print(articles_df['published_at'].value_counts().sort_index(ascending=False))

# Save to CSV. The file name is derived from the configured company and window
# so that changing COMPANY_NAME or DAYS cannot write another company's data
# into a file labelled "apple ... 28 days".
news_csv_path = f"{COMPANY_NAME.lower().replace(' ', '_')}_news_last_{DAYS}_days_sorted.csv"
articles_df.to_csv(news_csv_path, index=False)
print(f"   News data saved to '{news_csv_path}'")

In [25]:
def compute_rsi(series, period=14):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Args:
        series: pandas Series of prices.
        period: number of consecutive price changes averaged per RSI value.

    Returns:
        A Series aligned with ``series``. The first ``period`` entries are NaN,
        because ``period`` real price changes are needed and the first element
        has no predecessor to diff against.
    """
    delta = series.diff()
    # clip() keeps the leading NaN produced by diff(). Replacing it with 0.0
    # would let the first window pass as complete while containing only
    # period-1 real changes plus a fabricated "no change" entry.
    gains = delta.clip(lower=0.0)
    losses = (-delta).clip(lower=0.0)
    avg_gain = gains.rolling(window=period).mean()
    avg_loss = losses.rolling(window=period).mean()
    # The small epsilon avoids division by zero when a window has no losses.
    rs = avg_gain / (avg_loss + 1e-9)
    return 100.0 - (100.0 / (1.0 + rs))

def compute_macd(series, fastperiod=12, slowperiod=26, signalperiod=9):
    """Compute the MACD line, its signal line and their difference.

    All three are built from exponentially weighted means with
    ``adjust=False``: the MACD line is the fast EMA minus the slow EMA, the
    signal line is an EMA of the MACD line, and the histogram is MACD minus
    signal.

    Returns:
        Tuple ``(macd_line, signal_line, macd_hist)`` of Series aligned with
        ``series``.
    """
    def _ema(values, span):
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, fastperiod) - _ema(series, slowperiod)
    signal_line = _ema(macd_line, signalperiod)
    return macd_line, signal_line, macd_line - signal_line

def compute_ma(series, window=20):
    """Simple moving average of ``series`` over ``window`` consecutive values.

    Positions that do not yet have a full window are NaN.
    """
    windowed = series.rolling(window=window)
    return windowed.mean()

def get_stock_data(ticker, lookback_days=61):
    """Download closing prices for ``ticker`` and attach technical indicators.

    Args:
        ticker: symbol passed to ``yf.Ticker``.
        lookback_days: number of days requested from ``history`` (sent as
            ``"<lookback_days>d"``). Rows whose indicators are still NaN are
            dropped, so fewer rows than this are returned.

    Returns:
        DataFrame with columns ``Date``, ``Close_Price``, ``RSI``,
        ``MACD_line``, ``MACD_signal``, ``MACD_hist`` and ``MA20``, one row per
        date that has every indicator defined. ``Date`` holds plain
        ``datetime.date`` values.

    Raises:
        ValueError: if the download returns no rows.
    """
    history = yf.Ticker(ticker).history(period=f"{lookback_days}d")
    if history.empty:
        raise ValueError(f"No price history returned for ticker {ticker!r}")

    # rename() returns a fresh frame, so the column assignments below never
    # touch a slice of `history` (avoids SettingWithCopy warnings).
    prices = history[['Close']].rename(columns={'Close': 'Close_Price'})
    close = prices['Close_Price']
    macd_line, macd_signal, macd_hist = compute_macd(close)

    features = prices.assign(
        RSI=compute_rsi(close),
        MACD_line=macd_line,
        MACD_signal=macd_signal,
        MACD_hist=macd_hist,
        MA20=compute_ma(close),
    )
    # Drop the warm-up rows where an indicator is still NaN, then expose the
    # date index as an ordinary column.
    features = features.dropna().reset_index()
    features['Date'] = features['Date'].dt.date
    return features

# Build the price + indicator table (RSI, MACD, MA20) for the configured ticker.
stock_df = get_stock_data(TICKER)

In [None]:
# Keep only the dates that appear in both the stock table and the news table.
# `stock_df` is reused from the cell that built it instead of being downloaded
# a second time, so every cell works from the same price snapshot.
trading_dates = set(stock_df['Date'])
news_dates = set(articles_df['published_at'])

# Find common dates (intersection)
valid_dates = sorted(trading_dates & news_dates)
print(f"   {len(valid_dates)} overlapping dates with both trading and news.")

# Filter stock and news data
filtered_stock_df = stock_df[stock_df['Date'].isin(valid_dates)].copy()
filtered_news_df = articles_df[articles_df['published_at'].isin(valid_dates)].copy()

# Optional: save filtered versions. File names follow the configured company
# so a different COMPANY_NAME cannot overwrite or mislabel the "apple" exports.
company_slug = COMPANY_NAME.lower().replace(' ', '_')
filtered_stock_df.to_csv(f"{company_slug}_stock_filtered_by_news.csv", index=False)
filtered_news_df.to_csv(f"{company_slug}_news_filtered_by_trading.csv", index=False)

print(" Saved filtered stock and news data based on common trading days.")

In [27]:
# Preview a bounded number of rows rather than rendering the whole frame.
stock_df.head(10)

Unnamed: 0,Date,Close_Price,RSI,MACD_line,MACD_signal,MACD_hist,MA20
0,2025-01-24,222.535324,23.877967,-8.432157,-6.877686,-1.554472,239.042178
1,2025-01-27,229.607544,35.613786,-7.909068,-7.083962,-0.825106,237.626733
2,2025-01-28,237.998322,43.722045,-6.739759,-7.015122,0.275363,236.589874
3,2025-01-29,239.097122,47.259079,-5.659174,-6.743932,1.084759,235.779266
4,2025-01-30,237.329056,45.203682,-4.889111,-6.372968,1.483857,235.049567
5,2025-01-31,235.740814,49.132837,-4.356767,-5.969728,1.612961,234.329359
6,2025-02-03,227.759583,44.14299,-4.526718,-5.681126,1.154408,233.538229
7,2025-02-04,232.544327,49.58778,-4.226595,-5.39022,1.163625,233.010809
8,2025-02-05,232.214691,44.996304,-3.969585,-5.106093,1.136508,232.384997
9,2025-02-06,232.963867,55.498898,-3.663223,-4.817519,1.154295,231.93599


In [28]:
sentiment_cols = ['sentiment_polarity', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos']

# Average the article-level scores per publication date.
sentiment_daily = (
    articles_df
    .groupby('published_at')[sentiment_cols]
    .mean()
    .reset_index()
    .rename(columns={'published_at': 'Date'})
)

# Left-join onto the stock rows: every stock date is kept, and news dates with
# no matching stock row are dropped.
merged_df = pd.merge(stock_df, sentiment_daily, on='Date', how='left')

# Stock dates without any article get 0 for all four sentiment fields.
# NOTE(review): this makes "no news" indistinguishable from a genuine 0 score —
# confirm that is intended for downstream modelling.
merged_df[sentiment_cols] = merged_df[sentiment_cols].fillna(0)

# Next row's close serves as the supervised-learning target; the final row has
# no successor, so it is removed by the dropna below.
merged_df['Close_next'] = merged_df['Close_Price'].shift(-1)
merged_df = merged_df.dropna()

print(merged_df.head())

         Date  Close_Price        RSI  MACD_line  MACD_signal  MACD_hist  \
0  2025-01-24   222.535324  23.877967  -8.432157    -6.877686  -1.554472   
1  2025-01-27   229.607544  35.613786  -7.909068    -7.083962  -0.825106   
2  2025-01-28   237.998322  43.722045  -6.739759    -7.015122   0.275363   
3  2025-01-29   239.097122  47.259079  -5.659174    -6.743932   1.084759   
4  2025-01-30   237.329056  45.203682  -4.889111    -6.372968   1.483857   

         MA20  sentiment_polarity  sentiment_neg  sentiment_neu  \
0  239.042178                 0.0            0.0            0.0   
1  237.626733                 0.0            0.0            0.0   
2  236.589874                 0.0            0.0            0.0   
3  235.779266                 0.0            0.0            0.0   
4  235.049567                 0.0            0.0            0.0   

   sentiment_pos  Close_next  
0            0.0  229.607544  
1            0.0  237.998322  
2            0.0  239.097122  
3            0.0

In [None]:
# Name the export after the configured company so a different COMPANY_NAME
# does not end up in a file labelled "apple".
merged_csv_path = f"{COMPANY_NAME.lower().replace(' ', '_')}_test_data_merged.csv"
merged_df.to_csv(merged_csv_path, index=False)
print(f"   {COMPANY_NAME} test data saved to '{merged_csv_path}'")