In [1]:
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import yfinance as yf
from newsapi import NewsApiClient
from newspaper import Article
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# ---------- SETTINGS ----------
# The NewsAPI key is a credential, so it is read from the environment instead
# of being stored in the notebook. Any key that was ever committed in this
# file should be treated as leaked and rotated.
NEWS_API_KEY = os.environ.get('NEWS_API_KEY')
if not NEWS_API_KEY:
    raise RuntimeError(
        "NEWS_API_KEY is not set. Export your NewsAPI key as the NEWS_API_KEY "
        "environment variable before running this notebook."
    )
COMPANY_NAME = 'META Platforms, Facebook.'  # query text handed to the news search
TICKER = 'META'                             # symbol handed to the price download
DAYS = 28                                   # length of the news look-back window, in days
# -------------------------------

# ========== 1. Fetch News ==========
newsapi = NewsApiClient(api_key=NEWS_API_KEY)

# Read the clock once so both ends of the window come from the same instant;
# two separate now() calls could land on different dates around midnight.
_window_end = datetime.now()
to_date = _window_end.strftime('%Y-%m-%d')
from_date = (_window_end - timedelta(days=DAYS)).strftime('%Y-%m-%d')

def fetch_articles(company_name):
    """Search NewsAPI for English-language articles matching ``company_name``.

    The search covers the module-level ``from_date`` .. ``to_date`` range,
    asks for a page size of 100 and sorts by relevancy.

    Returns the ``'articles'`` entry of the response. If anything raises, the
    error is printed and an empty list is returned instead.
    """
    try:
        # Assembled inside the try block so that a failure here is reported
        # exactly like a failed API call.
        query = dict(
            q=company_name,
            from_param=from_date,
            to=to_date,
            language='en',
            page_size=100,
            sort_by='relevancy',
        )
        response = newsapi.get_everything(**query)
        return response['articles']
    except Exception as err:
        print(f"An error occurred: {err}")
        return []

def scrape_full_content(url):
    """Download the page at ``url`` and return the extracted article text.

    Any exception raised while constructing, downloading or parsing the
    article is printed and converted into a ``None`` return value.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        body_text = page.text
        return body_text
    except Exception as err:
        print(f"Failed to fetch article at {url}: {err}")
        return None

def collect_news_data(company_name):
    """Collect articles about ``company_name`` together with their full text.

    Every article returned by ``fetch_articles`` is scraped with
    ``scrape_full_content``; articles whose scrape returns nothing (``None``
    or an empty string) are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per successfully scraped article with the columns
        ``company``, ``title``, ``description``, ``url``, ``published_at``
        and ``content``. The columns are present even when no article could
        be collected, so column look-ups on the result never raise KeyError.
    """
    columns = ['company', 'title', 'description', 'url', 'published_at', 'content']
    processed = []

    for article in fetch_articles(company_name):
        full_content = scrape_full_content(article['url'])
        if not full_content:
            continue
        processed.append({
            'company': company_name,
            'title': article['title'],
            'description': article['description'],
            'url': article['url'],
            'published_at': article['publishedAt'],
            'content': full_content
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(processed, columns=columns)

# Fetch and scrape the articles, then reduce each publication timestamp to
# its calendar date.
articles_df = collect_news_data(company_name=COMPANY_NAME)
published_timestamps = pd.to_datetime(articles_df['published_at'])
articles_df['published_at'] = published_timestamps.dt.date



Failed to fetch article at https://www.androidpolice.com/facebook-revamps-friends-tab/: Article `download()` failed with ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) on URL https://www.androidpolice.com/facebook-revamps-friends-tab/
Failed to fetch article at https://www.androidpolice.com/new-protections-instagram-teen-accounts-facebook-messenger/: Article `download()` failed with ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) on URL https://www.androidpolice.com/new-protections-instagram-teen-accounts-facebook-messenger/
Failed to fetch article at https://www.androidpolice.com/im-not-buying-meta-glasses/: Article `download()` failed with ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) on URL https://www.androidpolice.com/im-not-buying-meta-glasses/
Failed to fetch article at https://www.androidpolice.com/what-is-a-super-app/: Article `download()` f



Failed to fetch article at https://www.forbes.com/sites/saibala/2025/03/31/meta-ai-has-one-big-advantage-over-chatgpt-and-others/: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/saibala/2025/03/31/meta-ai-has-one-big-advantage-over-chatgpt-and-others/ on URL https://www.forbes.com/sites/saibala/2025/03/31/meta-ai-has-one-big-advantage-over-chatgpt-and-others/
Failed to fetch article at https://www.techdirt.com/2025/04/02/trumps-buddies-at-andreesen-horowitz-want-to-help-buy-tiktok-turn-it-into-a-right-wing-safe-space/: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.techdirt.com/2025/04/02/trumps-buddies-at-andreesen-horowitz-want-to-help-buy-tiktok-turn-it-into-a-right-wing-safe-space/ on URL https://www.techdirt.com/2025/04/02/trumps-buddies-at-andreesen-horowitz-want-to-help-buy-tiktok-turn-it-into-a-right-wing-safe-space/


In [2]:
# Single VADER analyzer instance, created once and used by analyze_sentiment.
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Score ``text`` with the module-level VADER ``analyzer``.

    Returns a ``pandas.Series`` with the entries ``sentiment_polarity``
    (VADER's ``compound`` score), ``sentiment_neg``, ``sentiment_neu`` and
    ``sentiment_pos``. Anything that is not a ``str`` yields 0.0 for all four.
    """
    # VADER score key -> name of the output entry, in output order.
    column_for_score = {
        'compound': 'sentiment_polarity',
        'neg': 'sentiment_neg',
        'neu': 'sentiment_neu',
        'pos': 'sentiment_pos',
    }

    if not isinstance(text, str):
        return pd.Series({column: 0.0 for column in column_for_score.values()})

    vader_scores = analyzer.polarity_scores(text)
    return pd.Series({
        column: vader_scores[score_key]
        for score_key, column in column_for_score.items()
    })

# Score every article body: one row per article, one column per score.
sentiment_scores = articles_df['content'].apply(analyze_sentiment)

# Remove sentiment columns left behind by an earlier run of this cell before
# attaching the fresh ones; without this, re-running the cell would leave
# articles_df with duplicated sentiment columns.
sentiment_columns = ['sentiment_polarity', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos']
articles_df = pd.concat(
    [articles_df.drop(columns=sentiment_columns, errors='ignore'), sentiment_scores],
    axis=1,
)

In [3]:
# Sort by published date (descending)
articles_df = articles_df.sort_values(by='published_at', ascending=False)

# Check how many unique days are covered
unique_dates = articles_df['published_at'].nunique()
print(f" News covers {unique_dates} unique days.")

# Show a breakdown of article count per day
print("\n Articles per day:")
print(articles_df['published_at'].value_counts().sort_index(ascending=False))

# Save to CSV. The file name is built from the TICKER and DAYS settings so a
# run for another company or window does not get written under a META/28 name.
news_csv_path = f"{TICKER}_news_last_{DAYS}_days_sorted.csv"
articles_df.to_csv(news_csv_path, index=False)
print(f" News data saved to '{news_csv_path}'")

 News covers 23 unique days.

 Articles per day:
published_at
2025-04-23     8
2025-04-22     5
2025-04-21     2
2025-04-19     1
2025-04-18     2
2025-04-17     5
2025-04-16     5
2025-04-15    10
2025-04-14     9
2025-04-13     2
2025-04-11     1
2025-04-10     3
2025-04-09     4
2025-04-08     9
2025-04-07     3
2025-04-06     1
2025-04-04     3
2025-04-03     2
2025-04-02     2
2025-04-01     4
2025-03-31     2
2025-03-28     3
2025-03-27     2
Name: count, dtype: int64
 News data saved to 'META_news_last_28_days_sorted.csv'


In [4]:
def compute_rsi(series, period=14):
    """Relative Strength Index of ``series`` based on simple rolling means.

    Upward and downward changes are each averaged over ``period``
    observations with a plain rolling mean, and their ratio is mapped onto
    the 0-100 scale. Entries before the rolling window is full are NaN.
    """
    change = series.diff()
    is_up = change > 0
    is_down = change < 0

    # Keep moves in one direction only; every other entry (including the
    # leading NaN produced by diff) becomes 0.0.
    up_moves = change.where(is_up, 0.0)
    down_moves = -change.where(is_down, 0.0)

    mean_up = up_moves.rolling(window=period).mean()
    mean_down = down_moves.rolling(window=period).mean()

    # The small constant keeps the division finite when the window contains
    # no downward move.
    relative_strength = mean_up / (mean_down + 1e-9)
    return 100.0 - (100.0 / (1.0 + relative_strength))

def compute_macd(series, fastperiod=12, slowperiod=26, signalperiod=9):
    """Return the MACD line, its signal line and the histogram for ``series``.

    The MACD line is the fast exponential moving average minus the slow one,
    the signal line is an exponential moving average of the MACD line, and
    the histogram is their difference. All averages use ``adjust=False``.

    Returns a tuple ``(macd_line, signal_line, macd_hist)``.
    """
    def _ema(values, span):
        # Recursive (adjust=False) exponential moving average.
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, fastperiod) - _ema(series, slowperiod)
    signal_line = _ema(macd_line, signalperiod)
    return macd_line, signal_line, macd_line - signal_line

def compute_ma(series, window=20):
    """Simple moving average of ``series`` over ``window`` observations.

    Positions where the window is not yet full are NaN.
    """
    rolling_window = series.rolling(window=window)
    return rolling_window.mean()

def get_stock_data(ticker, lookback_days=61):
    """Download recent closing prices for ``ticker`` and add technical indicators.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yfinance.Ticker``.
    lookback_days : int, default 61
        Amount of history requested, sent to yfinance as the period string
        ``"<lookback_days>d"``. The default matches the previously
        hard-coded ``60 + 1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``Close_Price``, ``RSI``, ``MACD_line``,
        ``MACD_signal``, ``MACD_hist`` and ``MA20``. Rows in which any of
        these is NaN are removed and the index is reset.
    """
    history = yf.Ticker(ticker).history(period=f"{lookback_days}d")

    # Work on an explicit, renamed copy so the column assignments below never
    # write into a slice of the downloaded frame (no SettingWithCopyWarning,
    # no in-place mutation).
    prices = history[['Close']].rename(columns={"Close": "Close_Price"}).copy()

    prices['RSI'] = compute_rsi(prices['Close_Price'])
    prices['MACD_line'], prices['MACD_signal'], prices['MACD_hist'] = compute_macd(prices['Close_Price'])
    prices['MA20'] = compute_ma(prices['Close_Price'])

    # The rolling indicators are NaN until their windows are full; drop those
    # rows and turn the date index into a regular 'Date' column.
    prices = prices.dropna().reset_index()

    # Keep only the calendar date of each row.
    prices['Date'] = prices['Date'].dt.date
    return prices

# Build the price-plus-indicators table for the configured ticker.
stock_df = get_stock_data(ticker=TICKER)

In [5]:
# stock_df was already built with get_stock_data(TICKER) in the previous
# cell; reuse it instead of downloading the same price history a second time.

# Get unique trading dates from stock_df
trading_dates = set(stock_df['Date'])

# Get unique news dates from articles_df
news_dates = set(articles_df['published_at'])

# Find common dates (intersection)
valid_dates = sorted(trading_dates.intersection(news_dates))
print(f" {len(valid_dates)} overlapping dates with both trading and news.")

# Filter stock and news data
filtered_stock_df = stock_df[stock_df['Date'].isin(valid_dates)].copy()
filtered_news_df = articles_df[articles_df['published_at'].isin(valid_dates)].copy()

# Optional: Save filtered versions. File names follow the TICKER setting so
# output for another symbol is not written under a META name.
stock_csv_path = f"{TICKER}_stock_filtered_by_news.csv"
news_filtered_csv_path = f"{TICKER}_news_filtered_by_trading.csv"
filtered_stock_df.to_csv(stock_csv_path, index=False)
filtered_news_df.to_csv(news_filtered_csv_path, index=False)

print(" Saved filtered stock and news data based on common trading days.")

 19 overlapping dates with both trading and news.
 Saved filtered stock and news data based on common trading days.


In [6]:
# Display the stock frame (closing price plus RSI, MACD and MA20 columns) for inspection.
stock_df

Unnamed: 0,Date,Close_Price,RSI,MACD_line,MACD_signal,MACD_hist,MA20
0,2025-02-25,656.915588,29.090902,-0.468426,6.658512,-7.126937,699.977258
1,2025-02-26,673.101196,37.744758,-1.994835,4.927842,-6.922677,699.945789
2,2025-02-27,657.654907,30.167526,-4.400189,3.062236,-7.462425,699.0341
3,2025-02-28,667.606079,33.797409,-5.440756,1.361638,-6.802393,698.094937
4,2025-03-03,654.467712,29.652112,-7.242086,-0.359107,-6.882979,696.389954
5,2025-03-04,639.431091,25.943588,-9.770357,-2.241357,-7.529,693.519504
6,2025-03-05,655.886475,30.506377,-10.327172,-3.85852,-6.468652,691.135623
7,2025-03-06,627.371826,25.105148,-12.920408,-5.670898,-7.24951,687.292041
8,2025-03-07,625.103821,21.72009,-14.985828,-7.533884,-7.451944,682.979376
9,2025-03-10,597.458435,20.934017,-18.638585,-9.754824,-8.883761,677.158054


In [7]:
# Names of the per-article sentiment scores that are averaged per day.
sentiment_cols = ['sentiment_polarity', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos']

# One row per publication date holding the mean of each sentiment score.
sentiment_daily = (
    articles_df
    .groupby('published_at')[sentiment_cols]
    .mean()
    .reset_index()
    .rename(columns={'published_at': 'Date'})
)

# Left join: every row of stock_df is kept, whether or not news exists for it.
merged_df = stock_df.merge(sentiment_daily, on='Date', how='left')

# Rows without any article get 0 in every sentiment column.
merged_df[sentiment_cols] = merged_df[sentiment_cols].fillna(0)

# Supervised-learning target: the close of the following row. The last row
# has no successor and is removed by the dropna below.
merged_df['Close_next'] = merged_df['Close_Price'].shift(-1)
merged_df = merged_df.dropna()

print(merged_df.head())

         Date  Close_Price        RSI  MACD_line  MACD_signal  MACD_hist  \
0  2025-02-25   656.915588  29.090902  -0.468426     6.658512  -7.126937   
1  2025-02-26   673.101196  37.744758  -1.994835     4.927842  -6.922677   
2  2025-02-27   657.654907  30.167526  -4.400189     3.062236  -7.462425   
3  2025-02-28   667.606079  33.797409  -5.440756     1.361638  -6.802393   
4  2025-03-03   654.467712  29.652112  -7.242086    -0.359107  -6.882979   

         MA20  sentiment_polarity  sentiment_neg  sentiment_neu  \
0  699.977258                 0.0            0.0            0.0   
1  699.945789                 0.0            0.0            0.0   
2  699.034100                 0.0            0.0            0.0   
3  698.094937                 0.0            0.0            0.0   
4  696.389954                 0.0            0.0            0.0   

   sentiment_pos  Close_next  
0            0.0  673.101196  
1            0.0  657.654907  
2            0.0  667.606079  
3            0.0

In [8]:
# Persist the merged feature table. The file name and message follow the
# TICKER setting so a run for another symbol is not saved or reported as META.
merged_csv_path = f"{TICKER}_test_data_merged.csv"
merged_df.to_csv(merged_csv_path, index=False)
print(f" {TICKER} test data saved to '{merged_csv_path}'")

 META test data saved to 'META_test_data_merged.csv'
