In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Fetch the VADER lexicon; the analyzer below is constructed right after it
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Read the financial news headlines and parse the date column as datetimes
news_df = pd.read_csv("financial_news.csv")  # Replace with actual dataset path
news_df['date'] = pd.to_datetime(news_df['date'])

# Score every headline with VADER and keep only the 'compound' score
headline_text = news_df['headline'].astype(str)
compound_scores = headline_text.map(
    lambda text: sia.polarity_scores(text)['compound']
)
news_df['sentiment'] = compound_scores

# Fetch daily stock price data (e.g., AAPL)
stock_df = yf.download("AAPL", start="2020-01-01", end="2023-01-01")

# Some yfinance releases return (field, ticker) MultiIndex columns even for a
# single ticker; flatten to plain field names so 'Open', 'Close', ... resolve
# and the merge below does not mix column levels.
if isinstance(stock_df.columns, pd.MultiIndex):
    stock_df.columns = stock_df.columns.get_level_values(0)

stock_df = stock_df.reset_index()
stock_df['Date'] = pd.to_datetime(stock_df['Date'])

# Collapse the news to ONE sentiment value per calendar day before merging.
# Merging raw headlines would repeat each trading day once per headline,
# which duplicates price rows and over-weights busy news days.
# normalize() drops any time-of-day so timestamps match the midnight trading
# dates. NOTE: merged_df therefore no longer carries per-headline text columns.
daily_sentiment = (
    news_df.assign(date=news_df['date'].dt.normalize())
    .groupby('date', as_index=False)['sentiment']
    .mean()
)

# Merge daily sentiment onto the price rows (one row per trading day)
merged_df = pd.merge(
    stock_df, daily_sentiment, left_on="Date", right_on="date", how="left"
)

# Days without any news get a neutral score; price columns are left as-is
# rather than being zero-filled.
merged_df['sentiment'] = merged_df['sentiment'].fillna(0)

# Feature selection: same-day price/volume fields plus the news sentiment
feature_columns = ['Open', 'High', 'Low', 'Volume', 'sentiment']
X = merged_df[feature_columns]
y = merged_df['Close']

# Chronological train-test split: the first 80% of rows train the model and
# the last 20% evaluate it. A shuffled split on time-ordered prices lets the
# model see days from the "future" of the test period and inflates the
# metrics. Assumes merged_df rows are in ascending date order -- TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Train XGBoost model
model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out (most recent) rows
y_pred = model.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, R²: {r2:.4f}")
