#Dataset creation

## Installation of library for data pull

In [7]:
pip install duckdb pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## Data pull

In [8]:
import duckdb
import json
from datetime import datetime
import numpy as np # Import numpy

# -----------------------------
# CONFIG
# -----------------------------
TICKER = "CAT"        # <<< CHANGE THIS
YEARS_BACK = 5
OUTPUT_FILE = f"{TICKER}_yahoo_finance_news_5yr.json"

PARQUET_URL = (
    "https://huggingface.co/datasets/"
    "bwzheng2010/yahoo-finance-data/resolve/main/data/stock_news.parquet"
)

# -----------------------------
# DATE CUTOFF
# -----------------------------
cutoff_year = datetime.now().year - YEARS_BACK
cutoff_date = f"{cutoff_year}-01-01"

# -----------------------------
# QUERY
# -----------------------------
print(f"Fetching news for ticker: {TICKER}")

con = duckdb.connect()

query = f"""
SELECT
    report_date,
    title,
    publisher,
    related_symbols,
    type,
    link
FROM read_parquet('{PARQUET_URL}')
WHERE
    report_date::DATE >= DATE '{cutoff_date}'
    AND list_contains(string_split(related_symbols::VARCHAR, ','), '{TICKER}')
ORDER BY report_date DESC
"""

df = con.execute(query).fetchdf()

# -----------------------------
# BUILD FEED
# -----------------------------
feed = []

for _, row in df.iterrows():
    symbols_data = row["related_symbols"]
    # Convert numpy arrays to lists for JSON serialization
    if isinstance(symbols_data, np.ndarray):
        symbols_data = symbols_data.tolist()
    # Ensure it's a list even if it was None or another non-iterable type
    elif symbols_data is None:
        symbols_data = []
    # If it's already a list, it remains a list

    feed.append({
        "date": str(row["report_date"]),
        "ticker": TICKER,
        "title": row["title"],
        "publisher": row["publisher"],
        "symbols": symbols_data,
        "type": row["type"],
        "url": row["link"]
    })

# -----------------------------
# SAVE
# -----------------------------
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(feed, f, indent=2, ensure_ascii=False)

print(f"SUCCESS: {len(feed)} articles saved → {OUTPUT_FILE}")


Fetching news for ticker: CAT


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

SUCCESS: 495 articles saved → CAT_yahoo_finance_news_5yr.json


## Conversion of data into xml/feeds

In [9]:
"""
Convert Yahoo Finance JSON news to Yahoo-style RSS feed
"""

import json
from datetime import datetime
from email.utils import format_datetime
import xml.etree.ElementTree as ET

# -----------------------------
# CONFIG
# -----------------------------
TICKER = "CAT"
INPUT_JSON = f"{TICKER}_yahoo_finance_news_5yr.json"

OUTPUT_RSS = f"{TICKER}.rss.xml"

CHANNEL_TITLE = f"{TICKER} News – Yahoo Finance"
CHANNEL_LINK = f"https://finance.yahoo.com/quote/{TICKER}"
CHANNEL_DESCRIPTION = f"Latest news headlines for {TICKER}"
LANGUAGE = "en-us"

# -----------------------------
# LOAD JSON
# -----------------------------
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    news_items = json.load(f)

# -----------------------------
# BUILD RSS
# -----------------------------
rss = ET.Element("rss", version="2.0")
channel = ET.SubElement(rss, "channel")

ET.SubElement(channel, "title").text = CHANNEL_TITLE
ET.SubElement(channel, "link").text = CHANNEL_LINK
ET.SubElement(channel, "description").text = CHANNEL_DESCRIPTION
ET.SubElement(channel, "language").text = LANGUAGE
ET.SubElement(channel, "lastBuildDate").text = format_datetime(datetime.utcnow())

# -----------------------------
# ADD ITEMS
# -----------------------------
for item in news_items:
    rss_item = ET.SubElement(channel, "item")

    ET.SubElement(rss_item, "title").text = item["title"]
    ET.SubElement(rss_item, "link").text = item["url"]

    guid = ET.SubElement(rss_item, "guid", isPermaLink="true")
    guid.text = item["url"]

    # Convert YYYY-MM-DD → RFC 2822
    pub_date = datetime.strptime(item["date"], "%Y-%m-%d")
    ET.SubElement(rss_item, "pubDate").text = format_datetime(pub_date)

    source = ET.SubElement(rss_item, "source")
    source.text = item["publisher"]

# -----------------------------
# WRITE FILE
# -----------------------------
tree = ET.ElementTree(rss)
tree.write(OUTPUT_RSS, encoding="utf-8", xml_declaration=True)

print(f"SUCCESS: RSS feed written to {OUTPUT_RSS}")


SUCCESS: RSS feed written to CAT.rss.xml


  ET.SubElement(channel, "lastBuildDate").text = format_datetime(datetime.utcnow())


## Data cleaning and formatting

In [10]:
%%writefile complete_data_collection.py

import pandas as pd
import yfinance as yf
import os
from bs4 import BeautifulSoup
import time
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

ticker = "CAT"

def collect_news_data(ticker='CAT', max_items=5000):
    """
    Reads news data from a local RSS feed file (.rss.xml).
    """
    print(f"Collecting news data from local file '{ticker}.rss.xml'...")

    rss_file_name = f"{ticker}.rss.xml" # Assuming the user uploaded this file

    try:
        # Check if the file exists
        if not os.path.exists(rss_file_name):
            print(f"Error: RSS file '{rss_file_name}' not found in the current directory. Please upload it.")
            return pd.DataFrame(columns=['Source', 'Headline', 'PubDate'])

        # Read the content of the local RSS file
        with open(rss_file_name, 'r', encoding='utf-8') as f:
            rss_content = f.read()

        soup = BeautifulSoup(rss_content, "xml") # Parse the content
        items = soup.find_all("item")

        data = []
        for i, item in enumerate(items[:max_items], 1):
            try:
                pub_date = item.pubDate.text if item.pubDate else None
                headline = item.title.text if item.title else None
                source = item.link.text if item.link else None

                data.append({
                    'Source': source,
                    'Headline': headline,
                    'PubDate': pub_date
                })
            except Exception as e:
                print(f"Error parsing item {i}: {e}")
                continue

        df = pd.DataFrame(data)
        print(f"Successfully collected {len(df)} news items from '{rss_file_name}'")
        return df

    except Exception as e:
        print(f"Error collecting news data from local file '{rss_file_name}': {e}")
        return pd.DataFrame(columns=['Source', 'Headline', 'PubDate'])


def clean_news_data(df):
    """
    Clean and process news data
    """
    print("Cleaning news data...")

    if df.empty:
        print("Warning: Empty dataframe provided")
        return df

    # Parse publication dates
    df['PubDate_Clean'] = df['PubDate'].str.split(' ').str[1:4].str.join(" ")
    df['PubDate'] = pd.to_datetime(df['PubDate_Clean'], format="%d %b %Y", errors='coerce')
    df['Date'] = df['PubDate'].dt.date

    # Calculate headline length
    df['Headline Length'] = df['Headline'].str.len()

    # Drop temporary column
    df.drop('PubDate_Clean', axis=1, inplace=True, errors='ignore')

    print(f"Cleaned data: {len(df)} rows with dates from {df['Date'].min()} to {df['Date'].max()}")
    return df


def collect_stock_data(ticker='CAT', period='5y'):
    """
    Collect stock price data using yfinance
    """
    print(f"Collecting stock data for {ticker}...")

    try:
        stock = yf.Ticker(ticker)
        df = stock.history(period=period)

        if df.empty:
            print("Warning: No stock data retrieved")
            return pd.DataFrame()

        # Select relevant columns
        df_stock = df[["Open", "High", "Low", "Close", "Volume"]].copy()

        # Add date column
        df_stock.index = pd.to_datetime(df_stock.index)
        df_stock['Date'] = df_stock.index.date
        df_stock.index = df_stock.index.date

        print(f"Successfully collected {len(df_stock)} days of stock data")
        return df_stock

    except Exception as e:
        print(f"Error collecting stock data: {e}")
        return pd.DataFrame()


def merge_datasets(stock_df, news_df):
    """
    Merge stock and news data on date
    """
    print("Merging datasets...")

    if stock_df.empty or news_df.empty:
        print("Warning: One or both dataframes are empty")
        if not stock_df.empty:
            return stock_df
        return pd.DataFrame()

    # Perform left merge to keep all trading days
    merged_df = stock_df.merge(news_df, on='Date', how='left')

    print(f"Merged dataset: {len(merged_df)} rows")
    print(f"News coverage: {merged_df['Headline'].notna().sum()} days have news")

    return merged_df


def save_datasets(news_raw, news_clean, stock_data, merged_data):
    """
    Save all datasets to CSV files
    """
    print("\nSaving datasets...")

    try:
        if not news_raw.empty:
            news_raw.to_csv('News_raw.csv', index=False)
            print("✓ Saved News_raw.csv")

        if not news_clean.empty:
            news_clean.to_csv('news_cleaned.csv', index=False)
            print("✓ Saved news_cleaned.csv")

        if not stock_data.empty:
            stock_data.to_csv('stock_data.csv')
            print("✓ Saved stock_data.csv")

        if not merged_data.empty:
            merged_data.to_csv('merged_medsem_data.csv', index=False)
            print("✓ Saved merged_medsem_data.csv")

    except Exception as e:
        print(f"Error saving files: {e}")


def generate_summary_statistics(merged_df):
    """
    Generate summary statistics for the report
    """
    print("\n" + "="*60)
    print("DATASET SUMMARY STATISTICS")
    print("="*60)

    if merged_df.empty:
        print("No data to summarize")
        return

    print(f"\nDate Range: {merged_df['Date'].min()} to {merged_df['Date'].max()}")
    print(f"Total Trading Days: {len(merged_df)}")
    print(f"Days with News: {merged_df['Headline'].notna().sum()}")
    print(f"Days without News: {merged_df['Headline'].isna().sum()}")

    print(f"\nStock Price Statistics:")
    print(f"  Opening Price Range: ${merged_df['Open'].min():.2f} - ${merged_df['Open'].max():.2f}")
    print(f"  Closing Price Range: ${merged_df['Close'].min():.2f} - ${merged_df['Close'].max():.2f}")
    print(f"  Average Daily Volume: {merged_df['Volume'].mean():,.0f}")

    if 'Headline Length' in merged_df.columns:
        print(f"\nNews Statistics:")
        print(f"  Average Headline Length: {merged_df['Headline Length'].mean():.0f} characters")
        print(f"  Shortest Headline: {merged_df['Headline Length'].min():.0f} characters")
        print(f"  Longest Headline: {merged_df['Headline Length'].max():.0f} characters")

    print("\n" + "="*60)


def main():
    """
    Main execution function
    """
    print("="*60)
    print("WiDS 2025 - Stock Market & News Sentiment Analysis")
    print("Data Collection Script")
    print("="*60 + "\n")

    # Configuration
    TICKER = 'CAT'
    PERIOD = '5y'  # Can change to '3mo', '6mo', etc.

    # Step 1: Collect news data
    news_raw = collect_news_data(ticker=TICKER, max_items=5000)

    # Step 2: Clean news data
    news_clean = clean_news_data(news_raw.copy()) if not news_raw.empty else pd.DataFrame()

    # Step 3: Collect stock data
    stock_data = collect_stock_data(ticker=TICKER, period=PERIOD) # Corrected variable name

    # Step 4: Merge datasets
    merged_data = merge_datasets(stock_data, news_clean)

    # Step 5: Save all datasets
    save_datasets(news_raw, news_clean, stock_data, merged_data)

    # Step 6: Generate summary
    generate_summary_statistics(merged_data)

    print("\n✓ Data collection complete!")
    print("\nGenerated files:")
    print("  - News_raw.csv")
    print("  - news_cleaned.csv")
    print("  - stock_data.csv")
    print("  - merged_medsem_data.csv")


if __name__ == "__main__":
    main()



Writing complete_data_collection.py


# ML models and features

In [11]:
%%writefile week4_sentiment_features.py

"""
Week 4: Sentiment Analysis & Feature Engineering
WiDS 2025 Project - Stock Volatility Prediction

This script:
1. Performs sentiment analysis on news headlines
2. Engineers features from sentiment scores
3. Prepares data for machine learning models
"""

import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
warnings.filterwarnings('ignore')

class SentimentAnalyzer:
    """Analyze sentiment of financial news headlines"""

    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()

    def get_sentiment_scores(self, text):
        """
        Get VADER sentiment scores for text
        Returns: compound, positive, negative, neutral scores
        """
        if pd.isna(text):
            return 0.0, 0.0, 0.0, 0.0

        scores = self.analyzer.polarity_scores(str(text))
        return (
            scores['compound'],
            scores['pos'],
            scores['neg'],
            scores['neu']
        )

    def classify_sentiment(self, compound_score):
        """
        Classify sentiment as Positive, Negative, or Neutral
        Based on compound score thresholds
        """
        if compound_score >= 0.05:
            return 'Positive'
        elif compound_score <= -0.05:
            return 'Negative'
        else:
            return 'Neutral'


class FeatureEngineer:
    """Engineer features for stock prediction"""

    @staticmethod
    def calculate_price_change(df):
        """Calculate daily price change and direction"""
        df['Price_Change'] = df['Close'] - df['Open']
        df['Price_Change_Pct'] = (df['Price_Change'] / df['Open']) * 100
        df['Price_Direction'] = df['Price_Change'].apply(
            lambda x: 1 if x > 0 else 0
        )
        return df

    @staticmethod
    def calculate_volatility(df, window=5):
        """Calculate rolling volatility"""
        df['Volatility'] = df['Close'].pct_change().rolling(window=window).std()
        return df

    @staticmethod
    def calculate_momentum(df):
        """Calculate price momentum indicators"""
        df['Momentum_1d'] = df['Close'].pct_change(1)
        df['Momentum_3d'] = df['Close'].pct_change(3)
        df['Momentum_5d'] = df['Close'].pct_change(5)
        return df

    @staticmethod
    def calculate_ma(df, windows=[5, 10]):
        """Calculate moving averages"""
        for window in windows:
            df[f'MA_{window}'] = df['Close'].rolling(window=window).mean()
        return df

    @staticmethod
    def aggregate_daily_sentiment(df):
        """
        Aggregate sentiment scores for days with multiple news
        """
        sentiment_agg = df.groupby('Date').agg({
            'Sentiment_Compound': ['mean', 'std', 'count'],
            'Sentiment_Positive': 'mean',
            'Sentiment_Negative': 'mean',
            'Sentiment_Neutral': 'mean'
        })

        # Flatten column names
        sentiment_agg.columns = [
            'Sentiment_Mean', 'Sentiment_Std', 'News_Count',
            'Positive_Mean', 'Negative_Mean', 'Neutral_Mean'
        ]

        # Fill std with 0 for days with single news
        sentiment_agg['Sentiment_Std'] = sentiment_agg['Sentiment_Std'].fillna(0)

        return sentiment_agg


def add_sentiment_features(news_df):
    """
    Add sentiment analysis features to news dataframe

    Args:
        news_df: DataFrame with 'Headline' column

    Returns:
        DataFrame with sentiment features added
    """
    print("Performing sentiment analysis...")

    analyzer = SentimentAnalyzer()

    # Apply sentiment analysis to each headline
    sentiment_results = news_df['Headline'].apply(
        lambda x: analyzer.get_sentiment_scores(x)
    )

    # Unpack results into separate columns
    news_df['Sentiment_Compound'] = sentiment_results.apply(lambda x: x[0])
    news_df['Sentiment_Positive'] = sentiment_results.apply(lambda x: x[1])
    news_df['Sentiment_Negative'] = sentiment_results.apply(lambda x: x[2])
    news_df['Sentiment_Neutral'] = sentiment_results.apply(lambda x: x[3])

    # Add sentiment classification
    news_df['Sentiment_Label'] = news_df['Sentiment_Compound'].apply(
        analyzer.classify_sentiment
    )

    print(f"✓ Sentiment analysis complete for {len(news_df)} headlines")

    # Print sentiment distribution
    print("\nSentiment Distribution:")
    print(news_df['Sentiment_Label'].value_counts())
    print(f"\nAverage Sentiment Score: {news_df['Sentiment_Compound'].mean():.3f}")

    return news_df


def engineer_stock_features(stock_df):
    """
    Add technical indicators and features to stock data

    Args:
        stock_df: DataFrame with OHLCV data

    Returns:
        DataFrame with engineered features
    """
    print("\nEngineering stock features...")

    engineer = FeatureEngineer()

    # Price-based features
    stock_df = engineer.calculate_price_change(stock_df)
    stock_df = engineer.calculate_volatility(stock_df)
    stock_df = engineer.calculate_momentum(stock_df)
    stock_df = engineer.calculate_ma(stock_df, windows=[5, 10])

    # Volume features
    stock_df['Volume_MA5'] = stock_df['Volume'].rolling(window=5).mean()
    stock_df['Volume_Change'] = stock_df['Volume'].pct_change()

    print(f"✓ Engineered {len([c for c in stock_df.columns if c not in ['Open', 'High', 'Low', 'Close', 'Volume', 'Date']])} new features")

    return stock_df


def create_ml_dataset(stock_df, news_df):
    """
    Merge stock and sentiment data, create features for ML

    Args:
        stock_df: DataFrame with stock features
        news_df: DataFrame with sentiment features

    Returns:
        DataFrame ready for machine learning
    """
    print("\nCreating ML-ready dataset...")

    # Ensure Date columns are datetime
    if 'Date' not in stock_df.columns and stock_df.index.name == 'Date':
        stock_df = stock_df.reset_index()
    if 'Date' in news_df.columns:
        news_df['Date'] = pd.to_datetime(news_df['Date'])
    stock_df['Date'] = pd.to_datetime(stock_df['Date'])

    # Aggregate sentiment by date
    engineer = FeatureEngineer()
    sentiment_agg = engineer.aggregate_daily_sentiment(news_df)

    # Merge with stock data
    ml_df = stock_df.merge(sentiment_agg, left_on='Date', right_index=True, how='left')

    # Fill missing sentiment values (days without news)
    sentiment_cols = ['Sentiment_Mean', 'Sentiment_Std', 'News_Count',
                     'Positive_Mean', 'Negative_Mean', 'Neutral_Mean']
    ml_df[sentiment_cols] = ml_df[sentiment_cols].fillna(0)

    # Create lagged features (previous day sentiment)
    ml_df['Sentiment_Lag1'] = ml_df['Sentiment_Mean'].shift(1)
    ml_df['News_Count_Lag1'] = ml_df['News_Count'].shift(1)

    # Create target variable (next day price direction)
    ml_df['Target_Next_Day'] = ml_df['Price_Direction'].shift(-1)

    # Drop rows with NaN in target or critical features
    ml_df = ml_df.dropna(subset=['Target_Next_Day', 'Volatility', 'MA_5'])

    print(f"✓ Created dataset with {len(ml_df)} samples and {len(ml_df.columns)} features")
    print(f"✓ Target distribution: {ml_df['Target_Next_Day'].value_counts().to_dict()}")

    return ml_df


def save_processed_data(news_df, stock_df, ml_df):
    """Save all processed datasets"""
    print("\nSaving processed datasets...")

    news_df.to_csv('news_with_sentiment.csv', index=False)
    print("✓ Saved news_with_sentiment.csv")

    stock_df.to_csv('stock_with_features.csv', index=False)
    print("✓ Saved stock_with_features.csv")

    ml_df.to_csv('ml_ready_dataset.csv', index=False)
    print("✓ Saved ml_ready_dataset.csv")


def generate_feature_summary(ml_df):
    """Generate summary statistics of features"""
    print("\n" + "="*60)
    print("FEATURE ENGINEERING SUMMARY")
    print("="*60)

    print(f"\nDataset Shape: {ml_df.shape}")
    print(f"Date Range: {ml_df['Date'].min()} to {ml_df['Date'].max()}")

    print("\nPrice Statistics:")
    print(f"  Average Daily Return: {ml_df['Price_Change_Pct'].mean():.2f}%")
    print(f"  Volatility (std): {ml_df['Volatility'].mean():.4f}")
    print(f"  Up Days: {(ml_df['Price_Direction'] == 1).sum()}")
    print(f"  Down Days: {(ml_df['Price_Direction'] == 0).sum()}")

    print("\nSentiment Statistics:")
    print(f"  Average Sentiment: {ml_df['Sentiment_Mean'].mean():.3f}")
    print(f"  Days with News: {(ml_df['News_Count'] > 0).sum()}")
    print(f"  Average News per Day: {ml_df['News_Count'].mean():.1f}")

    print("\nFeature Correlation with Target:")
    corr_cols = ['Sentiment_Mean', 'Price_Change_Pct', 'Volatility',
                 'Momentum_1d', 'Volume_Change']
    correlations = ml_df[corr_cols + ['Target_Next_Day']].corr()['Target_Next_Day'].drop('Target_Next_Day')
    for feature, corr in correlations.items():
        print(f"  {feature}: {corr:.3f}")

    print("\n" + "="*60)


def main():
    """Main execution function"""
    print("="*60)
    print("Week 4: Sentiment Analysis & Feature Engineering")
    print("="*60 + "\n")

    # Load data
    try:
        news_df = pd.read_csv('news_cleaned.csv')
        stock_df = pd.read_csv('stock_data.csv')
        print(f"✓ Loaded {len(news_df)} news items")
        print(f"✓ Loaded {len(stock_df)} trading days")
    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Please run the data collection script first!")
        return

    # Step 1: Add sentiment features
    news_df = add_sentiment_features(news_df)

    # Step 2: Engineer stock features
    stock_df = engineer_stock_features(stock_df)

    # Step 3: Create ML dataset
    ml_df = create_ml_dataset(stock_df, news_df)

    # Step 4: Generate summary
    generate_feature_summary(ml_df)

    # Step 5: Save all processed data
    save_processed_data(news_df, stock_df, ml_df)

    print("\n✓ Week 4 Feature Engineering Complete!")
    print("\nNext step: Build prediction models using 'ml_ready_dataset.csv'")


if __name__ == "__main__":
    # Install vader if not present
    try:
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    except ImportError:
        print("Installing vaderSentiment...")
        import subprocess
        import sys
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'vaderSentiment'])
        print("✓ Installed vaderSentiment")

    main()


Writing week4_sentiment_features.py


In [12]:
%%writefile week4_ml_modeling.py

"""
Week 4: Machine Learning Models for Stock Prediction
WiDS 2025 Project

This script builds and evaluates multiple ML models:
- Logistic Regression
- Random Forest
- XGBoost
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report,
                             roc_auc_score, roc_curve)
import warnings
warnings.filterwarnings('ignore')

# Try to import XGBoost, install if not available
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("Note: XGBoost not installed. Will use Logistic Regression and Random Forest only.")


class StockPredictionModel:
    """Base class for stock prediction models"""

    def __init__(self, name):
        self.name = name
        self.model = None
        self.scaler = StandardScaler()
        self.feature_importance = None

    def prepare_features(self, df, feature_cols, target_col='Target_Next_Day'):
        """Prepare features and target for modeling"""
        X = df[feature_cols].copy()
        y = df[target_col].copy()

        # Handle any remaining NaN values
        X = X.fillna(X.mean())

        return X, y

    def train(self, X_train, y_train):
        """Train the model"""
        X_train_scaled = self.scaler.fit_transform(X_train)
        self.model.fit(X_train_scaled, y_train)

    def predict(self, X):
        """Make predictions"""
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def predict_proba(self, X):
        """Get prediction probabilities"""
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)

    def evaluate(self, X_test, y_test):
        """Evaluate model performance"""
        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        metrics = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1-Score': f1_score(y_test, y_pred, zero_division=0),
            'ROC-AUC': roc_auc_score(y_test, y_pred_proba)
        }

        return metrics, y_pred, y_pred_proba


def select_features(df):
    """Select relevant features for modeling"""
    feature_cols = [
        # Price features
        'Open', 'High', 'Low', 'Close', 'Volume',
        'Price_Change_Pct', 'Volatility',

        # Momentum features
        'Momentum_1d', 'Momentum_3d',

        # Moving averages
        'MA_5', 'MA_10',

        # Volume features
        'Volume_Change',

        # Sentiment features
        'Sentiment_Mean', 'Sentiment_Std', 'News_Count',
        'Positive_Mean', 'Negative_Mean',
        'Sentiment_Lag1', 'News_Count_Lag1'
    ]

    # Only keep features that exist in the dataframe
    available_features = [col for col in feature_cols if col in df.columns]

    return available_features


def train_logistic_regression(X_train, y_train, X_test, y_test):
    """Train and evaluate Logistic Regression model"""
    print("\n" + "="*60)
    print("LOGISTIC REGRESSION")
    print("="*60)

    model = StockPredictionModel("Logistic Regression")
    model.model = LogisticRegression(random_state=42, max_iter=1000)

    model.train(X_train, y_train)
    metrics, y_pred, y_pred_proba = model.evaluate(X_test, y_test)

    print("\nPerformance Metrics:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

    return model, metrics, y_pred, y_pred_proba


def train_random_forest(X_train, y_train, X_test, y_test):
    """Train and evaluate Random Forest model"""
    print("\n" + "="*60)
    print("RANDOM FOREST")
    print("="*60)

    model = StockPredictionModel("Random Forest")
    model.model = RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=42,
        n_jobs=-1
    )

    model.train(X_train, y_train)
    metrics, y_pred, y_pred_proba = model.evaluate(X_test, y_test)

    print("\nPerformance Metrics:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

    # Feature importance
    model.feature_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.model.feature_importances_
    }).sort_values('Importance', ascending=False)

    print("\nTop 5 Important Features:")
    print(model.feature_importance.head().to_string(index=False))

    return model, metrics, y_pred, y_pred_proba


def train_xgboost(X_train, y_train, X_test, y_test):
    """Train and evaluate XGBoost model"""
    if not XGBOOST_AVAILABLE:
        print("\nXGBoost not available - skipping")
        return None, None, None, None

    print("\n" + "="*60)
    print("XGBOOST")
    print("="*60)

    model = StockPredictionModel("XGBoost")
    model.model = XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    model.train(X_train, y_train)
    metrics, y_pred, y_pred_proba = model.evaluate(X_test, y_test)

    print("\nPerformance Metrics:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

    # Feature importance
    model.feature_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.model.feature_importances_
    }).sort_values('Importance', ascending=False)

    print("\nTop 5 Important Features:")
    print(model.feature_importance.head().to_string(index=False))

    return model, metrics, y_pred, y_pred_proba


def plot_confusion_matrix(y_test, y_pred, model_name):
    """Plot confusion matrix"""
    cm = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Down', 'Up'],
                yticklabels=['Down', 'Up'])
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(f'confusion_matrix_{model_name.replace(" ", "_")}.png', dpi=300, bbox_inches='tight')
    plt.close()
    print(f"✓ Saved confusion_matrix_{model_name.replace(' ', '_')}.png")


def plot_roc_curve(y_test, y_pred_proba, model_name):
    """Plot ROC curve"""
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc_score = roc_auc_score(y_test, y_pred_proba)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc_score:.3f})', linewidth=2)
    plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'roc_curve_{model_name.replace(" ", "_")}.png', dpi=300, bbox_inches='tight')
    plt.close()
    print(f"✓ Saved roc_curve_{model_name.replace(' ', '_')}.png")


def plot_feature_importance(model):
    """Plot feature importance"""
    if model.feature_importance is None:
        return

    top_features = model.feature_importance.head(10)

    plt.figure(figsize=(10, 6))
    plt.barh(range(len(top_features)), top_features['Importance'])
    plt.yticks(range(len(top_features)), top_features['Feature'])
    plt.xlabel('Importance')
    plt.title(f'Top 10 Feature Importance - {model.name}')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig(f'feature_importance_{model.name.replace(" ", "_")}.png', dpi=300, bbox_inches='tight')
    plt.close()
    print(f"✓ Saved feature_importance_{model.name.replace(' ', '_')}.png")


def compare_models(results):
    """Create comparison of all models"""
    comparison_df = pd.DataFrame(results).T

    print("\n" + "="*60)
    print("MODEL COMPARISON")
    print("="*60)
    print("\n" + comparison_df.to_string())

    # Plot comparison
    comparison_df.plot(kind='bar', figsize=(12, 6))
    plt.title('Model Performance Comparison')
    plt.ylabel('Score')
    plt.xlabel('Model')
    plt.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45, ha='right')
    plt.ylim(0, 1)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    print("\n✓ Saved model_comparison.png")

    return comparison_df


def main():
    """Main execution function"""
    print("="*60)
    print("Week 4: Machine Learning Models")
    print("="*60 + "\n")

    # Load ML-ready dataset
    try:
        df = pd.read_csv('ml_ready_dataset.csv')
        print(f"✓ Loaded dataset with {len(df)} samples")
    except FileNotFoundError:
        print("Error: ml_ready_dataset.csv not found!")
        print("Please run week4_sentiment_features.py first")
        return

    # Select features
    feature_cols = select_features(df)
    print(f"✓ Selected {len(feature_cols)} features")

    # Prepare data
    X = df[feature_cols].fillna(df[feature_cols].mean())
    y = df['Target_Next_Day']

    # Split data (80-20 split)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"\nTraining set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")
    print(f"Class distribution: {dict(y.value_counts())}")

    # Train models
    results = {}

    # 1. Logistic Regression
    lr_model, lr_metrics, lr_pred, lr_proba = train_logistic_regression(
        X_train, y_train, X_test, y_test
    )
    results['Logistic Regression'] = lr_metrics
    plot_confusion_matrix(y_test, lr_pred, "Logistic Regression")
    plot_roc_curve(y_test, lr_proba, "Logistic Regression")

    # 2. Random Forest
    rf_model, rf_metrics, rf_pred, rf_proba = train_random_forest(
        X_train, y_train, X_test, y_test
    )
    results['Random Forest'] = rf_metrics
    plot_confusion_matrix(y_test, rf_pred, "Random Forest")
    plot_roc_curve(y_test, rf_proba, "Random Forest")
    plot_feature_importance(rf_model)

    # 3. XGBoost (if available)
    if XGBOOST_AVAILABLE:
        xgb_model, xgb_metrics, xgb_pred, xgb_proba = train_xgboost(
            X_train, y_train, X_test, y_test
        )
        if xgb_metrics:
            results['XGBoost'] = xgb_metrics
            plot_confusion_matrix(y_test, xgb_pred, "XGBoost")
            plot_roc_curve(y_test, xgb_proba, "XGBoost")
            plot_feature_importance(xgb_model)

    # Compare models
    comparison_df = compare_models(results)
    comparison_df.to_csv('model_comparison_results.csv')
    print("✓ Saved model_comparison_results.csv")

    # Save best model info
    best_model = comparison_df['Accuracy'].idxmax()
    best_accuracy = comparison_df['Accuracy'].max()

    print("\n" + "="*60)
    print(f"BEST MODEL: {best_model}")
    print(f"Accuracy: {best_accuracy:.4f}")
    print("="*60)

    print("\n✓ Week 4 Modeling Complete!")
    print("\nGenerated files:")
    print("  - model_comparison_results.csv")
    print("  - confusion_matrix_*.png")
    print("  - roc_curve_*.png")
    print("  - feature_importance_*.png")
    print("  - model_comparison.png")


if __name__ == "__main__":
    # Install required packages if needed
    packages = {
        'matplotlib': 'matplotlib',
        'seaborn': 'seaborn',
        'sklearn': 'scikit-learn'
    }

    import subprocess
    import sys

    for import_name, package_name in packages.items():
        try:
            __import__(import_name)
        except ImportError:
            print(f"Installing {package_name}...")
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', package_name])

    # Try to install XGBoost
    try:
        import xgboost
    except ImportError:
        print("Note: XGBoost not installed. Install with: pip install xgboost")

    main()


Writing week4_ml_modeling.py


# Main file for running complete project from dataset creation to testing

In [13]:
"""
COMPLETE WiDS 2025 PROJECT RUNNER
Runs all weeks: Data Collection + Week 4 (Sentiment & ML)

Execute this script to generate everything needed for submission
"""

import subprocess
import sys
import os
from datetime import datetime

def print_header(text):
    """Print formatted header"""
    print("\n" + "="*70)
    print(f"  {text}")
    print("="*70 + "\n")

def install_packages():
    """Install all required packages"""
    print_header("INSTALLING PACKAGES")

    packages = [
        'pandas', 'numpy',
        'yfinance', 'requests', 'beautifulsoup4', 'lxml',
        'vaderSentiment',
        'scikit-learn', 'matplotlib', 'seaborn',
        'xgboost'
    ]

    for package in packages:
        try:
            __import__(package.replace('-', '_'))
            print(f"✓ {package} already installed")
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
            print(f"✓ {package} installed")

def run_data_collection():
    """Run data collection script"""
    print_header("STEP 1: DATA COLLECTION (Midterm)")

    try:
        from complete_data_collection import main
        main()
        return True
    except Exception as e:
        print(f"Error in data collection: {e}")
        return False

def run_sentiment_analysis():
    """Run Week 4 sentiment analysis"""
    print_header("STEP 2: SENTIMENT ANALYSIS & FEATURE ENGINEERING (Week 4)")

    try:
        from week4_sentiment_features import main
        main()
        return True
    except Exception as e:
        print(f"Error in sentiment analysis: {e}")
        return False

def run_ml_modeling():
    """Run Week 4 machine learning models"""
    print_header("STEP 3: MACHINE LEARNING MODELING (Week 4)")

    try:
        from week4_ml_modeling import main
        main()
        return True
    except Exception as e:
        print(f"Error in ML modeling: {e}")
        return False

def verify_all_files():
    """Verify all required files were generated"""
    print_header("VERIFICATION")

    required_files = {
        'Midterm': [
            'News_raw.csv',
            'news_cleaned.csv',
            'stock_data.csv',
            'merged_medsem_data.csv'
        ],
        'Week 4 - Data': [
            'news_with_sentiment.csv',
            'stock_with_features.csv',
            'ml_ready_dataset.csv',
            'model_comparison_results.csv'
        ],
        'Week 4 - Visualizations': [
            'confusion_matrix_Logistic_Regression.png',
            'confusion_matrix_Random_Forest.png',
            'roc_curve_Logistic_Regression.png',
            'roc_curve_Random_Forest.png',
            'feature_importance_Random_Forest.png',
            'model_comparison.png'
        ],
        'Documentation': [
            'complete_data_collection.py',
            'week4_sentiment_features.py',
            'week4_ml_modeling.py',
            'README.md',
            'WEEK4_README.md',
            'requirements.txt'
        ]
    }

    all_present = True
    for category, files in required_files.items():
        print(f"\n{category}:")
        for file in files:
            exists = os.path.exists(file)
            status = "✓" if exists else "✗"
            print(f"  {status} {file}")
            if not exists:
                all_present = False

    return all_present

def create_submission_package():
    """Create organized submission folder"""
    print_header("CREATING SUBMISSION PACKAGE")

    import shutil

    folder_name = f"WiDS2025_COMPLETE_Submission_{datetime.now().strftime('%Y%m%d')}"

    if os.path.exists(folder_name):
        shutil.rmtree(folder_name)

    os.makedirs(folder_name)
    os.makedirs(f"{folder_name}/visualizations")
    os.makedirs(f"{folder_name}/data")
    os.makedirs(f"{folder_name}/scripts")

    # Copy scripts
    scripts = [
        'complete_data_collection.py',
        'week4_sentiment_features.py',
        'week4_ml_modeling.py'
    ]
    for script in scripts:
        if os.path.exists(script):
            shutil.copy(script, f"{folder_name}/scripts/")

    # Copy data files
    data_files = [
        'News_raw.csv', 'news_cleaned.csv', 'stock_data.csv',
        'merged_medsem_data.csv', 'news_with_sentiment.csv',
        'stock_with_features.csv', 'ml_ready_dataset.csv',
        'model_comparison_results.csv'
    ]
    for file in data_files:
        if os.path.exists(file):
            shutil.copy(file, f"{folder_name}/data/")

    # Copy visualizations
    import glob
    for png_file in glob.glob('*.png'):
        shutil.copy(png_file, f"{folder_name}/visualizations/")

    # Copy documentation
    docs = ['README.md', 'WEEK4_README.md', 'requirements.txt']
    for doc in docs:
        if os.path.exists(doc):
            shutil.copy(doc, folder_name)

    # Copy notebook if exists
    if os.path.exists('midterm_submission.ipynb'):
        shutil.copy('midterm_submission.ipynb', folder_name)

    print(f"\n✓ Created submission package: {folder_name}/")
    print("\nFolder structure:")
    print(f"{folder_name}/")
    print("├── scripts/              (Python scripts)")
    print("├── data/                 (All CSV files)")
    print("├── visualizations/       (All PNG plots)")
    print("├── README.md")
    print("├── WEEK4_README.md")
    print("├── requirements.txt")
    print("└── midterm_submission.ipynb")

    return folder_name

def generate_summary():
    """Generate project summary"""
    print_header("PROJECT SUMMARY")

    # Count files
    import glob
    csv_count = len(glob.glob('*.csv'))
    png_count = len(glob.glob('*.png'))
    py_count = len(glob.glob('*.py'))

    print("Files Generated:")
    print(f"  CSV Data Files: {csv_count}")
    print(f"  Visualizations: {png_count}")
    print(f"  Python Scripts: {py_count}")

    # Print data summary if available
    try:
        import pandas as pd

        # Midterm data
        stock_df = pd.read_csv('stock_data.csv')
        news_df = pd.read_csv('news_cleaned.csv')

        print("\nMidterm Data:")
        print(f"  Trading Days: {len(stock_df)}")
        print(f"  News Headlines: {len(news_df)}")
        print(f"  Date Range: {stock_df['Date'].min()} to {stock_df['Date'].max()}")

        # Week 4 data
        ml_df = pd.read_csv('ml_ready_dataset.csv')
        results_df = pd.read_csv('model_comparison_results.csv', index_col=0)

        print("\nWeek 4 Results:")
        print(f"  ML Dataset Size: {len(ml_df)} samples")
        print(f"  Features Used: {len(ml_df.columns) - 1}")
        print("\n  Best Model Performance:")
        best_model = results_df['Accuracy'].idxmax()
        best_acc = results_df['Accuracy'].max()
        print(f"    Model: {best_model}")
        print(f"    Accuracy: {best_acc:.4f}")
        print(f"    ROC-AUC: {results_df.loc[best_model, 'ROC-AUC']:.4f}")

    except Exception as e:
        print(f"\nCouldn't load summary data: {e}")

def main():
    """Main execution"""
    print("="*70)
    print("  WiDS 2025 - COMPLETE PROJECT EXECUTION")
    print("  Stock Volatility Prediction using News Sentiment")
    print("="*70)

    start_time = datetime.now()

    # Step 0: Install packages
    install_packages()

    # Step 1: Data collection (Midterm)
    if not run_data_collection():
        print("\n⚠ Data collection failed!")
        return

    # Step 2: Sentiment analysis (Week 4)
    if not run_sentiment_analysis():
        print("\n⚠ Sentiment analysis failed!")
        return

    # Step 3: ML modeling (Week 4)
    if not run_ml_modeling():
        print("\n⚠ ML modeling failed!")
        return

    # Step 4: Verify all files
    if not verify_all_files():
        print("\n⚠ Some files are missing!")

    # Step 5: Create submission package
    folder_name = create_submission_package()

    # Step 6: Generate summary
    generate_summary()

    # Final message
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()

    print_header("✓ PROJECT COMPLETE!")

    print(f"Total execution time: {duration:.1f} seconds")
    print(f"\nYour submission is ready in: {folder_name}/")
    print("\nNext steps:")
    print("  1. Review the generated visualizations")
    print("  2. Check model_comparison_results.csv")
    print("  3. Read WEEK4_README.md for detailed explanations")
    print("  4. Zip the submission folder")
    print("  5. Submit to your instructor")
    print("\n" + "="*70)
    print("  GOOD LUCK WITH YOUR SUBMISSION!")
    print("="*70 + "\n")

if __name__ == "__main__":
    main()

  WiDS 2025 - COMPLETE PROJECT EXECUTION
  Stock Volatility Prediction using News Sentiment

  INSTALLING PACKAGES

✓ pandas already installed
✓ numpy already installed
✓ yfinance already installed
✓ requests already installed
Installing beautifulsoup4...
✓ beautifulsoup4 installed
✓ lxml already installed
✓ vaderSentiment already installed
Installing scikit-learn...
✓ scikit-learn installed
✓ matplotlib already installed
✓ seaborn already installed
✓ xgboost already installed

  STEP 1: DATA COLLECTION (Midterm)

WiDS 2025 - Stock Market & News Sentiment Analysis
Data Collection Script

Collecting news data from local file 'CAT.rss.xml'...
Successfully collected 495 news items from 'CAT.rss.xml'
Cleaning news data...
Cleaned data: 495 rows with dates from 2025-04-17 to 2026-01-24
Collecting stock data for CAT...
Successfully collected 1255 days of stock data
Merging datasets...
Merged dataset: 1524 rows
News coverage: 411 days have news

Saving datasets...
✓ Saved News_raw.csv
✓ Saved