In [None]:
import re
from bs4 import BeautifulSoup
import nltk
import spacy
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, cohen_kappa_score
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np
import torch
import os
import logging
from tqdm import tqdm
import transformers

In [None]:
# Print the installed transformers version so the run can be reproduced later.
print(transformers.__version__)

4.56.1


In [None]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login prompt in the cell output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Set up logging
# INFO-level logging configuration plus the module-level `logger` that the
# functions defined below use for their progress and error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Download NLTK data
# presumably the sentence-tokenizer data needed by nltk.sent_tokenize — confirm
nltk.download('punkt_tab', quiet=True)
# Load the spaCy English model into the global `nlp`, which preprocess_text
# calls for ORG entities. A missing model is logged with the install command
# and the OSError is re-raised.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    logger.error("SpaCy model 'en_core_web_sm' not found. Install it with: python -m spacy download en_core_web_sm")
    raise

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Set a batch size for processing - Reduced batch size
# NOTE(review): none of the functions visible in this notebook read this
# global; predict_sentiment_djia has its own batch_size parameter (default 16).
# Confirm whether this value is still needed.
batch_size = 8  # You can adjust this value based on your GPU memory
print(f"Using batch size: {batch_size}")

Using batch size: 8


In [None]:
import yfinance as yf

def create_ticker_map_csv(output_path="/content/company_tickers.csv"):
    """Write a company/ticker lookup CSV for the S&P 500 constituents.

    The constituent symbols are read from the first table of the Wikipedia
    "List of S&P 500 companies" page and each one is looked up through
    yfinance. The CSV has the columns ``Company_Name``, ``Ticker`` and
    ``Canonical_Name`` that ``load_entity_map`` reads.

    A lookup that fails, or that returns no usable name, does not abort the
    export: that row falls back to the ticker symbol itself.

    Args:
        output_path: Destination of the CSV file.

    Returns:
        ``output_path``, unchanged.
    """
    sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
    tickers = sp500['Symbol'].astype(str).tolist()
    data = []
    for ticker in tickers:
        # Wikipedia writes share classes with a dot (BRK.B) whereas Yahoo
        # Finance uses a dash (BRK-B); the CSV keeps the Wikipedia spelling.
        yahoo_symbol = ticker.replace('.', '-')
        try:
            info = yf.Ticker(yahoo_symbol).info or {}
        except Exception as e:
            # One unavailable symbol must not discard the rows already collected.
            logger.warning(f"Could not fetch company info for {ticker}: {e}")
            info = {}
        # `or` also covers keys that are present but None or empty.
        company_name = info.get("longName") or ticker
        canonical_name = info.get("shortName") or (company_name.split() or [ticker])[0]
        data.append({"Company_Name": company_name, "Ticker": ticker, "Canonical_Name": canonical_name})
    df = pd.DataFrame(data, columns=["Company_Name", "Ticker", "Canonical_Name"])
    df.to_csv(output_path, index=False)
    return output_path

In [None]:
# Function to load entity map from CSV
def load_entity_map(csv_path="/content/company_tickers.csv"):
    """Read the ticker CSV and return an alias -> canonical-name dictionary.

    Every row contributes two keys, its ``Company_Name`` and its ``Ticker``
    (both converted with ``str``), each pointing at the row's
    ``Canonical_Name``. Later rows overwrite earlier ones on key collisions.
    """
    table = pd.read_csv(csv_path)
    aliases = {}
    for _, record in table.iterrows():
        canonical = str(record['Canonical_Name'])
        # Company name first, then ticker, so a clash inside a row resolves the same way.
        for column in ('Company_Name', 'Ticker'):
            aliases[str(record[column])] = canonical
    return aliases

In [None]:
# Step 1: Preprocessing with SpaCy NER
def preprocess_text(text):
    """Clean financial text and normalize company mentions sentence by sentence.

    Steps: cap the input at 10,000 characters, strip HTML, drop characters
    other than word characters, whitespace and ``$ % . ,``, split into
    sentences, replace known company names/tickers with their canonical name,
    then shorten every spaCy ``ORG`` entity to its first word. Sentences
    longer than 1000 characters are skipped.

    The entity map is loaded once per call. If it cannot be loaded, ticker
    normalization is skipped with a warning and the remaining steps still run.

    Args:
        text: Raw article text. Non-string values are converted with ``str``
            (``None`` becomes the empty string).

    Returns:
        A list of normalized sentences, or ``[text[:512]]`` when no sentence
        was produced or an error occurred.
    """
    # Coerce up front so the slicing in the fallback paths cannot raise.
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    try:
        text = text[:10000]  # Limit to 10,000 characters
        soup = BeautifulSoup(text, 'html.parser')
        cleaned_text = soup.get_text()
        cleaned_text = re.sub(r'[^\w\s\$\%\.\,]', '', cleaned_text)
        sentences = nltk.sent_tokenize(cleaned_text)

        # Loop-invariant: read the CSV and compile the patterns once per call
        # instead of once per sentence.
        try:
            entity_map = load_entity_map(csv_path="/content/company_tickers.csv")
        except Exception as e:
            logger.warning(f"Entity map unavailable, skipping ticker normalization: {e}")
            entity_map = {}
        # NOTE(review): matching is case-insensitive, so short tickers that
        # are also ordinary words will match plain text — confirm this is intended.
        alias_patterns = [
            (re.compile(r'\b' + re.escape(alias) + r'\b', flags=re.IGNORECASE), canonical)
            for alias, canonical in entity_map.items()
        ]

        normalized_sentences = []
        for sentence in sentences:
            if len(sentence) > 1000:
                continue
            normalized_sentence = sentence
            for pattern, canonical in alias_patterns:
                # A callable replacement inserts the name literally; a plain
                # string would be parsed as a template (backslashes, \1, ...).
                normalized_sentence = pattern.sub(lambda _m, name=canonical: name, normalized_sentence)
            doc = nlp(normalized_sentence)
            for ent in doc.ents:
                if ent.label_ == "ORG":
                    canonical_name = ent.text.split()[0]
                    normalized_sentence = normalized_sentence.replace(ent.text, canonical_name)
            normalized_sentences.append(normalized_sentence)
        return normalized_sentences if normalized_sentences else [text[:512]]
    except Exception as e:
        logger.error(f"Error preprocessing text: {e}")
        return [text[:512]]

In [None]:
# Step 2: Named Entity Recognition
def extract_entities(sentences):
    """Run a Hugging Face "ner" pipeline over each sentence.

    Returns one ``{"sentence", "entities"}`` dict per input sentence, where
    each entity is ``{"text", "entity", "score"}``. If building or running
    the pipeline raises, the error is logged and every sentence is returned
    with an empty entity list.
    """
    # NOTE(review): the same ProsusAI/finbert checkpoint is used for NER,
    # relevance and sentiment; confirm it really provides a token-classification head.
    try:
        device_index = 0 if torch.cuda.is_available() else -1
        tagger = pipeline(
            "ner",
            model="ProsusAI/finbert",
            tokenizer="ProsusAI/finbert",
            aggregation_strategy="simple",
            max_length=512,
            truncation=True,
            device=device_index,
        )

        def describe(hit):
            # Keep only the fields downstream code reads.
            return {"text": hit["word"], "entity": hit["entity_group"], "score": hit["score"]}

        return [
            {"sentence": sentence, "entities": [describe(hit) for hit in tagger(sentence)]}
            for sentence in sentences
        ]
    except Exception as e:
        logger.error(f"Error in NER: {e}")
        return [{"sentence": s, "entities": []} for s in sentences]

In [None]:
# Step 3: Relevance Classification using FinBERT
def classify_relevance(sentences):
    """Flag sentences as financially relevant using FinBERT's top prediction.

    A sentence counts as relevant when the top label is ``"positive"`` with a
    score above 0.7. Returns one ``{"sentence", "is_relevant", "score"}`` dict
    per sentence; on any error every sentence is reported as not relevant
    with score 0.0.
    """
    try:
        scorer = pipeline(
            "text-classification",
            model="ProsusAI/finbert",
            tokenizer="ProsusAI/finbert",
            max_length=512,
            truncation=True,
        )
        verdicts = []
        for sentence in sentences:
            top = scorer(sentence)[0]
            verdicts.append({
                "sentence": sentence,
                "is_relevant": top["label"] == "positive" and top["score"] > 0.7,
                "score": top["score"],
            })
        return verdicts
    except Exception as e:
        logger.error(f"Error in relevance classification: {e}")
        return [{"sentence": s, "is_relevant": False, "score": 0.0} for s in sentences]

In [None]:
# Step 4: Sentiment Analysis
def analyze_sentiment(sentences, model_id="ProsusAI/finbert"):
    """Classify each sentence as binary positive/negative sentiment.

    Args:
        sentences: Sentences to classify.
        model_id: Model name or path handed to the "sentiment-analysis"
            pipeline (FinBERT by default, or a fine-tuned two-class model).

    Returns:
        One ``{"sentence", "sentiment", "score"}`` dict per sentence. Any
        label other than a positive one (including FinBERT's neutral) maps to
        ``"negative"``. On error every sentence is reported as negative with
        score 0.0.
    """
    # Labels treated as positive: FinBERT's own "positive" plus the generic
    # "LABEL_1" name of a two-class checkpoint, matching the label_map used
    # by predict_sentiment_djia (LABEL_0 -> negative, LABEL_1 -> positive).
    positive_labels = {"positive", "label_1"}
    try:
        sentiment_pipeline = pipeline(
            "sentiment-analysis",
            model=model_id,
            tokenizer=model_id,
            max_length=512,
            truncation=True
        )
        sentiment_results = []
        for sentence in sentences:
            result = sentiment_pipeline(sentence)
            label = str(result[0]["label"]).lower()
            binary_label = "positive" if label in positive_labels else "negative"  # Neutral maps to negative
            sentiment_results.append({
                "sentence": sentence,
                "sentiment": binary_label,
                "score": result[0]["score"]
            })
        return sentiment_results
    except Exception as e:
        logger.error(f"Error in sentiment analysis with {model_id}: {e}")
        return [{"sentence": s, "sentiment": "negative", "score": 0.0} for s in sentences]

In [None]:
# Step 5: Load Combined_News_DJIA.csv
from datasets import ClassLabel

def load_djia_dataset(file_path='/content/Combined_News_DJIA.csv', max_rows=None):
    """Load the Combined_News_DJIA data as an 80/20 train/test split.

    The 25 headline columns ``Top1`` .. ``Top25`` are joined into one ``text``
    column (missing headlines are dropped rather than rendered as "nan") and
    ``Label`` becomes an integer ``label`` (0 = negative, 1 = positive).

    Args:
        file_path: Local CSV path. A falsy value downloads the CSV from the
            GitHub mirror instead.
        max_rows: If given, randomly subsample to at most this many rows
            (seed 42) before splitting.

    Returns:
        The result of ``Dataset.train_test_split`` with ``"train"`` and
        ``"test"`` entries. If loading fails, a two-row built-in sample is
        split and returned instead so callers still receive a dataset.
    """
    # ClassLabel names are strings; the stored values stay the integers 0/1.
    label_feature = ClassLabel(names=["negative", "positive"])

    def _split(ds):
        # Prefer a stratified split; tiny datasets fall back to a plain one.
        try:
            return ds.train_test_split(test_size=0.2, seed=42, stratify_by_column='label')
        except ValueError as e:
            logger.warning(f"Stratified split failed: {e}. Using non-stratified split.")
            return ds.train_test_split(test_size=0.2, seed=42)

    try:
        if file_path:
            df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
        else:
            url = "https://raw.githubusercontent.com/niharikabalachandra/Stock-Market-Prediction-Using-Natural-Language-Processing/master/Combined_News_DJIA.csv"
            df = pd.read_csv(url, encoding='utf-8', low_memory=False)

        logger.info(f"CSV loaded: {df.shape}, Columns: {df.columns.tolist()}")

        headline_columns = ['Top' + str(i) for i in range(1, 26)]
        df = df.dropna(subset=['Label']).copy()
        # fillna('') before joining: astype(str) alone turns NaN into the
        # literal word "nan", which would then be fed to the model.
        df['text'] = df[headline_columns].fillna('').astype(str).agg(' '.join, axis=1).str.strip()
        df = df[df['text'] != '']

        if max_rows is not None:
            df = df.sample(n=min(max_rows, len(df)), random_state=42)
            logger.info(f"Subsampled to {len(df)} rows")

        df['label'] = df['Label'].astype(int)  # 0 = negative, 1 = positive

        # Reset the index so the shuffled pandas index is not carried along as an extra column.
        dataset = Dataset.from_pandas(df[['text', 'label']].reset_index(drop=True))

        # Cast 'label' column to ClassLabel for stratification
        dataset = dataset.cast_column('label', label_feature)

        if len(dataset) < 2:
            raise ValueError("Dataset is too small for train-test split")

        dataset = _split(dataset)

        logger.info(f"Dataset prepared: {len(dataset['train'])} train, {len(dataset['test'])} test")
        return dataset
    except Exception as e:
        logger.error(f"Error loading DJIA CSV: {e}")
        sample_data = {
            "text": ["Apple reported strong earnings.", "Sales declined sharply."],
            "label": [1, 0]
        }
        df_sample = pd.DataFrame(sample_data)
        dataset = Dataset.from_pandas(df_sample)
        dataset = dataset.cast_column('label', label_feature)
        # Two rows cannot be stratified across two classes; _split falls back
        # to a plain split instead of raising from inside this handler.
        dataset = _split(dataset)
        logger.info("Using sample data as fallback.")
        return dataset

In [None]:
# Step 6: Fine-Tune FinBERT with LoRA
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np
import os
import logging

logger = logging.getLogger(__name__)


def _weighted_trainer_class(class_weights):
    """Build a ``Trainer`` subclass whose loss is class-weighted cross-entropy.

    Args:
        class_weights: 1-D float tensor holding one weight per class.

    Returns:
        The ``WeightedTrainer`` class (not an instance).
    """

    class WeightedTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
            # The installed Trainer passes num_items_in_batch to compute_loss;
            # it is accepted here and not used by the weighted loss.
            # Labels are withheld from the forward pass so the model does not
            # also compute its own, unweighted loss.
            features = {k: v for k, v in inputs.items() if k != "labels"}
            labels = inputs["labels"]
            outputs = model(**features)
            logits = outputs.get("logits")
            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1).to(logits.device))
            return (loss, outputs) if return_outputs else loss

    return WeightedTrainer


def fine_tune_finbert_peft(file_path="./Combined_News_DJIA.csv", output_dir="./finbert-finetuned-djia-lora", max_rows=None):
    """Fine-tune FinBERT on the Combined_News_DJIA dataset using LoRA.

    FinBERT is loaded with a fresh two-class head (0 = negative,
    1 = positive), wrapped with LoRA adapters on the attention query/value
    projections, and trained for three epochs with a class-weighted loss. The
    adapter and tokenizer are saved to ``output_dir``.

    Args:
        file_path: CSV path forwarded to ``load_djia_dataset``.
        output_dir: Directory for checkpoints and the final adapter.
        max_rows: Optional row cap forwarded to ``load_djia_dataset``.

    Returns:
        ``output_dir`` on success. On any error the exception is logged and
        the base model id ``"ProsusAI/finbert"`` is returned instead.
    """
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Using device: {device}")

        dataset = load_djia_dataset(file_path, max_rows)
        if dataset is None or len(dataset["train"]) == 0:
            raise ValueError("Failed to load dataset")

        tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert", local_files_only=False)
        # Request the two-class head at load time: the pretrained 3-class
        # classifier is re-initialised and config.num_labels agrees with the
        # head, unlike swapping model.classifier after loading.
        model = AutoModelForSequenceClassification.from_pretrained(
            "ProsusAI/finbert",
            num_labels=2,
            id2label={0: "negative", 1: "positive"},
            label2id={"negative": 0, "positive": 1},
            ignore_mismatched_sizes=True,
            local_files_only=False,
        )
        model.to(device)

        lora_config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["query", "value"], # Use "query", "value" for newer transformer versions
            lora_dropout=0.1,
            bias="none",
            task_type="SEQ_CLS"
        )
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()
        logger.info("LoRA model initialized")

        def tokenize_function(examples):
            return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

        tokenized_dataset = dataset.map(tokenize_function, batched=True)
        tokenized_dataset = tokenized_dataset.remove_columns(["text"])
        tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

        labels = np.array(tokenized_dataset["train"]["labels"])
        # Balanced weights for the two expected classes 0 and 1.
        class_weights = compute_class_weight("balanced", classes=np.array([0, 1]), y=labels)
        class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
        logger.info(f"Class weights: {class_weights}")

        WeightedTrainer = _weighted_trainer_class(class_weights)

        training_args = TrainingArguments(
            output_dir=output_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            learning_rate=2e-4,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            fp16=torch.cuda.is_available(),
            logging_dir="./logs",
            logging_steps=10,
            # Name the label column explicitly rather than relying on the
            # Trainer inferring it from the PEFT-wrapped model's signature.
            label_names=["labels"],
            report_to="none" # Disable reporting to external services
        )

        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)
            precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
            kappa = cohen_kappa_score(labels, predictions)
            return {"precision": precision, "recall": recall, "f1": f1, "kappa": kappa}

        trainer = WeightedTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["test"],
            compute_metrics=compute_metrics,
        )

        logger.info("Starting fine-tuning with LoRA...")
        trainer.train()

        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        logger.info(f"LoRA-adapted model saved to {output_dir}")
        return output_dir
    except Exception as e:
        # logger.exception keeps the traceback so a failed run can be diagnosed.
        logger.exception(f"Error in fine-tuning: {e}")
        return "ProsusAI/finbert" # Return base model path on error

In [None]:
# Step 7: Summarization
def summarize_text(text):
    """Summarize ``text`` with the t5-base summarization pipeline.

    Inputs shorter than 10 non-padding characters are not summarized; their
    first 50 characters are returned. Otherwise the first 512 characters are
    summarized with a target length of half the word count, clamped to
    10..50. Any error is logged and the first 50 characters are returned.
    """
    try:
        stripped_length = len(text.strip())
        if stripped_length < 10:  # too short to be worth summarizing
            logger.warning(f"Text too short for summarization: {text[:50]}...")
            return text[:50]

        use_gpu = torch.cuda.is_available()
        t5 = pipeline(
            "summarization",
            model="t5-base",
            tokenizer="t5-base",
            max_length=512,
            truncation=True,
            device=0 if use_gpu else -1,
        )
        # Target length: half the word count, never below 10 or above 50.
        word_count = len(text.split())
        target_length = min(50, word_count // 2)
        if target_length < 10:
            target_length = 10
        generated = t5(text[:512], max_length=target_length, min_length=5, do_sample=False)
        return generated[0]["summary_text"]
    except Exception as e:
        logger.error(f"Error in summarization: {e}")
        return text[:50]

In [None]:
# Step 8: Aggregate Sentiment
def aggregate_sentiment(sentiment_results):
    """Reduce per-sentence sentiment results to one label and one score.

    Returns ``{"majority_sentiment", "confidence"}``: the most frequent
    sentiment label ("negative" when there are no results) and the plain mean
    of the scores (0 when there are none). On error the negative/0 default is
    returned.
    """
    try:
        labels = list(map(lambda entry: entry["sentiment"], sentiment_results))
        confidences = list(map(lambda entry: entry["score"], sentiment_results))

        tally = pd.Series(labels).value_counts()
        if tally.empty:
            majority = "negative"
        else:
            majority = tally.idxmax()

        mean_confidence = sum(confidences) / len(confidences) if confidences else 0
        return {"majority_sentiment": majority, "confidence": mean_confidence}
    except Exception as e:
        logger.error(f"Error aggregating sentiment: {e}")
        return {"majority_sentiment": "negative", "confidence": 0}

In [None]:
# Step 9: Evaluation Metrics
def evaluate_model(true_labels, predicted_labels):
    """Score predictions with weighted precision/recall/F1 and Cohen's kappa.

    Returns a dict with the keys ``precision``, ``recall``, ``f1`` and
    ``kappa``; if scoring raises, the error is logged and all four are 0.
    """
    try:
        weighted_scores = precision_recall_fscore_support(
            true_labels, predicted_labels, average="weighted"
        )
        precision, recall, f1, _support = weighted_scores
        report = {"precision": precision, "recall": recall, "f1": f1}
        report["kappa"] = cohen_kappa_score(true_labels, predicted_labels)
        return report
    except Exception as e:
        logger.error(f"Error evaluating model: {e}")
        return dict.fromkeys(("precision", "recall", "f1", "kappa"), 0)

In [None]:
# Step 10: Predict Sentiments with Fine-Tuned LoRA Model
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from peft import PeftModel
import pandas as pd
import torch
from tqdm import tqdm
import logging

logger = logging.getLogger(__name__)


def predict_sentiment_djia(
    file_path="./Combined_News_DJIA.csv",
    model_path="./finbert-finetuned-djia-lora",
    output_path="DJIA_Predicted_Sentiments.csv",
    batch_size=16,
    max_rows=None
):
    """Predict sentiment for the Combined_News_DJIA rows and save them to CSV.

    Args:
        file_path: Local CSV path; a falsy value downloads the GitHub mirror.
        model_path: Directory (or id) of the LoRA adapter written by
            ``fine_tune_finbert_peft``. If it holds no adapter config — for
            example the base model id returned when fine-tuning failed — it is
            loaded as an ordinary sequence-classification model instead.
        output_path: Where the frame with predictions is written.
        batch_size: Number of texts handed to the pipeline per call.
        max_rows: Optional random subsample size (seed 42).

    Returns:
        The DataFrame with ``predicted_sentiment`` and ``sentiment_score``
        columns added, or ``None`` if anything failed (the error is logged).
    """
    try:
        if file_path:
            df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
        else:
            url = "https://raw.githubusercontent.com/niharikabalachandra/Stock-Market-Prediction-Using-Natural-Language-Processing/master/Combined_News_DJIA.csv"
            df = pd.read_csv(url, encoding='utf-8', low_memory=False)

        headline_columns = ['Top' + str(i) for i in range(1, 26)]
        # fillna('') before joining: astype(str) alone turns missing headlines
        # into the literal word "nan".
        df['text'] = df[headline_columns].fillna('').astype(str).agg(' '.join, axis=1).str.strip()
        df = df[df['text'] != ''].copy()

        if max_rows is not None:
            df = df.sample(n=min(max_rows, len(df)), random_state=42)
            logger.info(f"Subsampled to {len(df)} rows")

        tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
        # The adapter carries a trained two-class head, so the base model must
        # be built with two labels as well. Loading it with three and
        # replacing the classifier afterwards would leave a randomly
        # initialised head.
        base_model = AutoModelForSequenceClassification.from_pretrained(
            "ProsusAI/finbert",
            num_labels=2,
            id2label={0: "negative", 1: "positive"},
            label2id={"negative": 0, "positive": 1},
            ignore_mismatched_sizes=True,
        )

        try:
            # Load the LoRA adapter and merge it into the base weights
            model = PeftModel.from_pretrained(base_model, model_path)
            model = model.merge_and_unload()
            logger.info(f"Loaded LoRA-adapted model from {model_path}")
        except ValueError as e:
            # PEFT reports a missing adapter_config.json as ValueError.
            logger.warning(f"No LoRA adapter at {model_path} ({e}); loading it as a plain model.")
            model = AutoModelForSequenceClassification.from_pretrained(model_path)

        device = 0 if torch.cuda.is_available() else -1
        sentiment_pipeline = pipeline(
            "sentiment-analysis",
            model=model,
            tokenizer=tokenizer,
            max_length=512,
            truncation=True,
            device=device
        )

        predictions = []
        scores = []
        # Two-class output (0: negative, 1: positive); anything else, such as
        # FinBERT's "neutral", falls back to negative below.
        label_map = {
            "LABEL_0": "negative",
            "LABEL_1": "positive",
            "negative": "negative",
            "positive": "positive"
        }

        texts = df['text'].tolist()
        for i in tqdm(range(0, len(texts), batch_size), desc="Predicting sentiments"):
            batch_texts = texts[i:i + batch_size]
            batch_texts = [text[:512] for text in batch_texts]
            results = sentiment_pipeline(batch_texts)

            for result in results:
                label = result["label"]
                predicted_label = label_map.get(label, "negative") # Default to negative if label is unexpected
                predictions.append(predicted_label)
                scores.append(result["score"])

        df['predicted_sentiment'] = predictions
        df['sentiment_score'] = scores

        df.to_csv(output_path, index=False)
        logger.info(f"Predictions saved to {output_path}")

        logger.info("Sample predictions:")
        logger.info(df[['text', 'predicted_sentiment', 'sentiment_score']].head().to_string())

        return df
    except Exception as e:
        logger.error(f"Error predicting sentiments: {e}")
        return None

In [None]:
# Main Pipeline
def financial_insight_pipeline(texts, model_path="ProsusAI/finbert"):
    """Run every text through preprocessing, NER, relevance, sentiment and summary.

    ``model_path`` selects the sentiment model. Returns one result dict per
    text with the keys ``original_text``, ``preprocessed_sentences``,
    ``entities``, ``relevance``, ``sentiments``, ``summary`` and
    ``aggregated_sentiment``.
    """

    def analyse(text):
        sentences = preprocess_text(text)
        entities = extract_entities(sentences)
        relevance_results = classify_relevance(sentences)

        relevant = [item["sentence"] for item in relevance_results if item["is_relevant"]]
        if not relevant:
            # Sentiment needs at least one sentence: first sentence, else the raw text.
            relevant = sentences[:1] or [text[:512]]
            logger.warning(f"No relevant sentences for text: {text[:50]}... Using first sentence.")

        sentiment_results = analyze_sentiment(relevant, model_id=model_path)
        if sentences:
            article_summary = summarize_text(" ".join(sentences))
        else:
            article_summary = text[:50]

        return {
            "original_text": text,
            "preprocessed_sentences": sentences,
            "entities": entities,
            "relevance": relevance_results,
            "sentiments": sentiment_results,
            "summary": article_summary,
            "aggregated_sentiment": aggregate_sentiment(sentiment_results),
        }

    return [analyse(text) for text in texts]

In [None]:
# Load the fine-tuned FinBERT model
def load_fine_tuned_model(model_path="/content/finbert-finetuned-djia"):
    """Return a sentiment-analysis pipeline for the fine-tuned FinBERT model.

    If the model at ``model_path`` cannot be loaded, the error is logged and a
    pipeline for the pre-trained ``ProsusAI/finbert`` is returned instead.
    """

    def build(source):
        # Same settings for the primary and the fallback model; GPU when available.
        return pipeline(
            "sentiment-analysis",
            model=source,
            tokenizer=source,
            max_length=512,
            truncation=True,
            device=0 if torch.cuda.is_available() else -1,
        )

    try:
        loaded = build(model_path)
        logger.info(f"Fine-tuned model loaded from {model_path}")
        return loaded
    except Exception as e:
        logger.error(f"Error loading fine-tuned model: {e}")
        logger.warning("Falling back to pre-trained ProsusAI/finbert")
        return build("ProsusAI/finbert")


In [None]:
# Step 11: Evaluate the pipeline's aggregated predictions against the true labels
def run_evaluation(pipeline_results, true_labels):
    """Log evaluation metrics for the pipeline's aggregated sentiments.

    The majority sentiment of each pipeline result is compared with
    ``true_labels`` via ``evaluate_model``. When the two lists differ in
    length both are cut to the shorter one. Nothing is returned; results and
    errors go to the logger.
    """
    try:
        predicted_labels = [entry["aggregated_sentiment"]["majority_sentiment"] for entry in pipeline_results]
        logger.info(f"Number of predicted labels: {len(predicted_labels)}")
        logger.info(f"Number of true labels: {len(true_labels)}")
        logger.info(f"Sample predicted labels: {predicted_labels[:5]}")
        logger.info(f"Sample true labels: {true_labels[:5]}")

        n_true, n_pred = len(true_labels), len(predicted_labels)
        if n_true != n_pred:
            logger.error(f"Mismatch in number of true and predicted labels: {n_true} vs {n_pred}")
            # Cut both lists to the shorter one so the metrics can still be computed.
            min_length = n_true if n_true < n_pred else n_pred
            true_labels, predicted_labels = true_labels[:min_length], predicted_labels[:min_length]
            logger.warning(f"Truncated to {min_length} samples for evaluation")

        metrics = evaluate_model(true_labels, predicted_labels)
        logger.info("Evaluation Metrics:")
        for title, key in (
            ("Precision", "precision"),
            ("Recall", "recall"),
            ("F1-Score", "f1"),
            ("Cohen's Kappa", "kappa"),
        ):
            logger.info(f"{title}: {metrics[key]:.3f}")
    except Exception as e:
        logger.error(f"Error in evaluation: {e}")

In [None]:
def load_real_dataset(max_rows=10):
    """Fetch the first ``max_rows`` rows of Combined_News_DJIA for testing.

    Returns ``(texts, labels)``: the 25 headline columns of each row joined
    with spaces, and the ``Label`` column mapped to "negative"/"positive". If
    the download or parsing fails, placeholder texts with "negative" labels
    are returned instead.
    """
    try:
        url = "https://raw.githubusercontent.com/niharikabalachandra/Stock-Market-Prediction-Using-Natural-Language-Processing/master/Combined_News_DJIA.csv"
        frame = pd.read_csv(url)
        headline_columns = ['Top' + str(i) for i in range(1, 26)]
        combined_news = frame[headline_columns].apply(lambda row: ' '.join(row.astype(str)), axis=1)
        sentiment_names = {0: "negative", 1: "positive"}
        texts = combined_news.head(max_rows).tolist()
        labels = frame['Label'].head(max_rows).map(sentiment_names).tolist()
        return texts, labels
    except Exception as e:
        logger.error(f"Error loading real dataset: {e}")
        return ["Sample text for testing."] * max_rows, ["negative"] * max_rows

In [None]:
# Run the Pipeline
# End-to-end driver: fine-tune, batch-predict on the DJIA CSV, then run the
# full insight pipeline on a 10-row sample and log its evaluation metrics.
if __name__ == "__main__":
    logger.info("Running Financial Insight Extraction Pipeline with LoRA Fine-Tuning on DJIA...")

    # Fine-tune with LoRA (test with 1000 rows, set max_rows=None for full dataset)
    # On failure fine_tune_finbert_peft returns the base model id
    # "ProsusAI/finbert" instead of an adapter directory.
    fine_tuned_model_path = fine_tune_finbert_peft(max_rows=1000)

    # Predict sentiments on DJIA dataset
    # The returned DataFrame (None on error) is not used here.
    predict_sentiment_djia(model_path=fine_tuned_model_path, max_rows=1000)

    # Run full pipeline on Combined_News_DJIA for testing
    texts, true_labels = load_real_dataset(max_rows=10)
    # model_path only selects the sentiment model inside the pipeline.
    pipeline_results = financial_insight_pipeline(texts, model_path=fine_tuned_model_path)

    run_evaluation(pipeline_results, true_labels)

Casting the dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

ERROR:__main__:Error in fine-tuning: fine_tune_finbert_peft.<locals>.WeightedTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
ERROR:__main__:Error predicting sentiments: Can't find 'adapter_config.json' at 'ProsusAI/finbert'
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Exception ignored in: <function _xla_gc_callback at 0x7940c08f6480>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/jax/_src/lib/__init__.py", line 96, in _xla_gc_callback
    def _xla_gc_callback(*args):
    
KeyboardInterrupt: 
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation

KeyboardInterrupt: 

In [None]:
# Set up logging
# Repeats the logging setup from the top of the notebook, presumably so this
# evaluation cell can be run on its own — confirm.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the fine-tuned FinBERT model
def load_fine_tuned_model(model_path="./finbert-finetuned-djia"):
    """Return a sentiment-analysis pipeline for the fine-tuned FinBERT model.

    If the model at ``model_path`` cannot be loaded, the error is logged and a
    pipeline for the pre-trained ``ProsusAI/finbert`` is returned instead.
    """

    def make_pipeline(source):
        # One factory for both the requested model and the fallback.
        device_index = 0 if torch.cuda.is_available() else -1
        return pipeline(
            "sentiment-analysis",
            model=source,
            tokenizer=source,
            max_length=512,
            truncation=True,
            device=device_index,
        )

    try:
        fine_tuned = make_pipeline(model_path)
        logger.info(f"Fine-tuned model loaded from {model_path}")
        return fine_tuned
    except Exception as e:
        logger.error(f"Error loading fine-tuned model: {e}")
        logger.warning("Falling back to pre-trained ProsusAI/finbert")
        return make_pipeline("ProsusAI/finbert")

# Compute evaluation metrics
def compute_metrics(true_labels, predicted_labels):
    """Compute precision, recall, F1-score, accuracy, and Cohen's Kappa.

    Both label sequences are converted to strings before scoring, so integer
    and string encodings of the same label compare equal. Precision, recall
    and F1 are support-weighted averages.

    Returns:
        A dict with the keys ``precision``, ``recall``, ``f1``, ``kappa`` and
        ``accuracy``. If scoring raises, the error is logged and every metric
        is 0.0.
    """
    # accuracy_score is not among the notebook's top-level sklearn imports;
    # without this import the NameError was swallowed by the handler below and
    # every metric came back as 0.0. Importing outside the try block makes a
    # missing dependency fail loudly instead.
    from sklearn.metrics import accuracy_score

    try:
        # Log unique labels for debugging
        logger.info(f"Unique true labels: {set(true_labels)}")
        logger.info(f"Unique predicted labels: {set(predicted_labels)}")

        # Ensure labels are strings
        true_labels = [str(label) for label in true_labels]
        predicted_labels = [str(label) for label in predicted_labels]

        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average="weighted")
        kappa = cohen_kappa_score(true_labels, predicted_labels)
        accuracy = accuracy_score(true_labels, predicted_labels)
        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "kappa": kappa,
            "accuracy": accuracy
        }
    except Exception as e:
        logger.error(f"Error computing metrics: {e}")
        return {
            "precision": 0.0,
            "recall": 0.0,
            "f1": 0.0,
            "kappa": 0.0,
            "accuracy": 0.0
        }

# Evaluate using Combined_News_DJIA test split
def evaluate_djia(
    file_path=None,
    model_path="./finbert-finetuned-djia",
    test_size=0.2,
    max_rows=None,
    binary=True,
    batch_size=16,
    output_path="djia_evaluation_results.csv"
):
    """Evaluate the fine-tuned FinBERT model using a test split from Combined_News_DJIA.

    Args:
        file_path: Local CSV to read. When falsy, the CSV is fetched from the
            GitHub URL hard-coded below.
        model_path: Forwarded to ``load_fine_tuned_model``.
        test_size: Fraction of rows placed in the stratified test split.
        max_rows: Optional cap; rows are sampled with ``random_state=42``
            before splitting.
        binary: If True, predictions are collapsed to negative/positive
            ("neutral" counts as negative). Otherwise a three-class
            negative/neutral/positive mapping is used.
        batch_size: Number of texts sent to the pipeline per call
            (defaults to the previously hard-coded 16).
        output_path: CSV path for the per-row results (defaults to the
            previously hard-coded filename).

    Returns:
        ``(metrics, df_test)`` where ``metrics`` is the dict returned by
        ``compute_metrics`` and ``df_test`` holds the test rows with
        ``predicted_sentiment`` and ``sentiment_score`` columns added.
        ``(None, None)`` if any step raises; the error is logged with its
        traceback.
    """
    try:
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # Load the CSV
        if file_path:
            df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
        else:
            url = "https://raw.githubusercontent.com/niharikabalachandra/Stock-Market-Prediction-Using-Natural-Language-Processing/master/Combined_News_DJIA.csv"
            df = pd.read_csv(url, encoding='utf-8', low_memory=False)

        logger.info(f"DJIA CSV loaded: {df.shape}, Columns: {df.columns.tolist()}")

        # Combine Top1 to Top25 into a single text column.
        # NOTE(review): astype(str) renders a missing headline as the literal
        # "nan", so the dropna on 'text' below never removes a row. Left as-is
        # so the text matches whatever preprocessing the model was trained on.
        df['text'] = df[['Top' + str(i) for i in range(1, 26)]].apply(lambda row: ' '.join(row.astype(str)), axis=1)
        df = df.dropna(subset=['text', 'Label'])
        df['text'] = df['text'].astype(str)
        df['true_sentiment'] = df['Label'].map({0: "negative", 1: "positive"})

        # Subsample for testing (optional)
        if max_rows is not None:
            df = df.sample(n=min(max_rows, len(df)), random_state=42)
            logger.info(f"Subsampled to {len(df)} rows")

        # Create Dataset and convert true_sentiment to ClassLabel
        # (stratify_by_column requires a ClassLabel feature)
        dataset = Dataset.from_pandas(df[['text', 'true_sentiment']])
        dataset = dataset.cast_column('true_sentiment', ClassLabel(names=["negative", "positive"]))

        # Split with stratification
        dataset = dataset.train_test_split(test_size=test_size, seed=42, stratify_by_column='true_sentiment')
        df_test = dataset['test'].to_pandas()

        # Convert true_sentiment back to string to avoid numeric indices
        df_test['true_sentiment'] = df_test['true_sentiment'].map({0: "negative", 1: "positive"})

        # Predict sentiments
        sentiment_pipeline = load_fine_tuned_model(model_path)
        predictions = []
        scores = []
        if binary:
            label_map = {
                "LABEL_0": "negative",
                "LABEL_1": "positive",
                "negative": "negative",
                "positive": "positive",
                "neutral": "negative"  # Map neutral to negative for binary evaluation
            }
        else:
            label_map = {
                "LABEL_0": "negative",
                "LABEL_1": "neutral",
                "LABEL_2": "positive",
                "negative": "negative",
                "neutral": "neutral",
                "positive": "positive"
            }

        # Batch processing for efficiency
        unmapped_labels = set()
        texts = df_test['text'].tolist()
        for i in tqdm(range(0, len(texts), batch_size), desc="Predicting sentiments"):
            # NOTE(review): this is a 512-*character* cut applied before the
            # pipeline tokenizes the text, so only the first few headlines of
            # each day are scored. Confirm this is intended.
            batch_texts = [text[:512] for text in texts[i:i + batch_size]]
            results = sentiment_pipeline(batch_texts)
            for result in results:
                label = result["label"]
                if label not in label_map:
                    unmapped_labels.add(str(label))
                # Unknown labels still fall back to "negative" (unchanged),
                # but are reported below instead of being counted silently.
                predictions.append(label_map.get(label, "negative"))
                scores.append(result["score"])

        if unmapped_labels:
            logger.warning(
                f"Model labels with no mapping were counted as 'negative': {sorted(unmapped_labels)}"
            )

        df_test['predicted_sentiment'] = predictions
        df_test['sentiment_score'] = scores

        # Compute metrics
        true_labels = df_test['true_sentiment'].tolist()
        predicted_labels = df_test['predicted_sentiment'].tolist()
        metrics = compute_metrics(true_labels, predicted_labels)

        # Log metrics
        logger.info("Evaluation Metrics (Combined_News_DJIA):")
        logger.info(f"Precision: {metrics['precision']:.3f}")
        logger.info(f"Recall: {metrics['recall']:.3f}")
        logger.info(f"F1-Score: {metrics['f1']:.3f}")
        logger.info(f"Cohen's Kappa: {metrics['kappa']:.3f}")
        logger.info(f"Accuracy: {metrics['accuracy']:.3f}")

        # Log sample comparisons
        logger.info("Sample comparisons:")
        logger.info(df_test[['text', 'true_sentiment', 'predicted_sentiment', 'sentiment_score']].head().to_string())

        # Save results
        df_test.to_csv(output_path, index=False)
        logger.info(f"Results saved to {output_path}")

        return metrics, df_test
    except Exception as e:
        # logger.exception keeps the traceback; the (None, None) contract is unchanged
        logger.exception(f"Error evaluating DJIA dataset: {e}")
        return None, None


# Run the evaluation
if __name__ == "__main__":
    # Primary: Evaluate on Combined_News_DJIA test split
    logger.info("Evaluating on Combined_News_DJIA...")
    # max_rows=1000 subsamples the data before the split to keep this run short
    metrics_djia, df_djia = evaluate_djia(max_rows=1000, binary=True)  # Limit for testing
    # evaluate_djia returns (None, None) on failure, so a truthy metrics dict means success
    if metrics_djia:
        logger.info("DJIA evaluation completed successfully")



Casting the dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Device set to use cuda:0
Predicting sentiments: 100%|██████████| 13/13 [00:04<00:00,  2.93it/s]


In [None]:
# Reload the per-row evaluation results written by evaluate_djia and preview them
df_djia_predicted = pd.read_csv('/content/djia_evaluation_results.csv')
df_djia_predicted.head()

Unnamed: 0,text,true_sentiment,__index_level_0__,predicted_sentiment,sentiment_score
0,b'Pirate Party Wins and Enters The European Pa...,positive,208,positive,0.504875
1,Iranians respond to Israeli Facebook initiativ...,positive,909,positive,0.614723
2,"German footballer Mesut Ozil donated his 300,0...",negative,1494,negative,0.649285
3,Japan Prime Minister will give up his salary u...,positive,693,positive,0.523506
4,"Mexico's Drug War: 50,000 Dead in 6 Years As a...",negative,952,negative,0.695125


Evaluating the fine-tuned FinBERT model

In [None]:
if __name__ == "__main__":
    # Primary: Evaluate on Combined_News_DJIA test split
    logger.info("Evaluating on Combined_News_DJIA...")
    metrics_djia, df_djia = evaluate_djia(max_rows=1000, binary=True)  # Limit for testing
    # evaluate_djia returns (None, None) when any step fails
    if metrics_djia:
        logger.info("DJIA evaluation completed successfully")
        print("\nCombined_News_DJIA Metrics:")
        print(f"Precision: {metrics_djia['precision']:.3f}")
        print(f"Recall: {metrics_djia['recall']:.3f}")
        print(f"F1-Score: {metrics_djia['f1']:.3f}")
        print(f"Cohen's Kappa: {metrics_djia['kappa']:.3f}")
        # compute_metrics also returns accuracy; report it with the other scores
        print(f"Accuracy: {metrics_djia['accuracy']:.3f}")
    else:
        # Make the failure visible in the cell output, not only in the log
        print("\nCombined_News_DJIA evaluation failed; see the logged error for details.")

Casting the dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Device set to use cuda:0
Predicting sentiments: 100%|██████████| 13/13 [00:07<00:00,  1.86it/s]


Combined_News_DJIA Metrics:
Precision: 0.605
Recall: 0.600
F1-Score: 0.601
Cohen's Kappa: 0.199





Applying the fine-tuned FinBERT model to the FinSen_US_Categorized_Timestamp.csv dataset to predict the sentiment of each article.

In [None]:
# Predict sentiments on the FinSen dataset
from tqdm import tqdm
def predict_sentiment_finsen(
    file_path="/content/FinSen_US_Categorized_Timestamp.csv",
    model_path="/content/finbert-finetuned-djia",
    output_path="/content/FinSen_US_Predicted_Sentiments.csv",
    batch_size=16,
    max_rows=None):
    """Predict sentiment for FinSen dataset using the fine-tuned FinBERT model.

    Args:
        file_path: CSV to read; must contain a 'Content' column.
        model_path: Forwarded to ``load_fine_tuned_model``.
        output_path: Where the input rows plus predictions are written.
        batch_size: Number of texts sent to the pipeline per call.
        max_rows: Optional cap; rows are sampled with ``random_state=42``.

    Returns:
        The DataFrame with ``text``, ``predicted_sentiment`` and
        ``sentiment_score`` columns added, or ``None`` if any step raises
        (the error is logged with its traceback).
    """
    try:
        # Load the CSV
        df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
        logger.info(f"CSV loaded: {df.shape}, Columns: {df.columns.tolist()}")

        # Verify required column
        if 'Content' not in df.columns:
            raise ValueError(f"Expected 'Content' column, found {df.columns.tolist()}")

        # .copy() so adding columns below never writes into a view of the raw frame
        df = df.dropna(subset=['Content']).copy()
        df['text'] = df['Content'].astype(str)

        # Subsample for testing (optional)
        if max_rows is not None:
            df = df.sample(n=min(max_rows, len(df)), random_state=42)
            logger.info(f"Subsampled to {len(df)} rows")

        # Load the model
        sentiment_pipeline = load_fine_tuned_model(model_path)

        # Pick the generic-label decoding from the size of the classifier head.
        # A two-label checkpoint uses LABEL_0/LABEL_1 = negative/positive, the
        # same convention evaluate_djia applies in binary mode; decoding
        # LABEL_1 as "neutral" for such a model means "positive" can never be
        # predicted. Three-label models keep the original mapping.
        model_config = getattr(getattr(sentiment_pipeline, "model", None), "config", None)
        num_labels = getattr(model_config, "num_labels", None)
        named_labels = {
            "negative": "negative",
            "neutral": "neutral",
            "positive": "positive"
        }
        if num_labels == 2:
            label_map = {"LABEL_0": "negative", "LABEL_1": "positive", **named_labels}
        else:
            label_map = {
                "LABEL_0": "negative",
                "LABEL_1": "neutral",
                "LABEL_2": "positive",
                **named_labels
            }

        # Predict sentiments in batches
        predictions = []
        scores = []
        unmapped_labels = set()

        # Process texts in batches with progress bar
        texts = df['text'].tolist()
        for i in tqdm(range(0, len(texts), batch_size), desc="Predicting sentiments"):
            # Pre-truncate to 512 characters before the pipeline tokenizes
            batch_texts = [text[:512] for text in texts[i:i + batch_size]]
            results = sentiment_pipeline(batch_texts)

            for result in results:
                label = result["label"]
                if label not in label_map:
                    unmapped_labels.add(str(label))
                predictions.append(label_map.get(label, "unknown"))
                scores.append(result["score"])

        if unmapped_labels:
            logger.warning(
                f"Model labels with no mapping were recorded as 'unknown': {sorted(unmapped_labels)}"
            )

        df['predicted_sentiment'] = predictions
        df['sentiment_score'] = scores

        # Save the updated CSV
        df.to_csv(output_path, index=False)
        logger.info(f"Predictions saved to {output_path}")

        # Log sample predictions
        logger.info("Sample predictions:")
        logger.info(df[['Content', 'predicted_sentiment', 'sentiment_score']].head().to_string())

        return df
    except Exception as e:
        # logger.exception keeps the traceback; the None-on-failure contract is unchanged
        logger.exception(f"Error predicting sentiments: {e}")
        return None



In [None]:
# Run the prediction
if __name__ == "__main__":
    # Uses the function's default paths; the returned DataFrame is discarded
    # here because the predictions are also written to the output CSV.
    predict_sentiment_finsen()

Device set to use cuda:0
Predicting sentiments: 100%|██████████| 971/971 [04:20<00:00,  3.73it/s]


In [None]:
# Reload the predictions CSV (the default output_path of predict_sentiment_finsen)
# and preview the first 20 rows
df_predicted = pd.read_csv("/content/FinSen_US_Predicted_Sentiments.csv")
df_predicted.head(20)

Unnamed: 0,Title,Tag,Time,Content,text,predicted_sentiment,sentiment_score
0,"TSX Slightly Down, Books Weekly Gains",Stock Market,16/07/2023,"TSX Slightly Down, Books Weekly GainsUnited St...","TSX Slightly Down, Books Weekly GainsUnited St...",negative,0.635753
1,UnitedHealth Hits 4-week High,stocks,15/07/2023,UnitedHealth Hits 4-week HighUnited States sto...,UnitedHealth Hits 4-week HighUnited States sto...,neutral,0.510482
2,Cisco Systems Hits 4-week Low,stocks,15/07/2023,Cisco Systems Hits 4-week LowUnited States sto...,Cisco Systems Hits 4-week LowUnited States sto...,negative,0.604527
3,AT&T Hits All-time Low,stocks,15/07/2023,AT&T Hits All-time LowUnited States stocksAT&T...,AT&T Hits All-time LowUnited States stocksAT&T...,negative,0.543571
4,Microsoft Hits 4-week High,stocks,15/07/2023,Microsoft Hits 4-week HighUnited States stocks...,Microsoft Hits 4-week HighUnited States stocks...,neutral,0.546965
5,JPMorgan Hits 16-month High,stocks,15/07/2023,JPMorgan Hits 16-month HighUnited States stock...,JPMorgan Hits 16-month HighUnited States stock...,neutral,0.565644
6,US Export Prices Fall More than Expected,Export Prices MoM,15/07/2023,US Export Prices Fall More than ExpectedUnited...,US Export Prices Fall More than ExpectedUnited...,negative,0.578514
7,Citigroup earnings above expectations at 1.37 USD,Earnings,15/07/2023,Citigroup earnings above expectations at 1.37 ...,Citigroup earnings above expectations at 1.37 ...,negative,0.551805
8,US Treasury Yields Below Recent Highs,Government Bond 10Y,15/07/2023,US Treasury Yields Below Recent Highs United S...,US Treasury Yields Below Recent Highs United S...,neutral,0.600578
9,Wells Fargo earnings above expectations at 1.2...,Earnings,15/07/2023,Wells Fargo earnings above expectations at 1.2...,Wells Fargo earnings above expectations at 1.2...,negative,0.550157
