In [1]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')

print("="*60)
print("MODEL SETUP AND DOWNLOAD")
print("="*60)
print(f"\nPython version: {sys.version}")
print(f"Working directory: {os.getcwd()}")

MODEL SETUP AND DOWNLOAD

Python version: 3.11.13 (main, Jun  5 2025, 13:12:00) [GCC 11.2.0]
Working directory: /home/shyamsridhar/code/NLPFinalProject/notebooks


## 1. Download FinBERT (Financial Sentiment Analysis)

FinBERT is a BERT model fine-tuned on financial text. It's used for sentiment analysis of SEC filings.

- **Model**: `ProsusAI/finbert`
- **Size**: ~500MB
- **Cache Location**: `~/.cache/huggingface/hub/`

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

MODEL_NAME = "ProsusAI/finbert"

print(f"\n[1/3] Downloading FinBERT model: {MODEL_NAME}")
print("      This may take 2-5 minutes on first run...\n")

# Download tokenizer
print("  â†’ Downloading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("    âœ… Tokenizer cached")

# Download model
print("  â†’ Downloading model weights...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
print("    âœ… Model cached")

# Verify
print(f"\n  Model config:")
print(f"    - Architecture: {model.config.architectures}")
print(f"    - Labels: {model.config.id2label}")
print(f"    - Hidden size: {model.config.hidden_size}")

# Check cache location
cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
finbert_cache = os.path.join(cache_dir, "models--ProsusAI--finbert")
if os.path.exists(finbert_cache):
    size_mb = sum(os.path.getsize(os.path.join(dp, f)) for dp, dn, fn in os.walk(finbert_cache) for f in fn) / (1024*1024)
    print(f"\n  Cache location: {finbert_cache}")
    print(f"  Cache size: {size_mb:.1f} MB")

print("\nâœ… FinBERT download complete!")


[1/3] Downloading FinBERT model: ProsusAI/finbert
      This may take 2-5 minutes on first run...

  â†’ Downloading tokenizer...
    âœ… Tokenizer cached
  â†’ Downloading model weights...
    âœ… Model cached

  Model config:
    - Architecture: ['BertForSequenceClassification']
    - Labels: {0: 'positive', 1: 'negative', 2: 'neutral'}
    - Hidden size: 768

  Cache location: /home/shyamsridhar/.cache/huggingface/hub/models--ProsusAI--finbert
  Cache size: 1671.2 MB

âœ… FinBERT download complete!


In [3]:
# Quick test of FinBERT
from transformers import pipeline

print("Testing FinBERT sentiment analysis...\n")

sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    max_length=512
)

test_texts = [
    "Revenue increased 15% year-over-year, exceeding analyst expectations.",
    "The company reported a significant loss due to supply chain disruptions.",
    "The quarterly dividend was declared at $0.50 per share."
]

for text in test_texts:
    result = sentiment_pipeline(text)[0]
    print(f"  Text: \"{text[:60]}...\"")
    print(f"  â†’ {result['label']}: {result['score']:.2%}\n")

print("âœ… FinBERT is working correctly!")

# Clean up memory
del model, tokenizer, sentiment_pipeline
import gc
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Device set to use cuda:0


Testing FinBERT sentiment analysis...

  Text: "Revenue increased 15% year-over-year, exceeding analyst expe..."
  â†’ positive: 95.75%

  Text: "The company reported a significant loss due to supply chain ..."
  â†’ negative: 96.90%

  Text: "The quarterly dividend was declared at $0.50 per share...."
  â†’ neutral: 78.91%

âœ… FinBERT is working correctly!


## 2. Download spaCy Model (Named Entity Recognition)

spaCy's `en_core_web_sm` model is used for extracting named entities like organizations, people, dates, and monetary values.

- **Model**: `en_core_web_sm`
- **Size**: ~12MB

In [4]:
import subprocess
import spacy

print("\n[2/3] Setting up spaCy model: en_core_web_sm")

# Check if already installed
try:
    nlp = spacy.load("en_core_web_sm")
    print("  âœ… Model already installed")
except OSError:
    print("  â†’ Downloading model...")
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], 
                   capture_output=True)
    nlp = spacy.load("en_core_web_sm")
    print("  âœ… Model downloaded and installed")

# Verify
print(f"\n  Model info:")
print(f"    - Pipeline: {nlp.pipe_names}")
print(f"    - Vectors: {nlp.vocab.vectors.shape}")

print("\nâœ… spaCy setup complete!")


[2/3] Setting up spaCy model: en_core_web_sm
  âœ… Model already installed

  Model info:
    - Pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
    - Vectors: (0, 0)

âœ… spaCy setup complete!


In [5]:
# Quick test of spaCy NER
print("Testing spaCy named entity recognition...\n")

test_text = "Apple Inc. reported revenue of $94.8 billion for Q1 2024. CEO Tim Cook announced expansion plans in Europe."

doc = nlp(test_text)

print(f"  Text: \"{test_text}\"\n")
print("  Entities found:")
for ent in doc.ents:
    print(f"    - {ent.text} ({ent.label_})")

print("\nâœ… spaCy NER is working correctly!")

del nlp, doc

Testing spaCy named entity recognition...

  Text: "Apple Inc. reported revenue of $94.8 billion for Q1 2024. CEO Tim Cook announced expansion plans in Europe."

  Entities found:
    - Apple Inc. (ORG)
    - $94.8 billion (MONEY)
    - Q1 2024 (DATE)
    - Tim Cook (PERSON)
    - Europe (LOC)

âœ… spaCy NER is working correctly!


## 3. Download NLTK Data (Sentence Tokenization)

NLTK's `punkt` tokenizer is used for splitting text into sentences (used by the Forward-Looking Statement Detector).

- **Data**: `punkt`, `punkt_tab`
- **Size**: ~2MB

In [6]:
import nltk

print("\n[3/3] Downloading NLTK data")

# Download punkt tokenizer
print("  â†’ Downloading punkt tokenizer...")
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
print("  âœ… NLTK data downloaded")

# Verify
print(f"\n  NLTK data path: {nltk.data.path[0]}")

print("\nâœ… NLTK setup complete!")


[3/3] Downloading NLTK data
  â†’ Downloading punkt tokenizer...
  âœ… NLTK data downloaded

  NLTK data path: /home/shyamsridhar/nltk_data

âœ… NLTK setup complete!


In [7]:
# Quick test of NLTK
print("Testing NLTK sentence tokenization...\n")

test_text = "The company expects revenue to grow 10% next year. We believe this will be driven by new product launches. However, market conditions may affect these projections."

sentences = nltk.sent_tokenize(test_text)

print(f"  Input text: \"{test_text}\"\n")
print(f"  Sentences found: {len(sentences)}")
for i, sent in enumerate(sentences, 1):
    print(f"    {i}. {sent}")

print("\nâœ… NLTK tokenization is working correctly!")

Testing NLTK sentence tokenization...

  Input text: "The company expects revenue to grow 10% next year. We believe this will be driven by new product launches. However, market conditions may affect these projections."

  Sentences found: 3
    1. The company expects revenue to grow 10% next year.
    2. We believe this will be driven by new product launches.
    3. However, market conditions may affect these projections.

âœ… NLTK tokenization is working correctly!


## 4. Verify TensorFlow/Keras (Document Classifier)

The document classifier uses TensorFlow/Keras. No download needed, but let's verify the installation.

In [8]:
import tensorflow as tf
from tensorflow import keras

print("\n[Bonus] Verifying TensorFlow/Keras installation")
print(f"  TensorFlow version: {tf.__version__}")
print(f"  Keras version: {keras.__version__}")

# Check GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"  GPU available: {gpus[0].name}")
else:
    print("  GPU: Not available (CPU mode - this is fine)")

print("\nâœ… TensorFlow/Keras ready!")


[Bonus] Verifying TensorFlow/Keras installation
  TensorFlow version: 2.20.0
  Keras version: 3.11.3
  GPU available: /physical_device:GPU:0

âœ… TensorFlow/Keras ready!


## 5. Summary

In [9]:
import shutil

print("\n" + "="*60)
print("MODEL SETUP COMPLETE")
print("="*60)

print("\nâœ… All models downloaded and cached:\n")

# Check cache sizes
cache_info = []

# FinBERT
finbert_path = os.path.expanduser("~/.cache/huggingface/hub/models--ProsusAI--finbert")
if os.path.exists(finbert_path):
    size = sum(os.path.getsize(os.path.join(dp, f)) for dp, dn, fn in os.walk(finbert_path) for f in fn) / (1024*1024)
    cache_info.append(("FinBERT (ProsusAI/finbert)", f"{size:.0f} MB", "~/.cache/huggingface/hub/"))

# spaCy
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    cache_info.append(("spaCy (en_core_web_sm)", "~12 MB", "Python site-packages"))
except:
    pass

# NLTK
nltk_path = os.path.expanduser("~/nltk_data")
if os.path.exists(nltk_path):
    cache_info.append(("NLTK (punkt)", "~2 MB", nltk_path))

for name, size, location in cache_info:
    print(f"  ðŸ“¦ {name}")
    print(f"     Size: {size}")
    print(f"     Location: {location}\n")

print("\n" + "-"*60)
print("NEXT STEPS:")
print("-"*60)
print("\n1. Run 01_data_preparation.ipynb to download training data")
print("2. Run 02_train_classifier.ipynb to train the document classifier")
print("3. Run 'python app.py' to launch the dashboard")
print("\n" + "="*60)


MODEL SETUP COMPLETE

âœ… All models downloaded and cached:

  ðŸ“¦ FinBERT (ProsusAI/finbert)
     Size: 1671 MB
     Location: ~/.cache/huggingface/hub/

  ðŸ“¦ spaCy (en_core_web_sm)
     Size: ~12 MB
     Location: Python site-packages

  ðŸ“¦ NLTK (punkt)
     Size: ~2 MB
     Location: /home/shyamsridhar/nltk_data


------------------------------------------------------------
NEXT STEPS:
------------------------------------------------------------

1. Run 01_data_preparation.ipynb to download training data
2. Run 02_train_classifier.ipynb to train the document classifier
3. Run 'python app.py' to launch the dashboard

