# Data Preparation for Financial Document Analysis

This notebook loads the `JanosAudran/financial-reports-sec` dataset and prepares data for:
1. **Section Classification** - Classify text into 20 SEC filing sections
2. **Sentiment Classification** - Positive/Negative based on market returns

In [15]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import re
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print(f"Project root: {project_root}")

Project root: /home/shyamsridhar/code/NLPFinalProject


## 2. Load Dataset (large_full config)

In [16]:
# Download JSONL files directly (bypasses deprecated dataset scripts)
from huggingface_hub import hf_hub_download
import json

print("Loading large_full dataset by downloading JSONL files directly...")
repo_id = "JanosAudran/financial-reports-sec"

# Download train shards from large config
files = [f"data/large/train/shard_{i}.jsonl" for i in range(10)]

all_records = []
for file_path in files:
    print(f"Downloading: {file_path}", end=" ")
    local_path = hf_hub_download(repo_id=repo_id, filename=file_path, repo_type="dataset")
    
    with open(local_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                record = json.loads(line)
                # Each record is a company with filings
                cik = record.get('cik')
                company = record.get('name', '')
                
                for filing in record.get('filings', []):
                    report = filing.get('report', {})
                    filing_date = filing.get('filingDate', '')
                    
                    # Extract each section separately
                    if isinstance(report, dict):
                        for section_key, section_content in report.items():
                            if isinstance(section_content, list):
                                text = ' '.join(section_content)
                            else:
                                text = str(section_content) if section_content else ""
                            
                            if len(text) > 100:  # Skip empty/tiny sections
                                all_records.append({
                                    'cik': cik,
                                    'company': company,
                                    'section': section_key,
                                    'text': text[:5000],  # Truncate for memory
                                    'filingDate': filing_date
                                })
    
    print(f"✓ ({len(all_records)} total sections)")
    
    # Stop after we have enough data (50K+ sections)
    if len(all_records) >= 50000:
        break

df = pd.DataFrame(all_records)
print(f"\n{'='*60}")
print(f"Loaded {len(df)} section records")
print(f"Columns: {df.columns.tolist()}")
print(f"\nSection distribution:")
print(df['section'].value_counts())

Loading large_full dataset by downloading JSONL files directly...
Downloading: data/large/train/shard_0.jsonl ✓ (81810 total sections)

Loaded 81810 section records
Columns: ['cik', 'company', 'section', 'text', 'filingDate']

Section distribution:
section
section_5     5031
section_7     5004
section_8     4948
section_9     4940
section_14    4890
section_1     4857
section_2     4850
section_12    4847
section_10    4775
section_13    4705
section_11    4593
section_3     4593
section_15    4483
section_9A    4366
section_7A    4192
section_6     4010
section_1A    3749
section_4     1695
section_9B     828
section_1B     454
Name: count, dtype: int64


In [17]:
# Data already loaded as df - just show summary
print(f"Total records: {len(df)}")
print(f"\nUnique companies: {df['cik'].nunique()}")
print(f"Unique sections: {df['section'].nunique()}")
print(f"\nSample sections: {df['section'].unique()[:10].tolist()}")

Total records: 81810

Unique companies: 375
Unique sections: 20

Sample sections: ['section_1', 'section_10', 'section_11', 'section_12', 'section_13', 'section_14', 'section_15', 'section_1A', 'section_2', 'section_3']


In [18]:
# Examine a sample record
print("Sample record:")
sample = df.iloc[0]
for col in df.columns:
    val = sample[col]
    if isinstance(val, str) and len(val) > 100:
        print(f"  {col}: {val[:100]}...")
    else:
        print(f"  {col}: {val}")

Sample record:
  cik: 0000001750
  company: AAR CORP
  section: section_1
  text: ITEM 1.BUSINESS General AAR CORP. and its subsidiaries are referred to herein collectively as “AAR,”...
  filingDate: 2020-07-21


## 3. Prepare Section Classification Data

The dataset has 20 sections: section_1, section_1A, section_7, etc.

In [19]:
# Check section distribution
print("Section distribution:")
print(df['section'].value_counts())

Section distribution:
section
section_5     5031
section_7     5004
section_8     4948
section_9     4940
section_14    4890
section_1     4857
section_2     4850
section_12    4847
section_10    4775
section_13    4705
section_11    4593
section_3     4593
section_15    4483
section_9A    4366
section_7A    4192
section_6     4010
section_1A    3749
section_4     1695
section_9B     828
section_1B     454
Name: count, dtype: int64


In [20]:
# Create section classification dataset
section_df = df[['text', 'section']].copy()
section_df.columns = ['text', 'label']

# Remove short texts
section_df = section_df[section_df['text'].str.len() > 100]

print(f"Section classification samples: {len(section_df)}")
print(f"\nLabel distribution:")
print(section_df['label'].value_counts())

Section classification samples: 81810

Label distribution:
label
section_5     5031
section_7     5004
section_8     4948
section_9     4940
section_14    4890
section_1     4857
section_2     4850
section_12    4847
section_10    4775
section_13    4705
section_11    4593
section_3     4593
section_15    4483
section_9A    4366
section_7A    4192
section_6     4010
section_1A    3749
section_4     1695
section_9B     828
section_1B     454
Name: count, dtype: int64


In [21]:
# Balance the section dataset (sample equal per class)
min_per_class = 500
max_per_class = 2000

balanced_sections = []
for label in section_df['label'].unique():
    class_df = section_df[section_df['label'] == label]
    n_samples = min(max(len(class_df), min_per_class), max_per_class)
    n_samples = min(n_samples, len(class_df))
    sampled = class_df.sample(n=n_samples, random_state=42)
    balanced_sections.append(sampled)

section_balanced = pd.concat(balanced_sections, ignore_index=True)
print(f"Balanced section dataset: {len(section_balanced)} samples")
print(section_balanced['label'].value_counts())

Balanced section dataset: 36977 samples
label
section_1     2000
section_10    2000
section_11    2000
section_12    2000
section_13    2000
section_14    2000
section_15    2000
section_1A    2000
section_2     2000
section_3     2000
section_5     2000
section_6     2000
section_9     2000
section_7     2000
section_7A    2000
section_8     2000
section_9A    2000
section_4     1695
section_9B     828
section_1B     454
Name: count, dtype: int64


## 4. Prepare Sentiment Classification Data

Labels based on market returns: positive (stock went up) or negative (stock went down)

In [22]:
# NOTE: The JSONL files don't have sentiment labels
# Sentiment labels are in the processed configs (large_full, small_full)
# which require the deprecated dataset script
# 
# For now, we'll create a simple sentiment proxy:
# - Use financial keyword analysis to assign pseudo-sentiment labels
# - This is a demonstration - in production you'd use actual market data

import re

def assign_sentiment(text):
    """Assign pseudo-sentiment based on financial keywords."""
    text_lower = text.lower()
    
    positive_words = ['growth', 'increase', 'profit', 'gain', 'success', 'strong', 
                      'improvement', 'exceeded', 'favorable', 'positive', 'higher']
    negative_words = ['loss', 'decline', 'decrease', 'risk', 'impairment', 'adverse',
                      'challenging', 'weakness', 'lower', 'negative', 'difficult']
    
    pos_count = sum(1 for w in positive_words if w in text_lower)
    neg_count = sum(1 for w in negative_words if w in text_lower)
    
    if pos_count > neg_count:
        return 'positive'
    elif neg_count > pos_count:
        return 'negative'
    else:
        return None  # Neutral - exclude

# Apply to a subset of the data
sentiment_sample = df.sample(n=min(20000, len(df)), random_state=42).copy()
sentiment_sample['sentiment'] = sentiment_sample['text'].apply(assign_sentiment)
sentiment_df = sentiment_sample.dropna(subset=['sentiment'])
sentiment_df = sentiment_df[['text', 'sentiment']]
sentiment_df.columns = ['text', 'label']

print(f"Sentiment classification samples: {len(sentiment_df)}")
print(f"\nLabel distribution:")
print(sentiment_df['label'].value_counts())

Sentiment classification samples: 7184

Label distribution:
label
negative    4383
positive    2801
Name: count, dtype: int64


In [23]:
# Balance sentiment dataset
min_count = sentiment_df['label'].value_counts().min()
target = min(min_count, 5000)

balanced_sentiment = []
for label in ['positive', 'negative']:
    class_df = sentiment_df[sentiment_df['label'] == label]
    n = min(target, len(class_df))
    if n > 0:
        sampled = class_df.sample(n=n, random_state=42)
        balanced_sentiment.append(sampled)

sentiment_balanced = pd.concat(balanced_sentiment, ignore_index=True)
print(f"Balanced sentiment dataset: {len(sentiment_balanced)} samples")
print(sentiment_balanced['label'].value_counts())

Balanced sentiment dataset: 5602 samples
label
positive    2801
negative    2801
Name: count, dtype: int64


## 5. Create Train/Val/Test Splits

In [24]:
def create_splits(df, name):
    """Create 70/15/15 train/val/test splits."""
    X = df['text'].tolist()
    y = df['label'].tolist()
    
    # 70% train, 30% temp
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    
    # 50/50 split of temp for val/test
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
    )
    
    print(f"\n{name} splits:")
    print(f"  Train: {len(X_train)}")
    print(f"  Val: {len(X_val)}")
    print(f"  Test: {len(X_test)}")
    
    return {
        'train': pd.DataFrame({'text': X_train, 'label': y_train}),
        'val': pd.DataFrame({'text': X_val, 'label': y_val}),
        'test': pd.DataFrame({'text': X_test, 'label': y_test})
    }

section_splits = create_splits(section_balanced, 'Section Classification')
sentiment_splits = create_splits(sentiment_balanced, 'Sentiment Classification')


Section Classification splits:
  Train: 25883
  Val: 5547
  Test: 5547

Sentiment Classification splits:
  Train: 3921
  Val: 840
  Test: 841


## 6. Save Processed Data

In [25]:
# Save section classification data
section_dir = os.path.join(project_root, 'data', 'processed', 'section')
os.makedirs(section_dir, exist_ok=True)

section_splits['train'].to_csv(os.path.join(section_dir, 'train.csv'), index=False)
section_splits['val'].to_csv(os.path.join(section_dir, 'val.csv'), index=False)
section_splits['test'].to_csv(os.path.join(section_dir, 'test.csv'), index=False)

print(f"Saved section classification data to {section_dir}")

Saved section classification data to /home/shyamsridhar/code/NLPFinalProject/data/processed/section


In [26]:
# Save sentiment classification data
sentiment_dir = os.path.join(project_root, 'data', 'processed', 'sentiment')
os.makedirs(sentiment_dir, exist_ok=True)

sentiment_splits['train'].to_csv(os.path.join(sentiment_dir, 'train.csv'), index=False)
sentiment_splits['val'].to_csv(os.path.join(sentiment_dir, 'val.csv'), index=False)
sentiment_splits['test'].to_csv(os.path.join(sentiment_dir, 'test.csv'), index=False)

print(f"Saved sentiment classification data to {sentiment_dir}")

Saved sentiment classification data to /home/shyamsridhar/code/NLPFinalProject/data/processed/sentiment


In [27]:
# Save sample documents for demo
sample_dir = os.path.join(project_root, 'data', 'sample_docs')
os.makedirs(sample_dir, exist_ok=True)

# Save samples from different sections
sections_to_sample = ['section_1', 'section_7', 'section_1A']
for section in sections_to_sample:
    section_data = df[df['section'] == section]
    if len(section_data) > 0:
        sample_text = section_data.iloc[0]['text']
        filename = f'sample_{section}.txt'
        with open(os.path.join(sample_dir, filename), 'w') as f:
            f.write(sample_text[:10000])
        print(f"Saved {filename}")

print(f"\nSample documents saved to {sample_dir}")

Saved sample_section_1.txt
Saved sample_section_7.txt
Saved sample_section_1A.txt

Sample documents saved to /home/shyamsridhar/code/NLPFinalProject/data/sample_docs


## 7. Summary

In [28]:
print("="*60)
print("DATA PREPARATION COMPLETE")
print("="*60)
print(f"\nDataset: JanosAudran/financial-reports-sec (large_full)")
print(f"\n1. SECTION CLASSIFICATION:")
print(f"   Labels: {section_balanced['label'].nunique()} sections")
print(f"   Train: {len(section_splits['train'])} | Val: {len(section_splits['val'])} | Test: {len(section_splits['test'])}")
print(f"\n2. SENTIMENT CLASSIFICATION:")
print(f"   Labels: positive, negative")
print(f"   Train: {len(sentiment_splits['train'])} | Val: {len(sentiment_splits['val'])} | Test: {len(sentiment_splits['test'])}")
print(f"\nData saved to: {os.path.join(project_root, 'data', 'processed')}")
print(f"\nNext step: Run 02_train_classifier.ipynb")

DATA PREPARATION COMPLETE

Dataset: JanosAudran/financial-reports-sec (large_full)

1. SECTION CLASSIFICATION:
   Labels: 20 sections
   Train: 25883 | Val: 5547 | Test: 5547

2. SENTIMENT CLASSIFICATION:
   Labels: positive, negative
   Train: 3921 | Val: 840 | Test: 841

Data saved to: /home/shyamsridhar/code/NLPFinalProject/data/processed

Next step: Run 02_train_classifier.ipynb
