## 1. Setup and Imports

In [17]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# TensorFlow/Keras
import tensorflow as tf
from tensorflow import keras

print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

# Check for GPU
gpus = tf.config.list_physical_devices('GPU')
print(f"GPUs available: {len(gpus)}")
if gpus:
    print(f"GPU: {gpus[0].name}")
else:
    print("Training will use CPU (this is fine for our model)")

TensorFlow version: 2.20.0
Keras version: 3.11.3
GPUs available: 1
GPU: /physical_device:GPU:0


In [18]:
# Add project root to path
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import our classifier module
from src.document_classifier import DocumentClassifier

print(f"Project root: {project_root}")

Project root: /home/shyamsridhar/code/NLPFinalProject


## 2. Load Preprocessed Data

In [19]:
# Load section classification data
data_dir = os.path.join(project_root, 'data', 'processed', 'section')

train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
val_df = pd.read_csv(os.path.join(data_dir, 'val.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")

print(f"\nTraining label distribution:")
print(train_df['label'].value_counts())

Training samples: 25883
Validation samples: 5547
Test samples: 5547

Training label distribution:
label
section_9A    1400
section_3     1400
section_1     1400
section_5     1400
section_13    1400
section_2     1400
section_8     1400
section_12    1400
section_7A    1400
section_15    1400
section_9     1400
section_6     1400
section_7     1400
section_14    1400
section_11    1400
section_1A    1400
section_10    1400
section_4     1186
section_9B     579
section_1B     318
Name: count, dtype: int64


In [20]:
# Prepare data
X_train = train_df['text'].tolist()
y_train = train_df['label'].tolist()

X_val = val_df['text'].tolist()
y_val = val_df['label'].tolist()

X_test = test_df['text'].tolist()
y_test = test_df['label'].tolist()

print(f"Sample text length: {len(X_train[0])} characters")
print(f"Sample label: {y_train[0]}")

Sample text length: 1197 characters
Sample label: section_9A


## 3. Build and Train Keras Model

Our custom `DocumentClassifier` uses:
- TF-IDF vectorization (3000 features, unigrams + bigrams)
- Dense neural network with 3 hidden layers
- Batch normalization and dropout for regularization
- Adam optimizer with learning rate scheduling
- Early stopping to prevent overfitting

In [21]:
# Initialize classifier
classifier = DocumentClassifier(max_features=3000)
print(f"Classifier initialized with max_features={classifier.max_features}")

Classifier initialized with max_features=3000


In [22]:
# Train the model
print("="*60)
print("TRAINING KERAS NEURAL NETWORK")
print("="*60)

history = classifier.train(
    texts=X_train,
    labels=y_train,
    epochs=20,
    validation_split=0.15,  # Use 15% of training for validation during training
    batch_size=32,
    early_stopping=True
)

print("\nTraining complete!")

TRAINING KERAS NEURAL NETWORK
Preprocessing 25883 documents...
Fitting TF-IDF vectorizer...
Encoding labels...
Building model with input dimension: 3000


Training model...
Epoch 1/20
[1m688/688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 18ms/step - accuracy: 0.9030 - loss: 0.3891 - val_accuracy: 0.9776 - val_loss: 0.0768 - learning_rate: 0.0010
Epoch 2/20
[1m688/688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.9711 - loss: 0.1064 - val_accuracy: 0.9833 - val_loss: 0.0678 - learning_rate: 0.0010
Epoch 3/20
[1m688/688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.9817 - loss: 0.0707 - val_accuracy: 0.9830 - val_loss: 0.0761 - learning_rate: 0.0010
Epoch 4/20
[1m687/688[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - accuracy: 0.9839 - loss: 0.0540
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m688/688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.9846 - loss: 0.0545 - val_accuracy: 0.9851 - val_loss: 0.0749 - learning_rate: 0.0010
Epoch 5/20
[1m688/688[0m [32m━━━━━━━━━━━━━━━

## 4. Visualize Training History

In [24]:
# Plot training history (simplified)
%matplotlib inline

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].plot(history.history['accuracy'], label='Train')
axes[0].plot(history.history['val_accuracy'], label='Val')
axes[0].set_title('Accuracy')
axes[0].legend()

axes[1].plot(history.history['loss'], label='Train')
axes[1].plot(history.history['val_loss'], label='Val')
axes[1].set_title('Loss')
axes[1].legend()

plt.tight_layout()
plt.savefig(os.path.join(project_root, 'docs', 'training_history.png'))
plt.close()  # Close instead of show to avoid hanging

print(f"Plot saved to docs/training_history.png")
print(f"Final training accuracy: {history.history['accuracy'][-1]:.4f}")
print(f"Final validation accuracy: {history.history['val_accuracy'][-1]:.4f}")

Plot saved to docs/training_history.png
Final training accuracy: 0.9963
Final validation accuracy: 0.9897


## 5. Evaluate on Test Set

In [25]:
# Evaluate on test set
print("Evaluating on test set...")

test_metrics = classifier.evaluate(X_test, y_test)
print(f"\nTest Loss: {test_metrics['loss']:.4f}")
print(f"Test Accuracy: {test_metrics['accuracy']:.4f}")

Evaluating on test set...

Test Loss: 0.0676
Test Accuracy: 0.9877


In [26]:
# Get predictions for detailed analysis
predictions = classifier.predict_batch(X_test)
y_pred = [pred[0] for pred in predictions]
confidences = [pred[1] for pred in predictions]

# Classification report
print("\n" + "="*60)
print("CLASSIFICATION REPORT")
print("="*60)
print(classification_report(y_test, y_pred, target_names=classifier.classes))


CLASSIFICATION REPORT
              precision    recall  f1-score   support

   section_1       0.98      0.97      0.98       300
  section_10       1.00      0.98      0.99       300
  section_11       1.00      0.99      0.99       300
  section_12       0.99      1.00      1.00       300
  section_13       0.99      0.99      0.99       300
  section_14       0.99      0.98      0.98       300
  section_15       0.97      0.99      0.98       300
  section_1A       0.98      0.99      0.99       300
  section_1B       1.00      1.00      1.00        68
   section_2       0.99      0.99      0.99       300
   section_3       0.99      0.99      0.99       300
   section_4       0.97      0.98      0.97       255
   section_5       0.99      0.99      0.99       300
   section_6       0.99      0.99      0.99       300
   section_7       0.97      0.97      0.97       300
  section_7A       1.00      1.00      1.00       300
   section_8       0.98      0.99      0.98       300
   s

In [27]:
# Confusion matrix (simplified - save only, no display)
cm = confusion_matrix(y_test, y_pred, labels=classifier.classes)

plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=classifier.classes, yticklabels=classifier.classes)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig(os.path.join(project_root, 'docs', 'confusion_matrix.png'))
plt.close()

print("Confusion matrix saved to docs/confusion_matrix.png")

Confusion matrix saved to docs/confusion_matrix.png


In [28]:
# Analyze prediction confidence (no plot - just stats)
print("Prediction Confidence Analysis:")
print(f"  Mean confidence: {np.mean(confidences):.2%}")
print(f"  Min confidence: {np.min(confidences):.2%}")
print(f"  Max confidence: {np.max(confidences):.2%}")

Prediction Confidence Analysis:
  Mean confidence: 99.56%
  Min confidence: 37.25%
  Max confidence: 100.00%


## 6. Example Predictions

In [29]:
# Show some example predictions
print("\n" + "="*60)
print("EXAMPLE PREDICTIONS")
print("="*60)

for i in range(min(5, len(X_test))):
    pred_class, confidence = predictions[i]
    true_class = y_test[i]
    status = "✓" if pred_class == true_class else "✗"
    
    print(f"\n[{i+1}] {status}")
    print(f"    Text preview: {X_test[i][:100]}...")
    print(f"    True: {true_class} | Predicted: {pred_class} | Confidence: {confidence:.2%}")


EXAMPLE PREDICTIONS

[1] ✓
    Text preview: Item 6. Selected Financial Data The table that follows presents selected financial data for each of ...
    True: section_6 | Predicted: section_6 | Confidence: 100.00%

[2] ✓
    Text preview: Item 11. Executive Compensation The information required by this item is incorporated by reference t...
    True: section_11 | Predicted: section_11 | Confidence: 100.00%

[3] ✓
    Text preview: Item 3. Legal Proceedings. For additional information regarding our legal proceedings, see Note 13, ...
    True: section_3 | Predicted: section_3 | Confidence: 100.00%

[4] ✓
    Text preview: ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK Information concerning market ri...
    True: section_7A | Predicted: section_7A | Confidence: 100.00%

[5] ✓
    Text preview: ITEM 9A. CONTROLS AND PROCEDURES. Disclosure Controls and Procedures Our management has evaluated, u...
    True: section_9A | Predicted: section_9A | Confidence: 100.00%


In [30]:
# Show class probabilities for one example
print("\nDetailed probability breakdown for first test document:")
probs = classifier.predict_proba(X_test[0])
for cls, prob in sorted(probs.items(), key=lambda x: -x[1]):
    bar = "█" * int(prob * 40)
    print(f"  {cls}: {prob:6.2%} {bar}")


Detailed probability breakdown for first test document:
  section_6: 100.00% ███████████████████████████████████████
  section_4:  0.00% 
  section_3:  0.00% 
  section_7:  0.00% 
  section_1B:  0.00% 
  section_2:  0.00% 
  section_10:  0.00% 
  section_1:  0.00% 
  section_5:  0.00% 
  section_8:  0.00% 
  section_11:  0.00% 
  section_14:  0.00% 
  section_9:  0.00% 
  section_7A:  0.00% 
  section_15:  0.00% 
  section_1A:  0.00% 
  section_13:  0.00% 
  section_9A:  0.00% 
  section_12:  0.00% 
  section_9B:  0.00% 


## 7. Save Trained Model

In [31]:
# Save the trained model
models_dir = os.path.join(project_root, 'models')
os.makedirs(models_dir, exist_ok=True)

classifier.save(models_dir)

print(f"\nModel saved to {models_dir}")
print("Files created:")
for f in os.listdir(models_dir):
    size = os.path.getsize(os.path.join(models_dir, f)) / 1024
    print(f"  - {f}: {size:.1f} KB")

Model saved to /home/shyamsridhar/code/NLPFinalProject/models

Model saved to /home/shyamsridhar/code/NLPFinalProject/models
Files created:
  - classifier_model.keras: 9560.3 KB
  - vectorizer.joblib: 119.5 KB
  - classes.joblib: 1.6 KB
  - label_encoder.joblib: 1.1 KB


In [32]:
# Verify model can be loaded
print("\nVerifying model loading...")
test_classifier = DocumentClassifier()
test_classifier.load(models_dir)

# Quick test
test_pred, test_conf = test_classifier.predict(X_test[0])
print(f"Test prediction: {test_pred} (confidence: {test_conf:.2%})")
print("Model loading verified!")


Verifying model loading...
Model loaded from /home/shyamsridhar/code/NLPFinalProject/models
Test prediction: section_6 (confidence: 100.00%)
Model loading verified!


## 8. Model Summary

In [33]:
# Final summary
print("\n" + "="*60)
print("MODEL TRAINING SUMMARY")
print("="*60)
print(f"\nModel Architecture:")
print(f"  - Input: TF-IDF vectors ({classifier.max_features} features)")
print(f"  - Hidden layers: 256 → 128 → 64 neurons")
print(f"  - Regularization: BatchNorm + Dropout")
print(f"  - Output: {len(classifier.classes)} classes (softmax)")

print(f"\nTraining Configuration:")
print(f"  - Optimizer: Adam (lr=0.001)")
print(f"  - Loss: Sparse Categorical Crossentropy")
print(f"  - Early stopping: Patience=3")
print(f"  - Training samples: {len(X_train)}")

print(f"\nPerformance:")
print(f"  - Test Accuracy: {test_metrics['accuracy']:.2%}")
print(f"  - Mean Confidence: {np.mean(confidences):.2%}")

print(f"\nClasses: {classifier.classes}")
print(f"\nModel saved to: {models_dir}")
print("\n" + "="*60)
print("Ready to use with app.py dashboard!")
print("="*60)


MODEL TRAINING SUMMARY

Model Architecture:
  - Input: TF-IDF vectors (3000 features)
  - Hidden layers: 256 → 128 → 64 neurons
  - Regularization: BatchNorm + Dropout
  - Output: 20 classes (softmax)

Training Configuration:
  - Optimizer: Adam (lr=0.001)
  - Loss: Sparse Categorical Crossentropy
  - Early stopping: Patience=3
  - Training samples: 25883

Performance:
  - Test Accuracy: 98.77%
  - Mean Confidence: 99.56%

Classes: [np.str_('section_1'), np.str_('section_10'), np.str_('section_11'), np.str_('section_12'), np.str_('section_13'), np.str_('section_14'), np.str_('section_15'), np.str_('section_1A'), np.str_('section_1B'), np.str_('section_2'), np.str_('section_3'), np.str_('section_4'), np.str_('section_5'), np.str_('section_6'), np.str_('section_7'), np.str_('section_7A'), np.str_('section_8'), np.str_('section_9'), np.str_('section_9A'), np.str_('section_9B')]

Model saved to: /home/shyamsridhar/code/NLPFinalProject/models

Ready to use with app.py dashboard!
