In [None]:
# Import necessary libraries
import logging
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split

# Configure logging: timestamp, level name, then the message, at INFO and above
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)


In [None]:
# Load configuration from YAML file
config_path = '../config/config.yaml'
# Explicit UTF-8 so the config decodes the same way on every platform
# (the default text encoding of open() is locale-dependent).
with open(config_path, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Paths from config
processed_data_path = config['data']['processed_data_path']
model_path = config['model']['output_path']

# Load the trained model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only point model_path at artifacts produced by a trusted training run.
logging.info(f"Loading model from {model_path}")
model = joblib.load(model_path)

# Load the processed data and preview the first rows as the cell output
logging.info(f"Loading processed data from {processed_data_path}")
data = pd.read_csv(processed_data_path)
data.head()


In [None]:
# Define feature and label columns
feature_col = 'text'
label_col = 'label'  # Ensure this matches the column in your processed data

# Fail early with a readable message if the processed data lacks a required column
missing_cols = [col for col in (feature_col, label_col) if col not in data.columns]
if missing_cols:
    raise KeyError(
        f"Columns {missing_cols} not found in processed data; available columns: {list(data.columns)}"
    )

# Extract features and labels
X = data[feature_col]
y = data[label_col]

# Split settings.
# NOTE(review): presumably these must match the split used when the model was
# trained, otherwise "test" rows may have been seen during training — confirm.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Split the data and keep only the held-out portion; the training part is discarded
_, X_test, _, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# Display the shape of the test datasets
print(f"Test data shape: {X_test.shape}")


In [None]:
# Predict on the test set
logging.info("Generating predictions on the test set...")
y_pred = model.predict(X_test)

# Column 1 of predict_proba is the score later passed to roc_curve, which only
# makes sense for a two-class model; warn (without failing) if that does not hold.
class_probabilities = model.predict_proba(X_test)
if class_probabilities.shape[1] != 2:
    logging.warning(
        f"predict_proba returned {class_probabilities.shape[1]} columns; "
        "the ROC analysis below assumes a binary classifier"
    )
y_prob = class_probabilities[:, 1]

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


In [None]:
# Function to plot confusion matrix
def plot_confusion_matrix(cm, classes, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heatmap and display it.

    Args:
        cm: Matrix of integer counts (cells are annotated with the 'd' format).
        classes: Tick labels applied to both the x and y axes.
        title: Title shown above the heatmap.
        cmap: Colormap forwarded to the heatmap.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap=cmap,
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_title(title)
    # Rows are the true labels, columns the predicted labels
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    plt.show()

# Plot confusion matrix
# Build tick labels from the labels actually present in y_test / y_pred, in sorted
# order (the order confusion_matrix uses when no explicit labels are given),
# instead of assuming the classes are exactly 0 and 1.
observed_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
plot_confusion_matrix(conf_matrix, classes=[f'Class {label}' for label in observed_labels])


In [None]:
# Compute ROC curve and ROC area
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC curve on an explicit axes object
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()


In [None]:
# Analyze misclassified examples
# Boolean mask over the test rows where prediction and true label differ
is_wrong = y_test != y_pred
misclassified = X_test[is_wrong]
misclassified_results = pd.DataFrame(
    {
        'text': misclassified,
        'predicted': y_pred[is_wrong],
        'actual': y_test[is_wrong],
    }
)
# Show the first ten misclassified rows as the cell output
misclassified_results.head(10)


In [None]:
# Conclusion and Recommendations

This notebook provided an in-depth analysis of the machine learning model's performance on SEC filings data. The results highlight both the strengths and weaknesses of the current model.

### Key Findings
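- The model achieved an accuracy of **X%** on the test set (replace **X** with the accuracy printed by the evaluation cell above).
- The confusion matrix indicates that **Y%** of class 0 samples and **Z%** of class 1 samples were correctly classified (i.e., the per-class recall; replace **Y** and **Z** with the values from the classification report).
- The ROC curve shows an AUC of **W** (replace **W** with the area reported in the ROC plot legend).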

### Recommendations for Improvement
1. **Feature Engineering**: Explore additional features beyond simple text-based features to enhance model performance.
2. **Hyperparameter Tuning**: Perform hyperparameter optimization to find the best settings for the classifier.
3. **Model Exploration**: Try different classifiers (e.g., Random Forest, Gradient Boosting) or ensemble methods to improve performance.
4. **Data Augmentation**: Consider augmenting the dataset with more examples to improve model generalization.

These steps can lead to a more robust model that better captures the complexities of SEC filings and improves classification accuracy.
