In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import logging
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import yaml

# Root logger setup: emit INFO and above, each record rendered as
# "<timestamp> - <level> - <message>"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


In [None]:
# Load configuration from YAML file.
# The path is relative, so it resolves against the current working directory.
config_path = '../config/config.yaml'
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# locale encoding (which open() would otherwise fall back to).
with open(config_path, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Paths from config
processed_data_path = config['data']['processed_data_path']
model_output_path = config['model']['output_path']

# Load the preprocessed data
logging.info(f"Loading processed data from {processed_data_path}")
data = pd.read_csv(processed_data_path)

# Display the first few rows of the data (last expression -> rich cell output)
data.head()


In [None]:
# Define feature and label columns
feature_col = 'text'
label_col = 'label'  # Ensure this matches the column in your processed data

# Check if columns exist in the data
if feature_col not in data.columns or label_col not in data.columns:
    raise ValueError(f"Columns '{feature_col}' or '{label_col}' not found in data.")

# read_csv parses empty cells as NaN, and TfidfVectorizer raises on NaN
# documents, so rows lacking either the text or the label are removed before
# splitting. A new frame is created so `data` stays untouched and this cell
# can be re-run safely.
data_valid = data.dropna(subset=[feature_col, label_col])
n_dropped = len(data) - len(data_valid)
if n_dropped:
    logging.warning(
        f"Dropped {n_dropped} rows with missing '{feature_col}' or '{label_col}' values"
    )

# Extract features and labels
X = data_valid[feature_col]
y = data_valid[label_col]

# Split the data into training (80%) and testing (20%) sets; the fixed
# random_state keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of the datasets
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")


In [None]:
# Build the classification pipeline: TF-IDF vectorisation followed by a
# multinomial Naive Bayes classifier. The vectorizer is configured with
# English stop-word removal and max_df=0.7.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(max_df=0.7, stop_words='english')),
        ('clf', MultinomialNB()),
    ]
)

# Leave the pipeline as the cell's last expression so its structure is shown
pipeline


In [None]:
# Fit every pipeline step on the training split
logging.info("Training the model...")
pipeline.fit(X=X_train, y=y_train)


In [None]:
# Predict labels for the held-out test split
logging.info("Evaluating the model...")
y_pred = pipeline.predict(X_test)

# Compare the predictions with the true test labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report each metric under its heading
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")


In [None]:
# Save the trained pipeline to the path taken from the config.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
model_dir = os.path.dirname(model_output_path)
if model_dir:  # empty string when the path is a bare filename in the working directory
    os.makedirs(model_dir, exist_ok=True)
joblib.dump(pipeline, model_output_path)
logging.info(f"Model saved to {model_output_path}")


In [None]:
# Conclusion and Next Steps

In this notebook, we trained a text classification model (TF-IDF features with a Multinomial Naive Bayes classifier) on the processed SEC filings data, evaluated it on a held-out 20% test split, and saved the trained pipeline for future use.

**Next Steps:**
1. Use the trained model to make predictions on new data.
2. Explore other machine learning models and hyperparameter tuning to improve performance.
3. Integrate the model into a production environment or application.
