In [None]:
pip install -r requirements.txt 

In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import glob

# Step 1: Data Extraction and Preprocessing

In [3]:
# Function to extract tables from an HTML file
def extract_tables_from_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    
    tables = []
    for table in soup.find_all('table'):
        df = pd.read_html(str(table))[0]
        tables.append(df)
    
    return tables

In [None]:
# Extract tables from all files and store them in a dictionary
all_data = {}
for category, directory in categories_dirs.items():
    all_data[category] = []
    for file_path in glob.glob(os.path.join(directory, '*.html')):
        tables = extract_tables_from_html(file_path)
        all_data[category].extend(tables)

In [4]:
# Directory paths
base_dir = 'data/'
categories_dirs = {
    'Income Statement': os.path.join(base_dir, 'Income Statement'),
    'Balance Sheets': os.path.join(base_dir, 'Balance Sheets'),
    'Cash Flow': os.path.join(base_dir, 'Cash Flow'),
    'Notes': os.path.join(base_dir, 'Notes'),
    'Others': os.path.join(base_dir, 'Others')
}

In [6]:
# Display the number of tables extracted for each category
for category, tables in all_data.items():
    print(f'{category}: {len(tables)} tables extracted')

Income Statement: 305 tables extracted
Balance Sheets: 270 tables extracted
Cash Flow: 36 tables extracted
Notes: 690 tables extracted
Others: 1224 tables extracted


# Step 2: Feature Extraction

In [11]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine all tables into a single list of strings
all_tables = []
all_labels = []
for category, tables in all_data.items():
    for table in tables:
        all_tables.append(table.to_string())
        all_labels.append(category)

# Fit a single TfidfVectorizer on the combined text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(all_tables)
y = np.array(all_labels)


# Step 3: Model Selection and Training


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


Accuracy: 0.9287128712871288
Classification Report:
                  precision    recall  f1-score   support

  Balance Sheets       1.00      0.91      0.95        58
       Cash Flow       1.00      0.88      0.93         8
Income Statement       1.00      0.88      0.93        57
           Notes       0.93      0.85      0.89       123
          Others       0.90      0.98      0.94       259

        accuracy                           0.93       505
       macro avg       0.97      0.90      0.93       505
    weighted avg       0.93      0.93      0.93       505



# Test the model

In [13]:
# Function to preprocess new test data and make predictions
def preprocess_and_predict_new_data(new_data_dir, model, vectorizer):
    new_data = []
    new_labels = []
    categories = os.listdir(new_data_dir)
    
    for category in categories:
        category_dir = os.path.join(new_data_dir, category)
        if os.path.isdir(category_dir):
            for file_path in glob.glob(os.path.join(category_dir, '*.html')):
                tables = extract_tables_from_html(file_path)
                for table in tables:
                    new_data.append(table.to_string())
                    new_labels.append(category)
    
    # Transform new data using the existing TfidfVectorizer
    X_new = vectorizer.transform(new_data)
    y_new = np.array(new_labels)
    
    # Predict using the trained model
    y_new_pred = model.predict(X_new)
    
    # Evaluate predictions
    new_accuracy = accuracy_score(y_new, y_new_pred)
    new_report = classification_report(y_new, y_new_pred)
    
    return new_accuracy, new_report


In [None]:
# Directory of new test data
new_test_data_dir = 'data/'

# Preprocess and predict on new test data
new_accuracy, new_report = preprocess_and_predict_new_data(new_test_data_dir, clf, vectorizer)


In [15]:

print(f'New Test Data Accuracy: {new_accuracy}')
print(f'New Test Data Classification Report:\n{new_report}')

New Test Data Accuracy: 0.9857425742574257
New Test Data Classification Report:
                  precision    recall  f1-score   support

  Balance Sheets       1.00      0.98      0.99       270
       Cash Flow       1.00      0.97      0.99        36
Income Statement       1.00      0.98      0.99       305
           Notes       0.99      0.97      0.98       690
          Others       0.98      1.00      0.99      1224

        accuracy                           0.99      2525
       macro avg       0.99      0.98      0.99      2525
    weighted avg       0.99      0.99      0.99      2525

