In [1]:
!unzip -q data.zip -d data

In [47]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import re

def read_html_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def extract_text_from_html(html_content):
    # Parse HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract text from the parsed HTML
    text = soup.get_text(separator=' ')

    # Remove extra spaces, newlines, and tabs
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def process_directory(directory, label):
    data = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.html'):
                file_path = os.path.join(root, file)
                html_content = read_html_file(file_path)
                clean_text = extract_text_from_html(html_content)
                data.append((clean_text, label))
    return data

# Paths to the directories with HTML files and their corresponding labels
directories = {
    '/content/data/data/Balance Sheets': 'Balance Sheet',
    '/content/data/data/Cash Flow': 'Cash Flow',
    '/content/data/data/Income Statement': 'Income Statement',
    '/content/data/data/Notes': 'Notes',
    '/content/data/data/Others': 'Others'
}

# Collect data from all directories
all_data = []
for directory, label in directories.items():
    all_data.extend(process_directory(directory, label))

# Creating a DataFrame
df = pd.DataFrame(all_data, columns=['HTML_Text', 'Target'])

# Saved DataFrame to a CSV file (optional)
df.to_csv('financial_statements.csv', index=False)

# Displaying the first few rows of the DataFrame
print(df.head())


                                           HTML_Text         Target
0  2018 2017 ASSETS Non-Current Assets Property P...  Balance Sheet
1  1 Standalone Consolidated | As at Mar As at Ma...  Balance Sheet
2  Standalone Consolidated Particulars Year Ended...  Balance Sheet
3  As at 31st March, As at 31st March, Statement ...  Balance Sheet
4  March 31,2018 March 31,2017 ASSETS Non-current...  Balance Sheet


In [50]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report

# Load the CSV file into a DataFrame
df = pd.read_csv('/content/financial_statements.csv')

# Function to clean the text
def clean_text(text):
    # Remove non-alphanumeric characters
    text = re.sub(r'\W+', ' ', text)
    # Convert to lower case
    text = text.lower()
    return text

# Apply the clean_text function to the HTML_Text column
df['HTML_Text'] = df['HTML_Text'].apply(clean_text)

# Display the first few rows of the DataFrame
print(df.head())


                                           HTML_Text         Target
0  2018 2017 assets non current assets property p...  Balance Sheet
1  1 standalone consolidated as at mar as at mar ...  Balance Sheet
2  standalone consolidated particulars year ended...  Balance Sheet
3  as at 31st march as at 31st march statement of...  Balance Sheet
4  march 31 2018 march 31 2017 assets non current...  Balance Sheet


In [61]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['HTML_Text'], df['Target'], test_size=0.2, random_state=42)

# Creating a pipeline that combines the TF-IDF vectorizer and a Naive Bayes classifier
pipeline = make_pipeline(TfidfVectorizer(max_features=10000), MultinomialNB())

# Training the model on the training data
pipeline.fit(X_train, y_train)

# Predicting the labels for the test set
y_pred = pipeline.predict(X_test)

# Evaluating the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.8574257425742574
Classification Report:
                  precision    recall  f1-score   support

   Balance Sheet       1.00      0.94      0.97        51
       Cash Flow       0.00      0.00      0.00         6
Income Statement       1.00      0.65      0.79        66
           Notes       0.88      0.74      0.81       123
          Others       0.81      0.97      0.88       259

        accuracy                           0.86       505
       macro avg       0.74      0.66      0.69       505
    weighted avg       0.86      0.86      0.85       505



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Using MNB Model**

In [62]:
import joblib
from bs4 import BeautifulSoup

# Saving the trained model to a file
joblib.dump(pipeline, 'text_classification_model.pkl')


['text_classification_model.pkl']

**Testing the model**

In [75]:

# Loading the saved model from the file (for later use)
loaded_model = joblib.load('text_classification_model.pkl')

# Function to extract text from HTML file
def extract_text_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text(separator=' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to predict the category of HTML files
def predict_category_from_html(file_paths):
    texts = []
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        text = extract_text_from_html(html_content)
        cleaned_text = clean_text(text)
        texts.append(cleaned_text)
    predictions = loaded_model.predict(texts)
    return predictions

# Example usage
html_file_paths = [
    "/content/data/data/Balance Sheets/18445487_2.html"]
predicted_categories = predict_category_from_html(html_file_paths)
print(predicted_categories)


['Balance Sheet']


**Using LogisticRegression**

In [65]:
from sklearn.linear_model import LogisticRegression

# Creating a pipeline that combines the TF-IDF vectorizer and a Logistic Regression classifier
pipeline_lr = make_pipeline(TfidfVectorizer(max_features=10000), LogisticRegression())

# Training the model on the training data
pipeline_lr.fit(X_train, y_train)

# Predicting the labels for the test set
y_pred_lr = pipeline_lr.predict(X_test)

# Evaluating the model's performance
accuracy_lr = accuracy_score(y_test, y_pred_lr)
report_lr = classification_report(y_test, y_pred_lr)

print(f'Logistic Regression Accuracy: {accuracy_lr}')
print('Logistic Regression Classification Report:')
print(report_lr)


Logistic Regression Accuracy: 0.9485148514851485
Logistic Regression Classification Report:
                  precision    recall  f1-score   support

   Balance Sheet       0.98      1.00      0.99        51
       Cash Flow       1.00      0.83      0.91         6
Income Statement       0.97      0.94      0.95        66
           Notes       0.98      0.86      0.92       123
          Others       0.92      0.98      0.95       259

        accuracy                           0.95       505
       macro avg       0.97      0.92      0.94       505
    weighted avg       0.95      0.95      0.95       505



**Testing the LR model predictions on a sample**

In [74]:
import joblib
from bs4 import BeautifulSoup

# Saving the trained model to a file
joblib.dump(pipeline, 'text_classification_model1.pkl')

# Loaded the saved model from the file (for later use)
loaded_model = joblib.load('text_classification_model1.pkl')

# Function to clean text
def clean_text(text):
    text = re.sub(r'\W+', ' ', text)
    text = text.lower()
    return text

# Function to extract text from HTML file
def extract_text_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text(separator=' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to predict the category of HTML files
def predict_category_from_html(file_paths):
    texts = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
            text = extract_text_from_html(html_content)
            cleaned_text = clean_text(text)
            texts.append(cleaned_text)
    predictions = loaded_model.predict(texts)
    return predictions

# Example usage
html_file_paths = [
    "/content/data/data/Cash Flow/18599651_table_100.html"
]
predicted_categories = predict_category_from_html(html_file_paths)
print(predicted_categories)


['Cash Flow']


**Best Selected Classification model from different models**

In [77]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re

# Loading the CSV file into a DataFrame
df = pd.read_csv('financial_statements.csv')

# Function to clean the text
def clean_text(text):
    text = re.sub(r'\W+', ' ', text)
    text = text.lower()
    return text

# Applying the clean_text function to the HTML_Text column
df['HTML_Text'] = df['HTML_Text'].apply(clean_text)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['HTML_Text'], df['Target'], test_size=0.2, random_state=42)

# List of different models to evaluate 
models = {
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier()
}
best_model = None
best_accuracy = 0.0


# Evaluating each model
for name, model in models.items():
    pipeline = make_pipeline(TfidfVectorizer(max_features=10000), model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f'{name} Accuracy: {accuracy}')
    print(f'{name} Classification Report:')
    print(report)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

# Saving the best performing model
if best_model is not None:
    joblib.dump(best_model, 'best_model.pkl')



SVM Accuracy: 0.9584158415841584
SVM Classification Report:
                  precision    recall  f1-score   support

   Balance Sheet       1.00      1.00      1.00        51
       Cash Flow       1.00      0.83      0.91         6
Income Statement       0.98      0.97      0.98        66
           Notes       0.96      0.89      0.92       123
          Others       0.94      0.98      0.96       259

        accuracy                           0.96       505
       macro avg       0.98      0.93      0.95       505
    weighted avg       0.96      0.96      0.96       505

Random Forest Accuracy: 0.9366336633663367
Random Forest Classification Report:
                  precision    recall  f1-score   support

   Balance Sheet       0.98      0.98      0.98        51
       Cash Flow       1.00      0.67      0.80         6
Income Statement       1.00      0.94      0.97        66
           Notes       0.93      0.85      0.89       123
          Others       0.92      0.97      0

**Testing the best selected models prediction by giving 2 different html files**

In [82]:
import joblib
from bs4 import BeautifulSoup
# Loading the saved model from the file (for later use)
loaded_model = joblib.load('best_model.pkl')

# Function to clean text
def clean_text(text):
    text = re.sub(r'\W+', ' ', text)
    text = text.lower()
    return text

# Function to extract text from HTML file
def extract_text_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text(separator=' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to predict the category of HTML files
def predict_category_from_html(file_paths):
    texts = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
            text = extract_text_from_html(html_content)
            cleaned_text = clean_text(text)
            texts.append(cleaned_text)
    predictions = loaded_model.predict(texts)
    return predictions



In [83]:
html_file_paths = [
    "/content/data/data/Income Statement/18448274_1.html",
    "/content/data/data/Notes/18599651_table_109.html"
]
predicted_categories = predict_category_from_html(html_file_paths)
print(predicted_categories)


['Income Statement' 'Notes']
