In [1]:
import pandas as pd
import spacy
from spacy.training.example import Example
import random

In [2]:
df = pd.read_csv(r"..\data\train\merged_train_sentences.csv")


In [3]:
df.head()

Unnamed: 0,Sentences,Category
0,The stock market experienced a significant ris...,financial
1,Investing in mutual funds can be a good way to...,financial
2,The Federal Reserve announced an increase in i...,financial
3,Cryptocurrencies like Bitcoin are highly volat...,financial
4,It's important to have an emergency fund cover...,financial


In [4]:
df

Unnamed: 0,Sentences,Category
0,The stock market experienced a significant ris...,financial
1,Investing in mutual funds can be a good way to...,financial
2,The Federal Reserve announced an increase in i...,financial
3,Cryptocurrencies like Bitcoin are highly volat...,financial
4,It's important to have an emergency fund cover...,financial
...,...,...
474,The parties agree to seek a mediator to assist...,legal
475,The vendor must comply with all relevant produ...,legal
476,The user must notify the company of any issues...,legal
477,The company reserves the right to adjust its p...,legal


In [5]:
import pandas as pd
import random
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [6]:
df.columns = df.columns.str.replace('ï»¿', '')


In [7]:
category_mapping = {'healthcare': 0, 'financial': 1, 'legal': 2}
df['Category'] = df['Category'].map(category_mapping)

In [8]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)


In [9]:
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

In [10]:
train_texts = train_data['Sentences'].tolist()
train_labels = train_data['Category'].tolist()

test_texts = test_data['Sentences'].tolist()
test_labels = test_data['Category'].tolist()

In [11]:
nlp = spacy.blank("en")

In [12]:
if "textcat" not in nlp.pipe_names:
    textcat = nlp.add_pipe("textcat", last=True)
else:
    textcat = nlp.get_pipe("textcat")

In [13]:
categories = [0, 1, 2]
for category in categories:
    textcat.add_label(str(category))

In [14]:
train_data_spacy = []
for text, label in zip(train_texts, train_labels):
    cats = {str(cat): label == cat for cat in categories}
    train_data_spacy.append((text, {"cats": cats}))

In [15]:
n_iter = 10
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]

In [16]:
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for i in range(n_iter):
        losses = {}
        random.shuffle(train_data_spacy)
        batches = minibatch(train_data_spacy, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            examples = [Example.from_dict(nlp.make_doc(text), annotation) for text, annotation in zip(texts, annotations)]
            nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)
        print(f"Iteration {i+1}: Losses {losses}")

Iteration 1: Losses {'textcat': 15.53426352594397}
Iteration 2: Losses {'textcat': 4.653038211175954}
Iteration 3: Losses {'textcat': 1.0417724006226114}
Iteration 4: Losses {'textcat': 0.5893221140246774}
Iteration 5: Losses {'textcat': 0.0005196181831224785}
Iteration 6: Losses {'textcat': 0.19651278397270425}
Iteration 7: Losses {'textcat': 0.006236630314785609}
Iteration 8: Losses {'textcat': 0.10219073613012833}
Iteration 9: Losses {'textcat': 0.000425344640231365}
Iteration 10: Losses {'textcat': 0.005869759906665254}


In [17]:
output_dir = r"..\model_training\trained_model"
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")


Model saved to ..\model_training\trained_model


In [18]:
nlp = spacy.load(output_dir)


In [19]:
predicted_labels = []
for text in test_texts:
    doc = nlp(text)
    predicted_label = max(doc.cats, key=doc.cats.get)  # Get the label with the highest confidence
    predicted_labels.append(int(predicted_label))

In [20]:
report = classification_report(test_labels, predicted_labels, target_names=['healthcare', 'financial', 'legal'])
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

  healthcare       0.88      1.00      0.93        21
   financial       1.00      0.86      0.93        29
       legal       0.96      0.98      0.97        46

    accuracy                           0.95        96
   macro avg       0.94      0.95      0.94        96
weighted avg       0.95      0.95      0.95        96



In [21]:
conf_matrix = confusion_matrix(test_labels, predicted_labels, labels=categories)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[21  0  0]
 [ 2 25  2]
 [ 1  0 45]]


In [22]:
test_csv_path = r'..\data\train\merged_train_sentences.csv'  # Update with actual test CSV file path
test_df = pd.read_csv(test_csv_path)

In [23]:
test_texts = test_df['Sentences'].tolist()
test_labels = test_df['Category'].map(category_mapping).tolist()

In [24]:
predicted_labels = []
for text in test_texts:
    doc = nlp(text)
    predicted_label = max(doc.cats, key=doc.cats.get)  # Get the label with the highest confidence
    predicted_labels.append(int(predicted_label))

In [25]:
report = classification_report(test_labels, predicted_labels, target_names=['healthcare', 'financial', 'legal'])
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

  healthcare       0.98      1.00      0.99       129
   financial       1.00      0.97      0.98       126
       legal       0.99      1.00      0.99       224

    accuracy                           0.99       479
   macro avg       0.99      0.99      0.99       479
weighted avg       0.99      0.99      0.99       479



In [26]:
conf_matrix = confusion_matrix(test_labels, predicted_labels, labels=categories)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[129   0   0]
 [  2 122   2]
 [  1   0 223]]


In [27]:
test_csv_path2 = r'..\data\test\output.csv'  # Update with actual test CSV file path
test_df2 = pd.read_csv(test_csv_path2)

In [28]:
test_texts = test_df['Sentences'].tolist()
test_labels = test_df['Category'].map(category_mapping).tolist()

In [29]:
predicted_labels = []
for text in test_texts:
    doc = nlp(text)
    predicted_label = max(doc.cats, key=doc.cats.get)  # Get the label with the highest confidence
    predicted_labels.append(int(predicted_label))

In [30]:
report = classification_report(test_labels, predicted_labels, target_names=['healthcare', 'financial', 'legal'])
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

  healthcare       0.98      1.00      0.99       129
   financial       1.00      0.97      0.98       126
       legal       0.99      1.00      0.99       224

    accuracy                           0.99       479
   macro avg       0.99      0.99      0.99       479
weighted avg       0.99      0.99      0.99       479

