In [2]:
import pandas as pd
import spacy
from spacy.training.example import Example
import random

In [8]:
df = pd.read_csv(r"../data/train/merged_train_sentences.csv")


In [9]:
df.head()

Unnamed: 0,Sentences,Category
0,The stock market experienced a significant ris...,financial
1,Investing in mutual funds can be a good way to...,financial
2,The Federal Reserve announced an increase in i...,financial
3,Cryptocurrencies like Bitcoin are highly volat...,financial
4,It's important to have an emergency fund cover...,financial


In [10]:
df

Unnamed: 0,Sentences,Category
0,The stock market experienced a significant ris...,financial
1,Investing in mutual funds can be a good way to...,financial
2,The Federal Reserve announced an increase in i...,financial
3,Cryptocurrencies like Bitcoin are highly volat...,financial
4,It's important to have an emergency fund cover...,financial
...,...,...
3406,Legal prohibitions exist against insider tradi...,legal
3407,Surveillance laws regulate the use of electron...,legal
3408,Legal instruments such as affidavits serve as ...,legal
3409,Public safety laws govern the storage and hand...,legal


In [11]:
import pandas as pd
import random
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [12]:
df.columns = df.columns.str.replace('ï»¿', '')


In [13]:
category_mapping = {'healthcare': 0, 'financial': 1, 'legal': 2}
df['Category'] = df['Category'].map(category_mapping)

In [14]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)


In [15]:
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

In [16]:
train_texts = train_data['Sentences'].tolist()
train_labels = train_data['Category'].tolist()

test_texts = test_data['Sentences'].tolist()
test_labels = test_data['Category'].tolist()

In [17]:
nlp = spacy.blank("en")

In [18]:
if "textcat" not in nlp.pipe_names:
    textcat = nlp.add_pipe("textcat", last=True)
else:
    textcat = nlp.get_pipe("textcat")

In [19]:
categories = [0, 1, 2]
for category in categories:
    textcat.add_label(str(category))

In [20]:
train_data_spacy = []
for text, label in zip(train_texts, train_labels):
    cats = {str(cat): label == cat for cat in categories}
    train_data_spacy.append((text, {"cats": cats}))

In [21]:
n_iter = 30
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]

In [22]:
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for i in range(n_iter):
        losses = {}
        random.shuffle(train_data_spacy)
        batches = minibatch(train_data_spacy, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            examples = [Example.from_dict(nlp.make_doc(text), annotation) for text, annotation in zip(texts, annotations)]
            nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)
        print(f"Iteration {i+1}: Losses {losses}")

Iteration 1: Losses {'textcat': 49.40380812653811}
Iteration 2: Losses {'textcat': 8.191742793805014}
Iteration 3: Losses {'textcat': 2.779398442162477}
Iteration 4: Losses {'textcat': 0.9411338781785565}
Iteration 5: Losses {'textcat': 1.5730814316157502}
Iteration 6: Losses {'textcat': 0.9220932093946635}
Iteration 7: Losses {'textcat': 0.810689210555406}
Iteration 8: Losses {'textcat': 1.556076085887159}
Iteration 9: Losses {'textcat': 1.0167609167563028}
Iteration 10: Losses {'textcat': 1.1367946250868866}
Iteration 11: Losses {'textcat': 0.8938387533086551}
Iteration 12: Losses {'textcat': 1.3803403032367718}
Iteration 13: Losses {'textcat': 1.1089562642400215}
Iteration 14: Losses {'textcat': 1.2621613832149665}
Iteration 15: Losses {'textcat': 1.0617662477107104}
Iteration 16: Losses {'textcat': 1.2697210776457684}
Iteration 17: Losses {'textcat': 0.786153565616201}
Iteration 18: Losses {'textcat': 0.5662016481094749}
Iteration 19: Losses {'textcat': 0.11901788413824159}
Iterati

In [23]:
output_dir = r"trained_model/"
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")


Model saved to trained_model/


In [24]:
nlp = spacy.load(output_dir)


In [25]:
predicted_labels = []
for text in test_texts:
    doc = nlp(text)
    predicted_label = max(doc.cats, key=doc.cats.get)  # Get the label with the highest confidence
    predicted_labels.append(int(predicted_label))

In [26]:
report = classification_report(test_labels, predicted_labels, target_names=['healthcare', 'financial', 'legal'])
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

  healthcare       0.95      0.98      0.97       198
   financial       0.97      0.98      0.98       250
       legal       0.99      0.94      0.97       235

    accuracy                           0.97       683
   macro avg       0.97      0.97      0.97       683
weighted avg       0.97      0.97      0.97       683



In [27]:
conf_matrix = confusion_matrix(test_labels, predicted_labels, labels=categories)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[195   2   1]
 [  3 246   1]
 [  8   6 221]]


In [28]:
test_csv_path = r'../data/train/merged_train_sentences.csv'  # Update with actual test CSV file path
test_df = pd.read_csv(test_csv_path)

In [29]:
test_texts = test_df['Sentences'].tolist()
test_labels = test_df['Category'].map(category_mapping).tolist()

In [30]:
predicted_labels = []
for text in test_texts:
    doc = nlp(text)
    predicted_label = max(doc.cats, key=doc.cats.get)  # Get the label with the highest confidence
    predicted_labels.append(int(predicted_label))

In [31]:
report = classification_report(test_labels, predicted_labels, target_names=['healthcare', 'financial', 'legal'])
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

  healthcare       0.99      1.00      0.99      1121
   financial       0.99      1.00      0.99      1109
       legal       1.00      0.99      0.99      1181

    accuracy                           0.99      3411
   macro avg       0.99      0.99      0.99      3411
weighted avg       0.99      0.99      0.99      3411



In [32]:
conf_matrix = confusion_matrix(test_labels, predicted_labels, labels=categories)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[1118    2    1]
 [   3 1105    1]
 [  10    6 1165]]


In [33]:
test_csv_path2 = r'../data/test/output.csv'  # Update with actual test CSV file path
test_df2 = pd.read_csv(test_csv_path2)

In [34]:
test_texts = test_df['Sentences'].tolist()
test_labels = test_df['Category'].map(category_mapping).tolist()

In [35]:
predicted_labels = []
for text in test_texts:
    doc = nlp(text)
    predicted_label = max(doc.cats, key=doc.cats.get)  # Get the label with the highest confidence
    predicted_labels.append(int(predicted_label))

In [36]:
report = classification_report(test_labels, predicted_labels, target_names=['healthcare', 'financial', 'legal'])
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

  healthcare       0.99      1.00      0.99      1121
   financial       0.99      1.00      0.99      1109
       legal       1.00      0.99      0.99      1181

    accuracy                           0.99      3411
   macro avg       0.99      0.99      0.99      3411
weighted avg       0.99      0.99      0.99      3411

