# Baseline Model Development
This notebook builds a TF-IDF + linear SVM baseline for binary (yes/no) contract clause classification, using the `cuad_audit_rights` task from the LegalBench dataset (`nguha/legalbench`).

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

## Load and Preprocess Dataset

In [None]:
# Load dataset
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# dataset repository to run locally -- confirm the source is trusted before use.
dataset = load_dataset("nguha/legalbench", "cuad_audit_rights", trust_remote_code=True)
df = pd.DataFrame(dataset['test'])  # Using test split for demo; adjust as needed

# Fail fast with a readable message instead of an AttributeError raised from
# inside a string operation when a row has no text or no answer.
assert df[['text', 'answer']].notna().all().all(), "Missing values in 'text' or 'answer'"

# Normalise the clause text with vectorized string ops (trim, then lower-case).
df['cleaned_text'] = df['text'].str.strip().str.lower()
# Binary target: 1 when the answer is 'yes' (ignoring case and surrounding
# whitespace), 0 for any other answer.
df['label'] = df['answer'].str.strip().str.lower().eq('yes').astype(int)

# Split data
# Stratify on the label so both splits keep the same class ratio; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)
print(f'Training set size: {len(X_train)}, Test set size: {len(X_test)}')

## TF-IDF Vectorization and SVM Training

In [None]:
# Feature extraction: TF-IDF capped at 5000 features, with scikit-learn's
# built-in English stop-word list removed.
tfidf_settings = dict(max_features=5000, stop_words='english')
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vectorizer is fitted on the training split only; the test split is just
# transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Classifier: support vector machine with a linear kernel, fitted on the
# training features. The fit call stays last so the cell displays the estimator.
svm_settings = dict(kernel='linear', random_state=42)
svm = SVC(**svm_settings)
svm.fit(X_train_tfidf, y_train)

## Evaluate SVM and Visualize

In [None]:
# Evaluate SVM
# Score the fitted model on the held-out split. Precision/recall/F1 use the
# default positive class, i.e. label 1 (answer == 'yes').
y_pred = svm.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 yields the same 0.0 the default falls back to when a metric
# is undefined (e.g. no positive predictions), without the UndefinedMetricWarning.
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

# Confusion Matrix
# Pin the class order so the matrix is always 2x2 (rows = true class,
# columns = predicted class), even if one class is absent from y_test or y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
class_names = ['No', 'Yes']  # label 0 (any non-'yes' answer) / label 1 ('yes')

# Explicit Axes interface; named tick labels let the figure stand alone.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('SVM Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

## Save Model and Vectorizer

In [None]:
# Save model and vectorizer
# Both fitted objects are written to the current working directory: the
# vectorizer is needed to turn new text into the feature space the SVM was
# trained on, so the two files belong together.
# NOTE: joblib files are pickle-based -- only load them from trusted sources.
joblib.dump(svm, 'svm_baseline_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')