In [5]:
# Preprocess Data
def preprocess_data(df):
    """Extract the case text (features) and case outcome (labels) from ``df``.

    Missing ``case_text`` values are replaced with an empty string; no rows
    are dropped. The replacement is made on a new Series, so the caller's
    DataFrame is left unmodified and the call can be repeated safely.

    Args:
        df: DataFrame containing ``case_text`` and ``case_outcome`` columns.

    Returns:
        A ``(X, y)`` tuple where ``X`` is the ``case_text`` Series with NaN
        replaced by ``''`` and ``y`` is the ``case_outcome`` Series.

    Raises:
        KeyError: If either column is absent from ``df``.
    """
    # fillna returns a new Series; assigning it back into df would mutate the input.
    X = df['case_text'].fillna('')  # Text of the case
    y = df['case_outcome']  # Labels, returned unchanged
    return X, y


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os

# Plain-text file holding the highest accuracy achieved so far; save_best_model
# reads it to decide whether the saved model artifacts should be overwritten.
BEST_ACCURACY_FILE = "best_accuracy.txt"

# Load Dataset
def load_dataset(csv_file):
    """Read the legal case CSV into a DataFrame.

    Args:
        csv_file: Source handed directly to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(csv_file)

# Preprocess Data
def preprocess_data(df):
    """Extract the case text (features) and case outcome (labels) from ``df``.

    Missing ``case_text`` values are replaced with an empty string; no rows
    are dropped. The replacement is made on a new Series, so the caller's
    DataFrame is left unmodified and the call can be repeated safely.

    Args:
        df: DataFrame containing ``case_text`` and ``case_outcome`` columns.

    Returns:
        A ``(X, y)`` tuple where ``X`` is the ``case_text`` Series with NaN
        replaced by ``''`` and ``y`` is the ``case_outcome`` Series.

    Raises:
        KeyError: If either column is absent from ``df``.
    """
    # fillna returns a new Series; assigning it back into df would mutate the input.
    X = df['case_text'].fillna('')  # Text of the case
    y = df['case_outcome']  # Labels, returned unchanged
    return X, y

# Define Model
def create_model():
    """Build the untrained Logistic Regression classifier.

    Returns:
        A ``LogisticRegression`` instance configured with ``max_iter=1000``
        and ``random_state=42``.
    """
    return LogisticRegression(random_state=42, max_iter=1000)

# Save the Best Model
def save_best_model(model, vectorizer, accuracy):
    """Persist the model and vectorizer when ``accuracy`` beats the stored best.

    The best accuracy so far is kept as text in ``BEST_ACCURACY_FILE``. If that
    file is missing, or its content cannot be parsed as a float (for example it
    is empty or was hand-edited), the previous best is taken to be 0.0 rather
    than raising after training has already finished.

    Args:
        model: Fitted classifier, serialised with ``joblib.dump``.
        vectorizer: Fitted vectorizer, serialised alongside the model.
        accuracy: Accuracy achieved by ``model``.

    Returns:
        None. When ``accuracy`` is strictly greater than the stored best, writes
        ``legal_case_classifier.pkl``, ``tfidf_vectorizer.pkl`` and
        ``BEST_ACCURACY_FILE``; otherwise nothing is written.
    """
    best_accuracy = 0.0
    if os.path.exists(BEST_ACCURACY_FILE):
        with open(BEST_ACCURACY_FILE, "r") as f:
            stored_value = f.read().strip()
        try:
            best_accuracy = float(stored_value)
        except ValueError:
            # Unreadable record: keep 0.0 so the freshly trained model can still be saved.
            print(f"Ignoring unreadable value in {BEST_ACCURACY_FILE}: {stored_value!r}")

    # Strict comparison: on a tie the previously saved model is kept.
    if accuracy > best_accuracy:
        joblib.dump(model, 'legal_case_classifier.pkl')
        joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
        # Record the new best only after both artifacts have been written.
        with open(BEST_ACCURACY_FILE, "w") as f:
            f.write(str(accuracy))
        print(f"New best model saved with accuracy: {accuracy}")
    else:
        print(f"Model not saved. Current accuracy: {accuracy}, Best accuracy: {best_accuracy}")

# Train Model
def train_model(X, y):
    """Train, evaluate and conditionally persist a TF-IDF + Logistic Regression model.

    Steps: hold out 20% of the data, fit a TF-IDF vectorizer on the training
    split, fit the classifier from ``create_model``, print a classification
    report and the accuracy on the held-out split, then hand the fitted objects
    to ``save_best_model``.

    Args:
        X: Iterable of case texts.
        y: Labels aligned with ``X``.

    Returns:
        A ``(model, vectorizer)`` tuple holding the objects fitted in this call,
        whether or not they were written to disk.
    """
    # Fixed random_state keeps the train/test partition identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # The vectorizer is fitted on the training split only; the test split is
    # merely transformed with the vocabulary learned from training data.
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Initialize and train the Logistic Regression model
    model = create_model()
    model.fit(X_train_tfidf, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print("Model Performance:")
    # zero_division=0 reports 0 for a class that is never predicted, the same
    # number the default setting prints, without emitting a warning each time.
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Accuracy:", accuracy)

    # Persist only if this run beats the stored best accuracy
    save_best_model(model, vectorizer, accuracy)

    return model, vectorizer

# Load Pre-trained Model and Make Predictions
def predict_case(case_text):
    """Classify one case description using the artifacts saved on disk.

    Args:
        case_text: Raw text of the case to classify.

    Returns:
        The first (and only) element of the model's prediction for
        ``case_text``.
    """
    # Both artifacts are read from disk on every call.
    classifier = joblib.load('legal_case_classifier.pkl')
    tfidf = joblib.load('tfidf_vectorizer.pkl')

    # A single document is wrapped in a list before being vectorized.
    features = tfidf.transform([case_text])
    return classifier.predict(features)[0]

# Main
if __name__ == "__main__":
    # End-to-end run: load the CSV, train and evaluate, then try one prediction.
    # Load the dataset
    dataset = load_dataset("legal_text_classification1.csv")  # Replace with your dataset file
    X, y = preprocess_data(dataset)
    
    # Train the model; train_model also saves it when it beats the stored best accuracy
    model, vectorizer = train_model(X, y)
    
    # Test prediction (optional).
    # NOTE: predict_case loads the artifacts from disk, so this exercises the
    # saved best model, which is not necessarily the one trained just above.
    test_case = "The supplier failed to deliver the goods on time as per the contract."
    result = predict_case(test_case)
    print("Prediction for the test case:", result)


Model Performance:
               precision    recall  f1-score   support

     affirmed       0.50      0.09      0.16        32
      applied       0.35      0.09      0.14       515
     approved       0.00      0.00      0.00        19
        cited       0.55      0.91      0.69      2457
   considered       0.31      0.06      0.10       324
    discussed       0.34      0.06      0.10       205
distinguished       0.62      0.04      0.08       122
     followed       0.43      0.12      0.19       436
  referred to       0.47      0.31      0.38       859
      related       1.00      0.04      0.07        28

     accuracy                           0.53      4997
    macro avg       0.46      0.17      0.19      4997
 weighted avg       0.48      0.53      0.45      4997

Accuracy: 0.528717230338203
New best model saved with accuracy: 0.528717230338203
Prediction for the test case: cited


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
