In [5]:
# Preprocess Data
def preprocess_data(df):
    """Split the dataset into case texts and outcome labels.

    Missing ``case_text`` values are replaced with an empty string so that
    every row yields a string. No rows are dropped. The input DataFrame is
    left untouched, which keeps the cell safe to re-run and avoids writing
    into a frame that may be a slice of another one.

    Args:
        df: DataFrame with ``case_text`` and ``case_outcome`` columns.

    Returns:
        Tuple ``(X, y)``: ``X`` is the ``case_text`` Series with NaN replaced
        by ``''``; ``y`` is the ``case_outcome`` Series.

    Raises:
        KeyError: If either required column is missing.
    """
    # fillna returns a new Series, so the caller's frame is not modified.
    X = df['case_text'].fillna('')
    y = df['case_outcome']
    return X, y


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os

# Text file that persists the best accuracy recorded so far between runs.
# save_best_model reads it as a single float and overwrites it on improvement.
# The path is relative, so it is resolved against the current working directory.
BEST_ACCURACY_FILE = "best_accuracy.txt"

# Load Dataset
def load_dataset(csv_file):
    """Read the legal case CSV at ``csv_file`` and return it as a DataFrame."""
    return pd.read_csv(csv_file)

# Preprocess Data
def preprocess_data(df):
    """Extract the model inputs (case texts) and targets (case outcomes).

    Rows are never removed: a missing ``case_text`` is replaced with an empty
    string instead. The replacement is done on a copy of the column, so the
    DataFrame passed in is not modified.

    Args:
        df: DataFrame with ``case_text`` and ``case_outcome`` columns.

    Returns:
        Tuple ``(X, y)``: ``X`` is the ``case_text`` Series with NaN replaced
        by ``''``; ``y`` is the ``case_outcome`` Series.

    Raises:
        KeyError: If either required column is missing.
    """
    # fillna returns a new Series; assigning it back into df would mutate the
    # caller's data and can trigger SettingWithCopyWarning on sliced frames.
    X = df['case_text'].fillna('')
    y = df['case_outcome']
    return X, y

# Define Model
def create_model():
    """Build the untrained Logistic Regression classifier.

    Uses a fixed ``random_state`` and raises the iteration cap to 1000.
    """
    return LogisticRegression(random_state=42, max_iter=1000)

# Save the Best Model
def save_best_model(model, vectorizer, accuracy):
    """Save the model and vectorizer if accuracy beats the previous best.

    The previous best is read from ``BEST_ACCURACY_FILE``. It only counts when
    the saved artifacts it refers to still exist and the record parses as a
    float; otherwise the previous best is treated as 0.0. This prevents an
    empty or corrupted record from aborting the run, and a leftover record
    from blocking the save while no model file is on disk.

    Args:
        model: Fitted classifier to persist.
        vectorizer: Fitted TF-IDF vectorizer to persist alongside the model.
        accuracy: Accuracy of ``model`` on the held-out split.

    Returns:
        None. Writes ``legal_case_classifier.pkl``, ``tfidf_vectorizer.pkl``
        and ``BEST_ACCURACY_FILE`` when the model is saved; prints the outcome
        either way.
    """
    model_path = 'legal_case_classifier.pkl'
    vectorizer_path = 'tfidf_vectorizer.pkl'

    best_accuracy = 0.0
    # A recorded accuracy is meaningless if the artifacts it describes are gone.
    artifacts_present = all(os.path.exists(p) for p in (model_path, vectorizer_path))
    if artifacts_present and os.path.exists(BEST_ACCURACY_FILE):
        with open(BEST_ACCURACY_FILE, "r") as f:
            recorded = f.read().strip()
        try:
            best_accuracy = float(recorded)
        except ValueError:
            # Empty or corrupted record: fall back to 0.0 instead of crashing.
            print(f"Ignoring unreadable best accuracy record: {recorded!r}")

    # Compare and save if current accuracy is better
    if accuracy > best_accuracy:
        joblib.dump(model, model_path)
        joblib.dump(vectorizer, vectorizer_path)
        with open(BEST_ACCURACY_FILE, "w") as f:
            f.write(str(accuracy))
        print(f"New best model saved with accuracy: {accuracy}")
    else:
        print(f"Model not saved. Current accuracy: {accuracy}, Best accuracy: {best_accuracy}")

# Train Model
def train_model(X, y):
    """Train, evaluate and conditionally persist a TF-IDF text classifier.

    The data is split 80/20 with a fixed seed, a TF-IDF vectorizer is fitted
    on the training split, and the model returned by ``create_model`` is
    trained on the resulting features. The classification report and accuracy
    for the held-out split are printed, then ``save_best_model`` decides
    whether to write the model to disk.

    Args:
        X: Case texts (e.g. a pandas Series of strings).
        y: Outcome labels aligned with ``X``.

    Returns:
        Tuple ``(model, vectorizer)``: the fitted classifier and the fitted
        TF-IDF vectorizer.
    """
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the vocabulary on the training split only; the test split is just transformed.
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Initialize and train the Logistic Regression model
    model = create_model()
    model.fit(X_train_tfidf, y_train)

    # Evaluate Model
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print("Model Performance:")
    # zero_division=0 still reports 0.00 for classes that are never predicted,
    # but without the repeated UndefinedMetricWarning output.
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Accuracy:", accuracy)

    # Save the best model
    save_best_model(model, vectorizer, accuracy)

    return model, vectorizer

# Load Pre-trained Model and Make Predictions
def predict_case(case_text):
    """Return the predicted outcome label for a single case text.

    The persisted classifier and TF-IDF vectorizer are reloaded from the
    working directory on every call.
    """
    # NOTE: joblib files are pickles; only load artifacts written by this notebook.
    classifier = joblib.load('legal_case_classifier.pkl')
    tfidf = joblib.load('tfidf_vectorizer.pkl')

    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf.transform([case_text])
    return classifier.predict(features)[0]

# Main
# Main: runs the whole pipeline (load -> preprocess -> train/evaluate/save -> sample prediction)
if __name__ == "__main__":
    # Load the dataset
    dataset = load_dataset("legal_text_classification1.csv")  # Replace with your dataset file
    X, y = preprocess_data(dataset)
    
    # Train the model (train_model also calls save_best_model, which writes
    # the artifacts to disk only when the accuracy beats the recorded best)
    model, vectorizer = train_model(X, y)
    
    # Test prediction (optional)
    # predict_case reloads the artifacts from disk, so this exercises the best
    # saved model, which is not necessarily the one trained just above.
    test_case = "The supplier failed to deliver the goods on time as per the contract."
    result = predict_case(test_case)
    print("Prediction for the test case:", result)


Model Performance:
               precision    recall  f1-score   support

     affirmed       0.50      0.09      0.16        32
      applied       0.35      0.09      0.14       515
     approved       0.00      0.00      0.00        19
        cited       0.55      0.91      0.69      2457
   considered       0.31      0.06      0.10       324
    discussed       0.34      0.06      0.10       205
distinguished       0.62      0.04      0.08       122
     followed       0.43      0.12      0.19       436
  referred to       0.47      0.31      0.38       859
      related       1.00      0.04      0.07        28

     accuracy                           0.53      4997
    macro avg       0.46      0.17      0.19      4997
 weighted avg       0.48      0.53      0.45      4997

Accuracy: 0.528717230338203
New best model saved with accuracy: 0.528717230338203
Prediction for the test case: cited


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
import os
import xgboost as xgb

# Text file that persists the best accuracy recorded so far between runs.
# save_best_model reads it as a single float and overwrites it on improvement.
# NOTE(review): the Logistic Regression cell uses the same file name, so both
# model types currently share one best-accuracy record - confirm this is intended.
BEST_ACCURACY_FILE = "best_accuracy.txt"

# Load Dataset
def load_dataset(csv_file):
    """Return the contents of the legal case CSV ``csv_file`` as a DataFrame."""
    return pd.read_csv(csv_file)

# Preprocess Data
def preprocess_data(df):
    """Extract case texts and integer-encoded outcome labels.

    Rows are never removed: a missing ``case_text`` is replaced with an empty
    string instead. The replacement is done on a copy of the column, so the
    DataFrame passed in is not modified.

    Args:
        df: DataFrame with ``case_text`` and ``case_outcome`` columns.

    Returns:
        Tuple ``(X, y, label_encoder)``: ``X`` is the ``case_text`` Series with
        NaN replaced by ``''``; ``y`` is the array of integer class ids
        produced by the encoder; ``label_encoder`` is the fitted
        ``LabelEncoder``, needed later to map predictions back to label
        strings.

    Raises:
        KeyError: If either required column is missing.
    """
    # fillna returns a new Series, so the caller's frame is not modified.
    X = df['case_text'].fillna('')

    # Encode string labels to numeric class ids
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['case_outcome'])

    return X, y, label_encoder

# Define Model
def create_model():
    """Create the untrained XGBoost classifier.

    ``use_label_encoder`` is deliberately not passed. The installed XGBoost
    no longer uses it and prints
    ``Parameters: { "use_label_encoder" } are not used.`` when it is given.
    Labels are already integer-encoded by ``preprocess_data``.

    Returns:
        An ``xgb.XGBClassifier`` with 1000 trees, learning rate 0.05,
        max depth 6 and a fixed random seed.
    """
    model = xgb.XGBClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        random_state=42,
        eval_metric='mlogloss'  # multi-class log loss, set explicitly
    )
    return model

# Save the Best Model
def save_best_model(model, vectorizer, label_encoder, accuracy):
    """Save the model, vectorizer and label encoder if accuracy beats the previous best.

    The previous best is read from ``BEST_ACCURACY_FILE``. It only counts when
    all saved artifacts it refers to still exist and the record parses as a
    float; otherwise the previous best is treated as 0.0. This prevents an
    empty or corrupted record from aborting the run. It also prevents a
    leftover record from blocking the save while the files ``predict_case``
    needs are missing.

    Args:
        model: Fitted classifier to persist.
        vectorizer: Fitted TF-IDF vectorizer to persist alongside the model.
        label_encoder: Fitted label encoder used to decode predictions.
        accuracy: Accuracy of ``model`` on the held-out split.

    Returns:
        None. Writes ``legal_case_classifier_xgb.pkl``,
        ``tfidf_vectorizer.pkl``, ``label_encoder.pkl`` and
        ``BEST_ACCURACY_FILE`` when the model is saved; prints the outcome
        either way.
    """
    model_path = 'legal_case_classifier_xgb.pkl'
    vectorizer_path = 'tfidf_vectorizer.pkl'
    encoder_path = 'label_encoder.pkl'

    best_accuracy = 0.0
    # A recorded accuracy is meaningless if the artifacts it describes are gone.
    artifacts_present = all(
        os.path.exists(p) for p in (model_path, vectorizer_path, encoder_path)
    )
    if artifacts_present and os.path.exists(BEST_ACCURACY_FILE):
        with open(BEST_ACCURACY_FILE, "r") as f:
            recorded = f.read().strip()
        try:
            best_accuracy = float(recorded)
        except ValueError:
            # Empty or corrupted record: fall back to 0.0 instead of crashing.
            print(f"Ignoring unreadable best accuracy record: {recorded!r}")

    # Compare and save if current accuracy is better
    if accuracy > best_accuracy:
        joblib.dump(model, model_path)
        joblib.dump(vectorizer, vectorizer_path)
        joblib.dump(label_encoder, encoder_path)
        with open(BEST_ACCURACY_FILE, "w") as f:
            f.write(str(accuracy))
        print(f"New best model saved with accuracy: {accuracy}")
    else:
        print(f"Model not saved. Current accuracy: {accuracy}, Best accuracy: {best_accuracy}")

# Train Model
def train_model(X, y):
    """Train and evaluate a TF-IDF + XGBoost classifier.

    The data is split 80/20 with a fixed seed, a TF-IDF vectorizer is fitted
    on the training split, and the model returned by ``create_model`` is
    trained on the resulting features. The classification report and accuracy
    for the held-out split are printed. Nothing is written to disk here.

    Args:
        X: Case texts (e.g. a pandas Series of strings).
        y: Integer-encoded outcome labels aligned with ``X``.

    Returns:
        Tuple ``(model, vectorizer, accuracy)``: the fitted classifier, the
        fitted TF-IDF vectorizer and the held-out accuracy.
    """
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the vocabulary on the training split only; the test split is just transformed.
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Initialize and train the XGBoost model
    model = create_model()
    model.fit(X_train_tfidf, y_train)

    # Evaluate Model
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print("Model Performance:")
    # zero_division=0 still reports 0.00 for classes that are never predicted,
    # but without raising UndefinedMetricWarning.
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Accuracy:", accuracy)

    return model, vectorizer, accuracy

# Load Pre-trained Model and Make Predictions
def predict_case(case_text):
    """Return the predicted outcome label (as a string) for one case text.

    The persisted XGBoost classifier, TF-IDF vectorizer and label encoder are
    reloaded from the working directory on every call.
    """
    # NOTE: joblib files are pickles; only load artifacts written by this notebook.
    classifier = joblib.load('legal_case_classifier_xgb.pkl')
    tfidf = joblib.load('tfidf_vectorizer.pkl')
    encoder = joblib.load('label_encoder.pkl')

    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf.transform([case_text])

    # The classifier outputs integer class ids; map them back to label strings.
    class_ids = classifier.predict(features)
    return encoder.inverse_transform(class_ids)[0]

# Main
# Main: runs the whole pipeline (load -> preprocess -> train/evaluate -> save -> sample prediction)
if __name__ == "__main__":
    # Load the dataset
    dataset = load_dataset("legal_text_classification1.csv")  # Replace with your dataset file
    
    # Preprocess the data and get the fitted label encoder (needed to decode predictions)
    X, y, label_encoder = preprocess_data(dataset)
    
    # Train the model and evaluate it on the held-out split
    model, vectorizer, accuracy = train_model(X, y)
    
    # Save the model only if it beats the accuracy recorded in BEST_ACCURACY_FILE
    save_best_model(model, vectorizer, label_encoder, accuracy)
    
    # Test prediction (optional)
    # predict_case reloads the artifacts from disk, so this exercises the best
    # saved model, which is not necessarily the one trained just above.
    test_case = "The supplier failed to deliver the goods on time as per the contract."
    result = predict_case(test_case)
    print("Prediction for the test case:", result)


Parameters: { "use_label_encoder" } are not used.



Model Performance:
              precision    recall  f1-score   support

           0       0.60      0.28      0.38        32
           1       0.46      0.17      0.24       515
           2       0.00      0.00      0.00        19
           3       0.58      0.92      0.71      2457
           4       0.43      0.16      0.23       324
           5       0.35      0.11      0.16       205
           6       0.68      0.33      0.44       122
           7       0.57      0.22      0.32       436
           8       0.61      0.36      0.45       859
           9       0.60      0.11      0.18        28

    accuracy                           0.57      4997
   macro avg       0.49      0.27      0.31      4997
weighted avg       0.55      0.57      0.52      4997

Accuracy: 0.573944366619972
New best model saved with accuracy: 0.573944366619972
Prediction for the test case: cited


In [2]:
!pip install --upgrade scikit-learn
!pip install --upgrade xgboost




In [14]:
!pip install --upgrade xgboost




In [5]:
!pip install xgboost

