In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.utils import class_weight
from typing import Tuple, Any, Dict

class MultiAlgorithmClassifier:
    """Text classifier: a TF-IDF vectoriser feeding one of three estimators."""

    def __init__(self, algorithm: str = 'rf', max_features: int = 1500):
        """
        Build the TF-IDF + estimator pipeline for the requested algorithm.

        Args:
            algorithm (str): Estimator code. 'rf' selects Random Forest and
                           'svm' selects a Support Vector Machine; any other
                           value (including 'lr') selects Logistic Regression.
            max_features (int): Cap on the TF-IDF vocabulary size.
        """
        self.algorithm = algorithm

        estimator = self._make_estimator(algorithm)

        # Word 1- to 3-grams, English stop words removed; min_df/max_df prune
        # very rare and very common terms.
        tfidf = TfidfVectorizer(
            ngram_range=(1, 3),
            stop_words='english',
            min_df=2,
            max_df=0.95,
            max_features=max_features
        )

        self.pipeline = Pipeline([
            ('vectorizer', tfidf),
            ('classifier', estimator)
        ])

    @staticmethod
    def _make_estimator(algorithm: str) -> Any:
        """Return a fresh, unfitted estimator for the given algorithm code."""
        if algorithm == 'rf':
            return RandomForestClassifier(
                n_estimators=200,
                max_depth=20,
                min_samples_split=5,
                min_samples_leaf=2,
                class_weight='balanced',
                n_jobs=-1,  # Use all available cores
                random_state=42
            )

        if algorithm == 'svm':
            # probability=True is what makes predict_proba available on SVC
            return SVC(
                C=10.0,
                kernel='rbf',
                gamma='scale',
                class_weight='balanced',
                probability=True,
                random_state=42
            )

        # Every other code falls through to logistic regression
        return LogisticRegression(
            C=1.0,
            solver='liblinear',
            class_weight='balanced',
            max_iter=1000,
            random_state=42
        )

    def prepare_data(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Return (lower-cased 'query' values, 'is_legal' values) from df."""
        lowered_queries = df['query'].str.lower()
        labels = df['is_legal']
        return lowered_queries.values, labels.values

    def train_model(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
        """Fit the whole pipeline (vectoriser and estimator) on the given data."""
        self.pipeline.fit(X_train, y_train)

    def predict(self, queries: list) -> Tuple[np.ndarray, np.ndarray]:
        """
        Classify the given queries.

        Each query is lower-cased before being passed to the pipeline.

        Returns:
            Tuple of (pipeline.predict output, pipeline.predict_proba output)
            for the lower-cased queries.
        """
        lowered = [text.lower() for text in queries]
        labels = self.pipeline.predict(lowered)
        class_probabilities = self.pipeline.predict_proba(lowered)
        return labels, class_probabilities

def compare_algorithms(
    df: pd.DataFrame,
    test_size: float = 0.2,
    algorithms: Tuple[str, ...] = ('rf', 'svm', 'lr'),
    random_state: int = 42,
) -> Dict[str, Dict[str, Any]]:
    """
    Train and evaluate several algorithms on one shared train/test split.

    Args:
        df (pd.DataFrame): Input data; must contain a text column 'query'
            and a label column 'is_legal'.
        test_size (float): Proportion of data held out for testing.
        algorithms (Tuple[str, ...]): Algorithm codes to evaluate, each passed
            to MultiAlgorithmClassifier(algorithm=...). Defaults to all three
            supported codes, in the order 'rf', 'svm', 'lr'.
        random_state (int): Seed for the train/test split only; it does not
            affect the estimators' own seeds.

    Returns:
        Dict: One entry per algorithm code, each a dict with
            'accuracy' (accuracy on the test split),
            'classification_report' (the text report from sklearn), and
            'classifier' (the fitted MultiAlgorithmClassifier).
    """
    # Prepare data once so every algorithm sees exactly the same inputs
    X = df['query'].str.lower().values
    y = df['is_legal'].values

    # Stratified split keeps the label proportions equal in train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    results = {}

    for algo in algorithms:
        print(f"\nTraining {algo.upper()} classifier...")
        classifier = MultiAlgorithmClassifier(algorithm=algo)
        classifier.train_model(X_train, y_train)

        # Only the hard predictions are scored here; probabilities are unused
        predictions, _ = classifier.predict(X_test)

        results[algo] = {
            'accuracy': accuracy_score(y_test, predictions),
            'classification_report': classification_report(y_test, predictions),
            'classifier': classifier
        }

    return results

# Example usage
if __name__ == "__main__":
    # Read the dataset from the CSV file (path is relative to the working directory)
    df = pd.read_csv('generated_legal_queries.csv')

    # Fit and score every algorithm on the same split
    results = compare_algorithms(df)

    # Report accuracy and the per-class report for each algorithm in turn
    for algo in results:
        metrics = results[algo]
        print(f"\n{algo.upper()} Classifier Results:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print("\nClassification Report:")
        print(metrics['classification_report'])



Training RF classifier...

Training SVM classifier...

Training LR classifier...

RF Classifier Results:
Accuracy: 0.6400

Classification Report:
              precision    recall  f1-score   support

       False       0.85      0.67      0.75       120
        True       0.29      0.53      0.37        30

    accuracy                           0.64       150
   macro avg       0.57      0.60      0.56       150
weighted avg       0.74      0.64      0.67       150


SVM Classifier Results:
Accuracy: 0.7467

Classification Report:
              precision    recall  f1-score   support

       False       0.85      0.82      0.84       120
        True       0.38      0.43      0.41        30

    accuracy                           0.75       150
   macro avg       0.62      0.63      0.62       150
weighted avg       0.76      0.75      0.75       150


LR Classifier Results:
Accuracy: 0.6733

Classification Report:
              precision    recall  f1-score   support

       False 

In [9]:
    # Hand-written queries used to spot-check a fitted classifier
    example_queries = [
        "I just crashed my vehicle what is next legal procedure?",
        "What's the best pizza in Chicago?",
        "Can my landlord evict me without notice?",
        "I want to eat Paneer kadhai for dinner"
    ]

    # The model is selected by the hard-coded key 'rf'. When switching to another
    # entry of `results`, also update the header string printed below.
    best_classifier = results['rf']['classifier']
    predictions, probabilities = best_classifier.predict(example_queries)

    print("\nPredictions using Random Forest:")
    for query, prediction, prob in zip(example_queries, predictions, probabilities):
        # Confidence is the probability in column 1 when the prediction is
        # truthy, and the probability in column 0 otherwise.
        if prediction:
            confidence = prob[1]
        else:
            confidence = prob[0]
        print(f"\nQuery: {query}")
        print(f"Is Legal: {prediction}")
        print(f"Confidence: {confidence:.4f}")


Predictions using Random Forest:

Query: I just crashed my vehicle what is next legal procedure?
Is Legal: True
Confidence: 0.5193

Query: What's the best pizza in Chicago?
Is Legal: False
Confidence: 0.8766

Query: Can my landlord evict me without notice?
Is Legal: True
Confidence: 0.5359

Query: I want to eat Paneer kadhai for dinner
Is Legal: False
Confidence: 0.5856
