In [10]:
from openai import OpenAI
import pandas as pd
import requests
import time
import numpy as np
from typing import List, Dict
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm  # Import tqdm for Jupyter notebooks

# Configuration
OPENAI_API_KEY = "sk-proj-B1Ga5gcREhY_2XOigGLWUb3GwA"

NUM_ITERATIONS = 15
QUERIES_PER_ITERATION = 50

client = OpenAI(
    api_key=OPENAI_API_KEY,
)

# Load test cases from CSV
test_df = pd.read_csv('ILAO Generated Semantic Search Tests - Sheet1.csv')

def generate_test_queries(n: int = 15) -> List[str]:
    """Generate test queries using GPT-4."""
    prompt = """Generate {n} unique legal aid queries that Illinois residents might ask to a Illinios Legal Aid Online (ILAO) Chatbot. Include:
    - Simple questions about rights (housing, employment, etc.)
    - Complex multi-part questions
    - Questions with misspellings
    - Questions mixing legal and non-legal terms
    - Make queries long , short and medium in length
    - Some completely non-legal questions as negative test cases
    
    Format: One query per line, prefixed with category in brackets:
    [Simple] How do I file for divorce?
    [Complex] What happens if my landlord...
    [Negative] Where is the best pizza...
    Other Categories could be Complex_Negative, Complex_Positive, Technical Legal, Mixed Language etc
    Add one more category as Not Query where the query should be some non related things, non-english text, rubbish queries, queries that dont make sense
    Generate Queries with all this categories and generate some category of your own
    """.format(n=n)
    
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a query generator for testing a legal aid search system."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.8
    )
    
    queries = []
    for line in response.choices[0].message.content.strip().split('\n'):
        if line:
            try:
                category = line[line.find("[")+1:line.find("]")]
                query = line[line.find("]")+1:].strip()
                queries.append({
                    'query': query,
                    'category': category
                })
            except:
                continue
    return queries[:n]

In [11]:
test_queries = generate_test_queries(n=188)

# Convert the queries to a DataFrame
queries_df = pd.DataFrame(test_queries)

# Save to CSV
output_file = "test_queries.csv"
queries_df.to_csv(output_file, index=False)

In [15]:
df = pd.read_csv("test_queries.csv")
df.head(100)

Unnamed: 0,query,category
0,Here are 500 unique queries covering various c...,Here are 500 unique queries covering various c...
1,What are my rights as a tenant in Illinois?,Simple
2,If my employer fired me for taking family leav...,Complex
3,How do I bake a cake from scratch?,Negative
4,If I have a dispute with my neighbor over a fe...,Complex_Negative
...,...,...
95,"If I want to start a nonprofit, what legal req...",Complex_Positive
96,Can I get fired for reporting unsafe working c...,Simple
97,What is the process for getting a copy of my c...,Technical Legal
98,¿Necesito un abogado para divorciarme?,Mixed Language


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Load and examine the data
df = pd.read_csv("test_queries.csv")

# Print category distribution
print("Category Distribution:")
print(df['category'].value_counts())
print("\nTotal number of categories:", len(df['category'].unique()))
print("Total number of samples:", len(df))

# Remove categories with only one example
category_counts = df['category'].value_counts()
valid_categories = category_counts[category_counts >= 2].index
df_filtered = df[df['category'].isin(valid_categories)]

print("\nAfter filtering rare categories:")
print("Remaining categories:", len(df_filtered['category'].unique()))
print("Remaining samples:", len(df_filtered))

# 1. Data Preparation
X = df_filtered['query']
y = df_filtered['category']

# Split the data without stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
    # Removed stratify parameter
)

# 2. Create the pipeline with adjusted parameters for small dataset
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=1000,  # Reduced from 5000 due to small dataset
        ngram_range=(1, 2),
        stop_words='english',
        min_df=1  # Allow terms that appear in at least 1 document
    )),
    ('classifier', LogisticRegression(
        multi_class='multinomial',
        max_iter=2000,
        random_state=42,
        C=1.0,  # Regularization strength
        class_weight='balanced'  # Handle class imbalance
    ))
])

# 3. Train the model
pipeline.fit(X_train, y_train)

# 4. Make predictions
train_predictions = pipeline.predict(X_train)
test_predictions = pipeline.predict(X_test)

# 5. Print model performance
print("\nTraining Set Performance:")
print("-" * 50)
print(classification_report(y_train, train_predictions))
print(f"Training Accuracy: {accuracy_score(y_train, train_predictions):.2f}")

print("\nTest Set Performance:")
print("-" * 50)
print(classification_report(y_test, test_predictions))
print(f"Test Accuracy: {accuracy_score(y_test, test_predictions):.2f}")

Category Distribution:
category
Simple                                                                  40
Complex                                                                 27
Negative                                                                27
Technical Legal                                                         27
Mixed Language                                                          26
Complex_Negative                                                        14
Misspelling                                                             13
Complex_Positive                                                        13
Here are 500 unique queries covering various categories as requested     1
Name: count, dtype: int64

Total number of categories: 9
Total number of samples: 188

After filtering rare categories:
Remaining categories: 8
Remaining samples: 187

Training Set Performance:
--------------------------------------------------
                  precision    recall  f1-score 



In [16]:
new_queries = ["What are my rights as a tenant in Illinois?	"]
predictions = predict_category(new_queries)
print("\nPredictions for new queries:", predictions)

# 7. Save the model (optional)
import joblib
joblib.dump(pipeline, 'text_classifier_model.joblib')


Predictions for new queries: ['Simple']


['text_classifier_model.joblib']