In [4]:
from datasets import load_dataset

# Load the QEvasion dataset ("ailsntua/QEvasion") via the Hugging Face
# `datasets` library; `ds` holds the dataset's splits keyed by name.
ds = load_dataset("ailsntua/QEvasion")

In [13]:
from datasets import load_dataset

# NOTE(review): this repeats the load from the previous cell, and the
# `__main__` block below calls load_dataset("ailsntua/QEvasion") again itself.
ds = load_dataset("ailsntua/QEvasion")
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import joblib
import re

class TextClassifier:
    """TF-IDF + logistic regression classifier for interview Q&A text.

    ``vectorizer`` and ``model`` are ``None`` until ``train`` (or
    ``load_model``) has been called; methods that need them raise a
    ``RuntimeError`` with an explicit message when they are missing.
    """

    def __init__(self, task='clarity'):
        """
        Initialize classifier for response clarity/evasion classification

        Args:
            task: 'clarity' for 3-class (Clear Reply, Ambivalent Reply, Clear Non-Reply)
                  'evasion' for fine-grained evasion types
        """
        self.vectorizer = None
        self.model = None
        self.task = task

    def _require_trained(self, action):
        """Raise RuntimeError if the model or vectorizer is missing.

        Args:
            action: Short verb phrase used in the error message (e.g. "predict").
        """
        if self.model is None or self.vectorizer is None:
            raise RuntimeError(
                f"Cannot {action}: model not trained yet. "
                "Call train() or load_model() first."
            )

    def preprocess_text(self, text):
        """Normalize a single string before vectorization.

        Lowercases the text, deletes every character that is not an ASCII
        letter or whitespace (digits and punctuation are removed, not
        replaced by a space, so "don't" becomes "dont"), and collapses runs
        of whitespace into single spaces.

        Args:
            text: Input string.

        Returns:
            The cleaned string.
        """
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # split()/join also strips leading and trailing whitespace
        return ' '.join(text.split())

    def load_data(self, filepath, question_col='question', answer_col='answer',
                  label_col='clarity_label', combine_qa=True, show_evasion_stats=False):
        """
        Load political interview Q&A dataset

        Args:
            filepath: Path to CSV file
            question_col: Name of question column
            answer_col: Name of answer column
            label_col: Name of label column (clarity_label or evasion_label)
            combine_qa: If True, concatenate question and answer as input
            show_evasion_stats: If True, show evasion label distribution if available

        Returns:
            Tuple ``(texts, labels)``: the preprocessed text Series and the
            label Series, both restricted to rows without missing values in
            the question, answer and label columns.

        Raises:
            KeyError: If any of the three required columns is absent.
            Exception: Any error from ``pd.read_csv`` or text processing is
                printed and re-raised unchanged.
        """
        print(f"Reading CSV file: {filepath}")
        try:
            df = pd.read_csv(filepath)
            print(f"✓ CSV loaded successfully: {len(df)} rows, {len(df.columns)} columns")
        except Exception as e:
            print(f"✗ Error reading CSV: {e}")
            raise

        # Check if required columns exist
        print(f"\nChecking for required columns...")
        print(f"Available columns: {list(df.columns)}")

        missing_cols = [
            col for col in (question_col, answer_col, label_col)
            if col not in df.columns
        ]
        if missing_cols:
            raise KeyError(f"Missing columns: {missing_cols}")

        print(f"✓ All required columns found")

        # Remove rows with missing values in key columns
        print(f"\nCleaning data...")
        original_len = len(df)
        df = df.dropna(subset=[question_col, answer_col, label_col])
        if len(df) < original_len:
            print(f"⚠ Removed {original_len - len(df)} rows with missing values")

        print(f"Processing text data...")
        try:
            answers = df[answer_col].astype(str)
            if combine_qa:
                # Combine question and answer for better context. Column-wise
                # concatenation avoids a Python-level call per row and still
                # yields an (empty) Series when no rows survive the dropna.
                texts = ("Question: " + df[question_col].astype(str)
                         + " Answer: " + answers)
            else:
                # Use only answer
                texts = answers

            # Apply preprocessing
            print(f"Preprocessing text...")
            texts = texts.apply(self.preprocess_text)
            print(f"✓ Text processing completed")

        except Exception as e:
            print(f"✗ Error processing text: {e}")
            raise

        labels = df[label_col]

        print(f"\n{'='*60}")
        print(f"DATASET SUMMARY")
        print(f"{'='*60}")
        print(f"Total samples: {len(texts)}")
        print(f"\n{label_col.upper()} Distribution:")
        print(labels.value_counts().to_string())

        # Show evasion stats if requested and column exists
        if show_evasion_stats:
            # Purely informational: any failure here is reported, never raised.
            try:
                # Try to find evasion-related columns
                evasion_cols = [col for col in df.columns if 'evasion' in col.lower()]

                if evasion_cols:
                    print("\n" + "="*60)
                    print("EVASION LABEL STATISTICS")
                    print("="*60)

                    for evasion_col in evasion_cols:
                        print(f"\n{evasion_col.upper()} Distribution:")
                        print(df[evasion_col].value_counts().to_string())

                        # Cross-tabulate against the label column only when the
                        # table stays readable (at most 20 distinct values).
                        if (label_col != evasion_col
                                and df[evasion_col].nunique(dropna=False) <= 20):
                            print(f"\nCross-tabulation: {label_col} vs {evasion_col}")
                            try:
                                crosstab = pd.crosstab(df[label_col], df[evasion_col], margins=True)
                                print(crosstab.to_string())
                            except Exception as e:
                                print(f"⚠ Could not create cross-tabulation: {e}")
                else:
                    print("\n⚠ No evasion columns found in dataset")
            except Exception as e:
                print(f"\n⚠ Error showing evasion stats: {e}")

        return texts, labels

    def train(self, X_train, y_train, tune_hyperparameters=True):
        """Fit the TF-IDF vectorizer and the logistic regression model.

        Args:
            X_train: Iterable of (already preprocessed) training texts.
            y_train: Labels aligned with ``X_train``.
            tune_hyperparameters: If True, run a 5-fold grid search; if the
                search raises, training falls back to the fixed default
                parameters instead of failing.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            Exception: Errors from vectorization or from the default-parameter
                fit are printed and re-raised unchanged.
        """

        print("\nInitializing TF-IDF Vectorizer...")
        # Initialize TF-IDF Vectorizer - optimized for political text
        self.vectorizer = TfidfVectorizer(
            max_features=10000,  # Political language can be diverse
            ngram_range=(1, 3),  # Unigrams, bigrams, and trigrams for phrases
            min_df=2,  # Ignore rare terms
            max_df=0.9,  # Keep common political terms
            stop_words='english',
            sublinear_tf=True  # Use log scaling for term frequency
        )

        # Transform training data
        print("Transforming text to TF-IDF features...")
        try:
            X_train_tfidf = self.vectorizer.fit_transform(X_train)
            print(f"✓ TF-IDF matrix shape: {X_train_tfidf.shape}")
            print(f"  - {X_train_tfidf.shape[0]} samples")
            print(f"  - {X_train_tfidf.shape[1]} features")
        except Exception as e:
            print(f"✗ Error during vectorization: {e}")
            raise

        if tune_hyperparameters:
            # Hyperparameter tuning with GridSearch
            param_grid = {
                'C': [0.1, 1, 10, 100],  # Wider range for better regularization
                'penalty': ['l2'],
                'solver': ['lbfgs'],
                'max_iter': [500],
                'class_weight': [None, 'balanced']  # Handle class imbalance
            }

            print("\nPerforming hyperparameter tuning...")
            print("This may take a few minutes...")
            try:
                grid_search = GridSearchCV(
                    LogisticRegression(random_state=42),
                    param_grid,
                    cv=5,
                    scoring='f1_weighted',  # Better for imbalanced classes
                    n_jobs=-1,
                    verbose=2
                )
                grid_search.fit(X_train_tfidf, y_train)

                self.model = grid_search.best_estimator_
                print(f"\n✓ Hyperparameter tuning completed!")
                print(f"Best parameters: {grid_search.best_params_}")
                print(f"Best cross-validation F1 score: {grid_search.best_score_:.4f}")
            except Exception as e:
                print(f"✗ Error during hyperparameter tuning: {e}")
                print("Falling back to default parameters...")
                # Route into the default-parameter branch below.
                tune_hyperparameters = False

        if not tune_hyperparameters:
            # Train with default parameters
            print("\nTraining with default parameters...")
            try:
                self.model = LogisticRegression(
                    random_state=42,
                    max_iter=500,
                    C=1.0,
                    class_weight='balanced',  # Handle imbalanced data
                    verbose=1
                )
                self.model.fit(X_train_tfidf, y_train)
                print("✓ Model training completed!")
            except Exception as e:
                print(f"✗ Error during model training: {e}")
                raise

        return self

    def evaluate(self, X_test, y_test):
        """Print evaluation metrics for held-out data.

        Args:
            X_test: Iterable of (already preprocessed) texts.
            y_test: True labels aligned with ``X_test``.

        Returns:
            The predicted labels for ``X_test``.

        Raises:
            RuntimeError: If the model has not been trained or loaded.
        """
        self._require_trained("evaluate")

        X_test_tfidf = self.vectorizer.transform(X_test)
        y_pred = self.model.predict(X_test_tfidf)
        y_proba = self.model.predict_proba(X_test_tfidf)

        # Calculate average confidence (max probability for each prediction)
        avg_confidence = np.max(y_proba, axis=1).mean()

        # zero_division=0 yields the same numbers as the default but without
        # an UndefinedMetricWarning for classes that are never predicted.
        weighted_f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

        print("\n=== Model Evaluation ===")
        print(f"Accuracy:       {accuracy_score(y_test, y_pred):.4f}")
        print(f"Weighted F1:    {weighted_f1:.4f}")
        print(f"Macro F1:       {macro_f1:.4f}")
        print(f"Avg Confidence: {avg_confidence:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, zero_division=0))
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

        return y_pred

    def get_top_features(self, n=20):
        """Print the ``n`` highest-weighted features for each class.

        Prints a notice and returns if the model is not trained yet.

        Args:
            n: Number of features to list per class; values below 1 list none.
        """
        if self.model is None or self.vectorizer is None:
            print("Model not trained yet!")
            return

        feature_names = self.vectorizer.get_feature_names_out()
        classes = self.model.classes_
        coef_matrix = np.asarray(self.model.coef_)

        # A two-class model exposes a single coefficient row that describes
        # classes_[1]; its negation ranks the features for classes_[0].
        if coef_matrix.shape[0] == 1 and len(classes) == 2:
            coef_matrix = np.vstack([-coef_matrix[0], coef_matrix[0]])

        n = max(int(n), 0)

        print("\n=== Top Predictive Features per Class ===")
        for class_name, coef in zip(classes, coef_matrix):
            # Reverse first, then slice: a trailing [-n:] would select every
            # feature when n == 0.
            top_indices = np.argsort(coef)[::-1][:n]

            print(f"\n{class_name}:")
            for i in top_indices:
                print(f"  {feature_names[i]}: {coef[i]:.4f}")

    def predict(self, texts):
        """Predict labels for new texts.

        Args:
            texts: A single string or an iterable of strings (raw text; it is
                preprocessed here).

        Returns:
            Tuple ``(predictions, probabilities)`` as returned by the model's
            ``predict`` and ``predict_proba``.

        Raises:
            RuntimeError: If the model has not been trained or loaded.
        """
        self._require_trained("predict")

        if isinstance(texts, str):
            texts = [texts]

        processed_texts = [self.preprocess_text(text) for text in texts]
        X_tfidf = self.vectorizer.transform(processed_texts)
        predictions = self.model.predict(X_tfidf)
        probabilities = self.model.predict_proba(X_tfidf)

        return predictions, probabilities

    def save_model(self, vectorizer_path='tfidf_vectorizer.pkl',
                   model_path='logreg_model.pkl'):
        """Save trained model and vectorizer with joblib.

        Args:
            vectorizer_path: Destination file for the vectorizer.
            model_path: Destination file for the model.

        Raises:
            RuntimeError: If there is no trained model to save (prevents
                writing files that only contain ``None``).
        """
        self._require_trained("save model")

        joblib.dump(self.vectorizer, vectorizer_path)
        joblib.dump(self.model, model_path)
        print(f"Model saved to {model_path}")
        print(f"Vectorizer saved to {vectorizer_path}")

    def load_model(self, vectorizer_path='tfidf_vectorizer.pkl',
                   model_path='logreg_model.pkl'):
        """Load pre-trained model and vectorizer.

        Args:
            vectorizer_path: File previously written for the vectorizer.
            model_path: File previously written for the model.
        """
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code while doing so - only load files from a trusted source.
        self.vectorizer = joblib.load(vectorizer_path)
        self.model = joblib.load(model_path)
        print("Model loaded successfully!")

# Training script for "I Never Said That" dataset from Hugging Face
if __name__ == "__main__":
    # End-to-end training script: load QEvasion from Hugging Face, hold out
    # 10% for testing, train a clarity model (and an evasion model when the
    # label column exists), evaluate both, save them, and print sample
    # predictions.
    import importlib
    import subprocess
    import sys
    import traceback

    print("="*60)
    print("Training Response Clarity Classifier")
    print("="*60)

    # QUICK DIAGNOSTICS - Check your environment
    print("\n[DIAGNOSTICS]")
    print(f"Python packages:")
    print(f"  pandas: {pd.__version__}")
    print(f"  numpy: {np.__version__}")
    print(f"  sklearn: {__import__('sklearn').__version__}")

    try:
        from datasets import load_dataset
        print(f"  datasets: {__import__('datasets').__version__}")
    except ImportError:
        print("\n✗ 'datasets' library not found!")
        print("Installing datasets library...")
        # Install through the interpreter that is running this script; a bare
        # "pip" on PATH may belong to a different Python environment.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'datasets'])
        # Make the freshly installed package visible to the import system.
        importlib.invalidate_caches()
        from datasets import load_dataset
        print("✓ datasets library installed successfully!")

    print(f"\nMemory available: Checking...")
    try:
        import psutil
        mem = psutil.virtual_memory()
        print(f"  Total RAM: {mem.total / (1024**3):.1f} GB")
        print(f"  Available RAM: {mem.available / (1024**3):.1f} GB")
    except Exception:
        # Best-effort diagnostic only; Exception (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        print("  (psutil not available - can't check memory)")

    # Initialize classifiers (one for clarity, one for evasion)
    clarity_classifier = TextClassifier(task='clarity')
    evasion_classifier = TextClassifier(task='evasion')

    # STEP 1: Load dataset from Hugging Face
    print(f"\n{'='*60}")
    print(f"STEP 1: LOADING DATASET FROM HUGGING FACE")
    print(f"{'='*60}")

    try:
        print("\nDownloading dataset from Hugging Face: ailsntua/QEvasion")
        print("This may take a moment on first run...")
        ds = load_dataset("ailsntua/QEvasion")

        print(f"✓ Dataset loaded successfully!")
        print(f"\nAvailable splits: {list(ds.keys())}")

        # Convert to pandas DataFrame for easier processing
        if 'train' in ds:
            df = ds['train'].to_pandas()
        else:
            split_name = list(ds.keys())[0]
            print(f"Using split: {split_name}")
            df = ds[split_name].to_pandas()

        print(f"\nDataset shape: {df.shape[0]} rows, {df.shape[1]} columns")
        print(f"Columns: {list(df.columns)}")

        # Check for required columns
        required_cols = ['question', 'interview_answer', 'clarity_label']
        missing = [col for col in required_cols if col not in df.columns]

        if missing:
            print(f"\n⚠ Warning: Expected columns not found: {missing}")
            print("Available columns:", list(df.columns))
            print("\nPlease check the dataset structure and update column names in the code.")
            # SystemExit is not an Exception subclass, so the handler below
            # does not swallow it; non-zero status signals the failure.
            sys.exit(1)

        # Remove rows with missing values
        print(f"\nCleaning data...")
        original_len = len(df)
        df = df.dropna(subset=required_cols)
        if len(df) < original_len:
            print(f"⚠ Removed {original_len - len(df)} rows with missing values")

        # Show dataset statistics
        print(f"\n{'='*60}")
        print(f"FULL DATASET STATISTICS")
        print(f"{'='*60}")
        print(f"Total samples: {len(df)}")
        print(f"\nCLARITY_LABEL Distribution:")
        print(df['clarity_label'].value_counts().to_string())

        # Show evasion stats if column exists
        has_evasion = 'evasion_label' in df.columns
        if has_evasion:
            print(f"\nEVASION_LABEL Distribution:")
            evasion_counts = df['evasion_label'].value_counts()
            print(evasion_counts.to_string())

            # Cross-tabulation
            print(f"\nCross-tabulation: clarity_label vs evasion_label")
            crosstab = pd.crosstab(df['clarity_label'], df['evasion_label'], margins=True)
            print(crosstab.to_string())

    except Exception as e:
        print(f"\n✗ Error loading dataset: {e}")
        traceback.print_exc()
        sys.exit(1)

    # STEP 2: Split into 90% train, 10% test
    print(f"\n{'='*60}")
    print(f"STEP 2: SPLITTING DATA (90% TRAIN / 10% TEST)")
    print(f"{'='*60}")

    # Prepare texts by combining question and answer
    print("\nPreparing text data...")
    texts = df.apply(
        lambda row: f"Question: {str(row['question'])} Answer: {str(row['interview_answer'])}",
        axis=1
    )

    # Apply preprocessing
    print("Preprocessing text...")
    texts = texts.apply(clarity_classifier.preprocess_text)
    clarity_labels = df['clarity_label']

    # Split with stratification to maintain class distribution
    X_train, X_test, y_clarity_train, y_clarity_test = train_test_split(
        texts,
        clarity_labels,
        test_size=0.10,
        random_state=42,
        stratify=clarity_labels
    )

    # Take the matching dataframe rows from the split's own index instead of
    # running a second train_test_split and relying on both calls shuffling
    # identically. The evasion labels therefore stay row-aligned with
    # X_train / X_test by construction.
    df_train = df.loc[X_train.index]
    df_test = df.loc[X_test.index]

    print(f"✓ Data split completed!")
    print(f"\nTraining set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    print(f"\nTraining set clarity label distribution:")
    print(y_clarity_train.value_counts().to_string())

    print(f"\nTest set clarity label distribution:")
    print(y_clarity_test.value_counts().to_string())

    # Prepare evasion labels if available
    if has_evasion:
        y_evasion_train = df_train['evasion_label']
        y_evasion_test = df_test['evasion_label']

        print(f"\n--- TRAINING SET EVASION STATS ---")
        print(y_evasion_train.value_counts().to_string())

        print(f"\n--- TEST SET EVASION STATS ---")
        print(y_evasion_test.value_counts().to_string())

    # STEP 3: Train the clarity model
    print("\n" + "="*60)
    print("STEP 3: TRAINING CLARITY MODEL")
    print("="*60)

    USE_HYPERPARAMETER_TUNING = False

    print(f"\nHyperparameter tuning: {USE_HYPERPARAMETER_TUNING}")
    if not USE_HYPERPARAMETER_TUNING:
        print("(Set to True for better accuracy, but requires more time/memory)")

    try:
        clarity_classifier.train(
            X_train,
            y_clarity_train,
            tune_hyperparameters=USE_HYPERPARAMETER_TUNING
        )
    except Exception as e:
        print(f"\n✗ TRAINING FAILED: {e}")
        print("\nTroubleshooting tips:")
        print("1. Try reducing max_features in the code")
        print("2. Make sure you have enough RAM")
        # The data comes from the Hugging Face dataset, not from a CSV file.
        print("3. Check that the dataset has the expected column names")
        traceback.print_exc()
        sys.exit(1)

    print("\n✓ Clarity model training completed!")

    # STEP 4: Evaluate clarity model on test set
    print("\n" + "="*60)
    print("STEP 4: CLARITY MODEL EVALUATION")
    print("="*60)

    try:
        y_clarity_pred = clarity_classifier.evaluate(X_test, y_clarity_test)
    except Exception as e:
        # Evaluation is informational; keep going so the model still gets saved.
        print(f"\n✗ Evaluation failed: {e}")
        traceback.print_exc()

    # STEP 5: Train and evaluate evasion model (if evasion labels exist)
    if has_evasion:
        print("\n" + "="*60)
        print("STEP 5: TRAINING EVASION MODEL")
        print("="*60)

        try:
            evasion_classifier.train(
                X_train,
                y_evasion_train,
                tune_hyperparameters=USE_HYPERPARAMETER_TUNING
            )
            print("\n✓ Evasion model training completed!")
        except Exception as e:
            print(f"\n✗ Evasion training failed: {e}")
            traceback.print_exc()
            # Disable every later evasion step (evaluation, features, saving).
            has_evasion = False

        if has_evasion:
            print("\n" + "="*60)
            print("STEP 6: EVASION MODEL EVALUATION")
            print("="*60)

            try:
                y_evasion_pred = evasion_classifier.evaluate(X_test, y_evasion_test)
            except Exception as e:
                print(f"\n✗ Evasion evaluation failed: {e}")
                traceback.print_exc()

    # STEP 7: Analyze most predictive features
    print("\n" + "="*60)
    print("FEATURE ANALYSIS - CLARITY MODEL")
    print("="*60)
    clarity_classifier.get_top_features(n=15)

    if has_evasion:
        print("\n" + "="*60)
        print("FEATURE ANALYSIS - EVASION MODEL")
        print("="*60)
        evasion_classifier.get_top_features(n=15)

    # STEP 8: Save the trained models
    print("\n" + "="*60)
    print("SAVING MODELS")
    print("="*60)

    clarity_classifier.save_model(
        vectorizer_path='clarity_tfidf_vectorizer.pkl',
        model_path='clarity_logreg_model.pkl'
    )

    if has_evasion:
        evasion_classifier.save_model(
            vectorizer_path='evasion_tfidf_vectorizer.pkl',
            model_path='evasion_logreg_model.pkl'
        )

    # STEP 9: Test on some examples from test set
    print("\n" + "="*60)
    print("SAMPLE PREDICTIONS FROM TEST SET")
    print("="*60)

    # Show up to 5 examples from the test set; seeded like the rest of the
    # script so repeated runs show the same examples.
    sample_rng = np.random.default_rng(42)
    sample_indices = sample_rng.choice(len(X_test), size=min(5, len(X_test)), replace=False)

    for i, idx in enumerate(sample_indices):
        text = X_test.iloc[idx]
        true_clarity = y_clarity_test.iloc[idx]
        pred_clarity, prob_clarity = clarity_classifier.predict([text])

        print(f"\n{'='*50}")
        print(f"Example {i + 1}:")
        print(f"{'='*50}")
        print(f"Text: {text[:300]}...")
        print(f"\nTrue Clarity Label: {true_clarity}")
        print(f"Predicted Clarity: {pred_clarity[0]}")
        print(f"Clarity Confidence: {max(prob_clarity[0]):.2%}")

        if has_evasion:
            true_evasion = y_evasion_test.iloc[idx]
            pred_evasion, prob_evasion = evasion_classifier.predict([text])
            print(f"\nTrue Evasion Label: {true_evasion}")
            print(f"Predicted Evasion: {pred_evasion[0]}")
            print(f"Evasion Confidence: {max(prob_evasion[0]):.2%}")

        print(f"\nClarity Correct: {'✓' if pred_clarity[0] == true_clarity else '✗'}")
        if has_evasion:
            print(f"Evasion Correct: {'✓' if pred_evasion[0] == true_evasion else '✗'}")

    print("\n" + "="*60)
    print("TRAINING COMPLETE!")
    print("="*60)
    print("\nModels saved:")
    print("  - clarity_tfidf_vectorizer.pkl")
    print("  - clarity_logreg_model.pkl")
    if has_evasion:
        print("  - evasion_tfidf_vectorizer.pkl")
        print("  - evasion_logreg_model.pkl")
    print("\nYou can now use the trained models to predict on new data!")

Training Response Clarity Classifier

[DIAGNOSTICS]
Python packages:
  pandas: 2.2.2
  numpy: 2.0.2
  sklearn: 1.6.1
  datasets: 4.0.0

Memory available: Checking...
  Total RAM: 12.7 GB
  Available RAM: 10.5 GB

STEP 1: LOADING DATASET FROM HUGGING FACE

Downloading dataset from Hugging Face: ailsntua/QEvasion
This may take a moment on first run...
✓ Dataset loaded successfully!

Available splits: ['train', 'test']

Dataset shape: 3448 rows, 20 columns
Columns: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label']

Cleaning data...

FULL DATASET STATISTICS
Total samples: 3448

CLARITY_LABEL Distribution:
clarity_label
Ambivalent         2040
Clear Reply        1052
Clear Non-Reply     356

EVASION_LABEL Distribution:
evasion_label
Ex

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s finished


✓ Model training completed!

✓ Clarity model training completed!

STEP 4: CLARITY MODEL EVALUATION

=== Model Evaluation ===
Accuracy:       0.6232
Weighted F1:    0.6276
Macro F1:       0.5762
Avg Confidence: 0.5462

Classification Report:
                 precision    recall  f1-score   support

     Ambivalent       0.73      0.68      0.70       204
Clear Non-Reply       0.42      0.61      0.50        36
    Clear Reply       0.53      0.52      0.53       105

       accuracy                           0.62       345
      macro avg       0.56      0.60      0.58       345
   weighted avg       0.64      0.62      0.63       345


Confusion Matrix:
[[138  22  44]
 [  9  22   5]
 [ 42   8  55]]

STEP 5: TRAINING EVASION MODEL

Initializing TF-IDF Vectorizer...
Transforming text to TF-IDF features...
✓ TF-IDF matrix shape: (3103, 10000)
  - 3103 samples
  - 10000 features

Training with default parameters...


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.4s finished


✓ Model training completed!

✓ Evasion model training completed!

STEP 6: EVASION MODEL EVALUATION

=== Model Evaluation ===
Accuracy:       0.3275
Weighted F1:    0.3304
Macro F1:       0.3380
Avg Confidence: 0.2593

Classification Report:
                     precision    recall  f1-score   support

   Claims ignorance       0.44      0.50      0.47        14
      Clarification       0.47      0.78      0.58         9
Declining to answer       0.37      0.77      0.50        13
         Deflection       0.25      0.35      0.29        43
            Dodging       0.43      0.32      0.37        65
           Explicit       0.54      0.29      0.37       105
            General       0.22      0.28      0.24        39
           Implicit       0.20      0.24      0.22        49
Partial/half-answer       0.00      0.00      0.00         8

           accuracy                           0.33       345
          macro avg       0.32      0.39      0.34       345
       weighted avg      