In [1]:
!pip install textdescriptives



In [21]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler # Added StandardScaler
from sklearn.decomposition import PCA # Added PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import ast # Used for safely evaluating string representations of dicts
import numpy as np # Import numpy to handle potential NaNs after parsing
import matplotlib.pyplot as plt # Added for plotting
import seaborn as sns # Added for heatmap

# --- Configuration ---
# Specify the path to the final processed CSV file (output of evaluate_prompts.py)
PROCESSED_FILE_PATH = 'textdescriptives_processed_prompt_examples_dataset.csv'
TARGET_COLUMN = 'prompt_qual'
TEST_SIZE = 0.2 # Proportion of data to use for testing
RANDOM_STATE = 42 # For reproducibility
PCA_N_COMPONENTS = None # Set to an int (e.g., 10) or float (e.g., 0.95 for variance) or None to keep all

# --- Load Data ---
print(f"Loading data from: {PROCESSED_FILE_PATH}")
try:
    # Explicitly tell pandas to keep 'nan' as a string for now, not interpret it as NaN yet
    df = pd.read_csv(PROCESSED_FILE_PATH, keep_default_na=False, na_values=[''])
    print("Data loaded successfully.")
    print("Initial DataFrame info:")
    df.info()
    print("\nFirst 5 rows:")
    print(df.head())
except FileNotFoundError:
    print(f"Error: File not found at {PROCESSED_FILE_PATH}")
    print("Please ensure 'extract_data.py' and 'evaluate_prompts.py' have been run successfully.")
    exit()
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    exit()

# --- Feature Engineering & Selection ---
print("\nStarting feature engineering...")

# Define potential feature columns generated by textdescriptives
potential_feature_cols = [
    "readability", "token_length", "sentence_length",
    "coherence", "information_theory", "entropy",
    "perplexity", "per_word_perplexity"
]

# Identify columns that actually exist in the DataFrame
existing_feature_cols = [col for col in potential_feature_cols if col in df.columns]
print(f"Found potential feature columns: {existing_feature_cols}")

# Flatten dictionary-like columns
features_list = []
original_df_index = df.index # Keep track of original index for joining later

for col in existing_feature_cols:
    # Ensure we're working with a clean copy and avoid SettingWithCopyWarning
    col_data = df[col].copy()

    # Check if the column contains string representations of dictionaries
    # Use dropna().iloc[0] to safely get the first non-NA value for type checking
    first_valid_value = col_data.dropna().iloc[0] if not col_data.dropna().empty else None

    if isinstance(first_valid_value, str) and first_valid_value.strip().startswith('{'):
        print(f"Processing string-dict column: {col}")
        try:
            # --- FIX: Replace 'nan' with 'None' before parsing ---
            processed_col_series = col_data.fillna('').astype(str).str.replace(r'\bnan\b', 'None', regex=True)

            # Define a helper function to safely apply literal_eval
            def safe_literal_eval(item):
                try:
                    if not item or item == 'None': return None
                    return ast.literal_eval(item)
                except (ValueError, SyntaxError):
                    # print(f"Warning: Could not parse item in column '{col}': {item}. Returning None.") # Can be verbose
                    return None
                except Exception as e:
                    # print(f"Warning: Unexpected error parsing item in column '{col}': {item}. Error: {e}. Returning None.") # Can be verbose
                    return None

            # Apply the safe evaluation function
            expanded_col = processed_col_series.apply(safe_literal_eval)

            # Normalize the dictionary into separate columns
            normalized_df = pd.json_normalize(expanded_col[expanded_col.notna()])
            normalized_df = normalized_df.reindex(original_df_index)
            normalized_df.columns = [f"{col}_{sub_col}" for sub_col in normalized_df.columns]
            features_list.append(normalized_df)
            print(f"Successfully flattened string-dict column: {col}")

        except Exception as e:
             print(f"Warning: An unexpected error occurred processing column '{col}': {e}. Skipping.")

    # Check if the column contains actual dictionary objects
    elif isinstance(first_valid_value, dict):
        print(f"Processing dict column: {col}")
        try:
            normalized_df = pd.json_normalize(col_data)
            normalized_df = normalized_df.reindex(original_df_index)
            normalized_df.columns = [f"{col}_{sub_col}" for sub_col in normalized_df.columns]
            features_list.append(normalized_df)
            print(f"Successfully flattened dict column: {col}")
        except Exception as e:
             print(f"Warning: An unexpected error occurred processing dict column '{col}': {e}. Skipping.")

    # Assume it's a simple numeric column
    elif pd.api.types.is_numeric_dtype(col_data):
        print(f"Using numeric column directly: {col}")
        features_list.append(col_data.to_frame().reindex(original_df_index))
    else:
        print(f"Warning: Column '{col}' type ({type(first_valid_value)}) is not numeric or a recognized dictionary format. Skipping.")


# Combine all processed features
if not features_list:
    print("Error: No valid features found after processing. Exiting.")
    exit()

X = pd.concat(features_list, axis=1)

# --- Handle Missing Values (Impute with mean) ---
if X.isnull().sum().sum() > 0:
    print("\nWarning: Missing values found in features. Imputing with mean.")
    # print(X.isnull().sum()) # Can be verbose
    numeric_cols = X.select_dtypes(include=np.number).columns
    for col in numeric_cols:
        if X[col].isnull().any():
            mean_val = X[col].mean()
            if pd.isna(mean_val):
                mean_val = 0
                # print(f"Warning: Mean for column {col} is NaN. Imputing with 0.") # Can be verbose
            X[col].fillna(mean_val, inplace=True)
            # print(f"Imputed numeric column: {col} with mean {mean_val:.4f}") # Can be verbose
    non_numeric_cols = X.select_dtypes(exclude=np.number).columns
    if not X[non_numeric_cols].isnull().sum().sum() == 0:
         print("Warning: Non-numeric NaNs detected after processing. Consider specific handling.")
else:
    print("\nNo missing values found in features.")


print("\nFinal features prepared:")
X.info()
print(X.head())

# --- Correlation Analysis ---
print("\n--- Correlation Matrix Analysis ---")
# Ensure only numeric columns are used for correlation
X_numeric = X.select_dtypes(include=np.number)
if X_numeric.shape[1] < X.shape[1]:
    print(f"Warning: Excluded {X.shape[1] - X_numeric.shape[1]} non-numeric columns from correlation analysis.")

if X_numeric.shape[1] > 1: # Need at least 2 numeric columns for correlation
    correlation_matrix = X_numeric.corr()
    print("Correlation Matrix calculated.")

    # Display the matrix (optional, can be large)
    # print(correlation_matrix)

    # Visualize the correlation matrix using a heatmap
    plt.figure(figsize=(12, 10)) # Adjust size as needed
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', fmt=".2f") # annot=True can be slow/cluttered for many features
    plt.title('Feature Correlation Matrix')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    # plt.show() # Display the plot interactively
    # Instead of plt.show(), save the figure if running non-interactively
    try:
        corr_matrix_path = "feature_correlation_matrix.png"
        plt.savefig(corr_matrix_path)
        print(f"Correlation matrix heatmap saved to {corr_matrix_path}")
        plt.close() # Close the plot figure
    except Exception as e:
        print(f"Could not save correlation matrix heatmap: {e}")
        plt.close() # Ensure figure is closed even on error
else:
    print("Skipping correlation matrix: Not enough numeric features.")


# --- Principal Component Analysis (PCA) ---
print("\n--- Principal Component Analysis (PCA) ---")
# PCA requires features to be scaled
if X_numeric.shape[1] > 0: # Need at least 1 numeric column for PCA
    print("Scaling numeric features for PCA...")
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_numeric)
    print("Features scaled.")

    print(f"Performing PCA (n_components={PCA_N_COMPONENTS})...")
    pca = PCA(n_components=PCA_N_COMPONENTS, random_state=RANDOM_STATE)
    pca.fit(X_scaled)

    # Explained variance
    explained_variance_ratio = pca.explained_variance_ratio_
    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
    n_components_fitted = pca.n_components_

    print(f"\nPCA finished. Number of components fitted: {n_components_fitted}")
    print("Explained Variance Ratio per Component:")
    for i, ratio in enumerate(explained_variance_ratio):
        print(f"  PC {i+1}: {ratio:.4f}")

    print("\nCumulative Explained Variance Ratio:")
    for i, ratio in enumerate(cumulative_variance_ratio):
        print(f"  Up to PC {i+1}: {ratio:.4f}")

    # Optional: Plot cumulative explained variance
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, n_components_fitted + 1), cumulative_variance_ratio, marker='o', linestyle='--')
    plt.title('Cumulative Explained Variance by PCA Components')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.grid(True)
    plt.tight_layout()
    # plt.show() # Display the plot interactively
    # Save the figure
    try:
        pca_variance_path = "pca_explained_variance.png"
        plt.savefig(pca_variance_path)
        print(f"PCA explained variance plot saved to {pca_variance_path}")
        plt.close()
    except Exception as e:
        print(f"Could not save PCA variance plot: {e}")
        plt.close()

    # Optional: You can transform the data if needed for other purposes
    # X_pca = pca.transform(X_scaled)
    # print(f"\nData transformed to PCA components shape: {X_pca.shape}")

else:
    print("Skipping PCA: No numeric features available.")


# --- Target Variable Preparation ---
print(f"\n--- Preparing Target Variable ---")
if TARGET_COLUMN not in df.columns:
    print(f"Error: Target column '{TARGET_COLUMN}' not found in the DataFrame.")
    exit()

y = df[TARGET_COLUMN]

# Encode target labels ('bad' -> 0, 'good' -> 1)
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(f"Target variable encoded: {list(le.classes_)} -> {list(range(len(le.classes_)))}")
# print(f"Value counts:\n{pd.Series(y_encoded).value_counts()}") # Can be verbose


# --- Data Splitting ---
print(f"\n--- Data Splitting ---")
# Ensure X and y_encoded have the same index before splitting
if not X.index.equals(pd.RangeIndex(start=0, stop=len(y_encoded), step=1)):
     print("Warning: Resetting index on X and y_encoded before splitting to ensure alignment.")
     X = X.reset_index(drop=True)
     if len(X) != len(y_encoded):
         print(f"Error: Length mismatch after processing. X length: {len(X)}, y_encoded length: {len(y_encoded)}. Exiting.")
         exit()


X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, # Use original X for XGBoost
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_encoded
)
print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
print(f"Testing set shape: X={X_test.shape}, y={y_test.shape}")

# --- Model Training ---
print("\n--- Model Training ---")
# Initialize XGBoost classifier
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=RANDOM_STATE,
    validate_parameters=True
)

# Train the model
xgb_clf.fit(X_train, y_train)
print("Model training complete.")

# --- Model Evaluation ---
print("\n--- Model Evaluation ---")
y_pred = xgb_clf.predict(X_test)
y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Print confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(pd.DataFrame(cm, index=le.classes_, columns=[f"Predicted {c}" for c in le.classes_]))

# --- Feature Importance (Optional) ---
try:
    print("\n--- Feature Importances (XGBoost) ---")
    importances = pd.DataFrame({
        'Feature': X.columns, # Use columns from original X
        'Importance': xgb_clf.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    with pd.option_context('display.max_rows', None):
        print(importances)
except Exception as e:
    print(f"Could not display feature importances: {e}")

joblib.dump(X.columns, 'feature_columns.joblib')
joblib.dump(xgb_clf, 'xgb_model.joblib')
joblib.dump(le, 'label_encoder.joblib')
print("\n--- Pipeline Finished ---")


Loading data from: textdescriptives_processed_prompt_examples_dataset.csv
Data loaded successfully.
Initial DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    2900 non-null   int64  
 1   prompt_example        2900 non-null   object 
 2   prompt_qual           2900 non-null   object 
 3   task_description      2900 non-null   object 
 4   complexity            2900 non-null   object 
 5   bad_prompt            1450 non-null   object 
 6   good_prompt           1450 non-null   object 
 7   expected_answer       2900 non-null   object 
 8   prompting_techniques  2900 non-null   object 
 9   prompt_type           2900 non-null   object 
 10  notes                 0 non-null      float64
 11  readability           2900 non-null   object 
 12  token_length          2900 non-null   object 
 13 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(mean_val, inplace=True)


Correlation matrix heatmap saved to feature_correlation_matrix.png

--- Principal Component Analysis (PCA) ---
Scaling numeric features for PCA...
Features scaled.
Performing PCA (n_components=None)...

PCA finished. Number of components fitted: 22
Explained Variance Ratio per Component:
  PC 1: 0.4446
  PC 2: 0.2115
  PC 3: 0.1046
  PC 4: 0.0661
  PC 5: 0.0423
  PC 6: 0.0353
  PC 7: 0.0262
  PC 8: 0.0248
  PC 9: 0.0141
  PC 10: 0.0088
  PC 11: 0.0080
  PC 12: 0.0053
  PC 13: 0.0046
  PC 14: 0.0020
  PC 15: 0.0010
  PC 16: 0.0007
  PC 17: 0.0000
  PC 18: 0.0000
  PC 19: 0.0000
  PC 20: 0.0000
  PC 21: 0.0000
  PC 22: 0.0000

Cumulative Explained Variance Ratio:
  Up to PC 1: 0.4446
  Up to PC 2: 0.6562
  Up to PC 3: 0.7607
  Up to PC 4: 0.8268
  Up to PC 5: 0.8692
  Up to PC 6: 0.9045
  Up to PC 7: 0.9306
  Up to PC 8: 0.9554
  Up to PC 9: 0.9695
  Up to PC 10: 0.9783
  Up to PC 11: 0.9864
  Up to PC 12: 0.9917
  Up to PC 13: 0.9963
  Up to PC 14: 0.9983
  Up to PC 15: 0.9993
  Up to P

Parameters: { "use_label_encoder" } are not used.



Model training complete.

--- Model Evaluation ---

Accuracy: 0.9862

Classification Report:
              precision    recall  f1-score   support

         bad       0.99      0.99      0.99       290
        good       0.99      0.99      0.99       290

    accuracy                           0.99       580
   macro avg       0.99      0.99      0.99       580
weighted avg       0.99      0.99      0.99       580


Confusion Matrix:
      Predicted bad  Predicted good
bad             286               4
good              4             286

--- Feature Importances (XGBoost) ---
                                    Feature  Importance
16               information_theory_entropy    0.569573
13      sentence_length_sentence_length_std    0.334469
7                           readability_rix    0.011083
11     sentence_length_sentence_length_mean    0.010337
3                   readability_gunning_fog    0.009633
14          coherence_first_order_coherence    0.008181
18   information_theor

In [22]:
import spacy
import textdescriptives as td
import pandas as pd
import numpy as np
import ast
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
# Assume joblib or pickle is used for loading
import joblib # Or import pickle

# --- 1. Load Prerequisites ---
# Load the spacy model with textdescriptives pipe
# Make sure you use the same model as in your training script!
try:
    nlp = spacy.load("en_core_web_sm")
    if "textdescriptives/all" not in nlp.pipe_names:
        nlp.add_pipe("textdescriptives/all")
except OSError:
    print("Downloading en_core_web_sm...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("textdescriptives/all")

# !!! Load your trained XGBoost model and LabelEncoder !!!
# Replace 'xgb_model.joblib' and 'label_encoder.joblib' with the actual paths
# You would need to add saving logic to your main training script first, e.g.:
# joblib.dump(xgb_clf, 'xgb_model.joblib')
# joblib.dump(le, 'label_encoder.joblib')
try:
    xgb_model = joblib.load('xgb_model.joblib')
    label_encoder = joblib.load('label_encoder.joblib')
    # !!! Load the expected feature columns !!!
    # You should save these from your training script as well
    # e.g., joblib.dump(X.columns, 'feature_columns.joblib')
    expected_columns = joblib.load('feature_columns.joblib')

except FileNotFoundError:
    print("Error: Model, LabelEncoder or Feature Columns file not found.")
    print("Please ensure you have saved these artifacts from your training script.")
    # As a fallback for demonstration, define dummy objects and columns
    # Replace these with actual loading if possible!
    xgb_model = None # Replace with actual loaded model
    label_encoder = LabelEncoder().fit(['bad', 'good']) # Dummy encoder
    # Define dummy expected columns based on the script logic (replace with actual loaded list)
    expected_columns = ['readability_flesch_reading_ease', 'readability_flesch_kincaid_grade',
                       'readability_smog', 'readability_gunning_fog', 'readability_automated_readability_index',
                       'readability_coleman_liau_index', 'readability_lix', 'readability_rix',
                       'token_length_n_tokens', 'token_length_n_unique_tokens', 'token_length_proportion_unique_tokens',
                       'sentence_length_mean', 'sentence_length_median', 'sentence_length_std',
                       'coherence_first_person_pronouns', 'coherence_third_person_pronouns',
                       'information_theory_entropy', 'information_theory_perplexity',
                       'information_theory_per_word_perplexity', 'entropy', 'perplexity', 'per_word_perplexity']
    print("Using dummy model/encoder/columns for demonstration. Replace with loaded objects.")
except Exception as e:
    print(f"An error occurred loading objects: {e}")
    exit()


# --- 2. Define Your Prompt ---
my_prompt = "You are a 4 year old child who has no ability to understand complex scientific topics. Please produce an accurate and scientifically valid PhD thesis on the relationship between quantum physics and superconductors at 0K."
# my_prompt = "green green green sleep sleep sleep furiously furiously furiously idea idea idea"
# my_prompt = "Quantum xylophones serendipitously calibrate fluorescent algorithms beneath juxtaposed moonlight, whispering ephemeral paradoxes."
# my_prompt = "Compare and contrast renewable and non-renewable energy sources, highlighting key similarities and differences."
# my_prompt = "Your task is to compare and contrast renewable and non-renewable energy sources, highlighting key similarities and differences."
# my_prompt = "You are a teacher of physics. Your task is to compare and contrast renewable and non-renewable energy sources, highlighting key similarities and differences."
# my_prompt = "tell me stuff" # Example of potentially 'bad' prompt


# --- 3. Extract Features ---
print(f"Processing prompt: '{my_prompt}'")
doc = nlp(my_prompt)

# Extract features into a dictionary
features_dict = {
    "readability": doc._.readability,
    "token_length": doc._.token_length,
    "sentence_length": doc._.sentence_length,
    "coherence": doc._.coherence,
    "information_theory": doc._.information_theory,
    # Note: entropy, perplexity etc. might be directly under doc._ or nested
    # Adjust based on how textdescriptives structures them
    "entropy": getattr(doc._, 'entropy', None), # Safely get attributes
    "perplexity": getattr(doc._, 'perplexity', None),
    "per_word_perplexity": getattr(doc._, 'per_word_perplexity', None)
}

# --- 4. Flatten and Prepare Features for Model ---
flattened_features = {}
for key, value in features_dict.items():
    if isinstance(value, dict):
        # Handle potential 'nan' before processing keys
        value_str = str(value).replace('nan', 'None')
        try:
            # Use literal_eval carefully on the cleaned string
            evaluated_value = ast.literal_eval(value_str)
            for sub_key, sub_value in evaluated_value.items():
                 # Replace potential None values resulting from 'nan' with np.nan for numeric processing
                flattened_features[f"{key}_{sub_key}"] = float(sub_value) if sub_value is not None else np.nan
        except (ValueError, SyntaxError, TypeError) as e:
            print(f"Warning: Could not parse dict feature '{key}'. Error: {e}")
            # Add keys with NaN if parsing fails but key structure is known/expected
            # This part might need adjustment based on exact expected sub-keys
            flattened_features[f"{key}_sub_key_placeholder"] = np.nan
    elif isinstance(value, (int, float)):
         flattened_features[key] = value
    elif value is None:
         flattened_features[key] = np.nan # Handle None values explicitly
    else:
         print(f"Warning: Skipping non-dict/numeric feature '{key}' of type {type(value)}")


# Create a DataFrame with a single row
input_df = pd.DataFrame([flattened_features])

# --- 5. Align Columns with Training Data ---
# Ensure the DataFrame has exactly the same columns as the training data
# Add missing columns (that were in training data) and fill with 0 or mean
missing_cols = set(expected_columns) - set(input_df.columns)
for c in missing_cols:
    input_df[c] = 0 # Or use a saved mean/median from training data if available

# Remove extra columns (that were not in training data)
extra_cols = set(input_df.columns) - set(expected_columns)
input_df = input_df.drop(columns=list(extra_cols))

# Reorder columns to match the training order
input_df = input_df[expected_columns]

# --- 6. Handle Missing Values (if any) ---
# Use the same strategy as in training (e.g., fill with 0 or saved mean)
# For simplicity here, fill remaining NaNs with 0
if input_df.isnull().sum().sum() > 0:
    print("Warning: Filling NaN values with 0 for prediction.")
    input_df.fillna(0, inplace=True)


# --- 7. Predict ---
if xgb_model is not None:
    try:
        prediction_encoded = xgb_model.predict(input_df)
        prediction_proba = xgb_model.predict_proba(input_df)

        # Decode the prediction
        predicted_label = label_encoder.inverse_transform(prediction_encoded)[0]
        confidence_good = prediction_proba[0][list(label_encoder.classes_).index('good')] # Probability of 'good'

        print("\n--- Prediction Result ---")
        print(f"Predicted Quality: {predicted_label.upper()}")
        print(f"Confidence (Good): {confidence_good:.4f}")
        print(f"Confidence (Bad): {1.0 - confidence_good:.4f}")

    except Exception as e:
        print(f"\nAn error occurred during prediction: {e}")
        print("Ensure the input data has the correct format and columns.")
else:
    print("\nCannot predict: XGBoost model was not loaded.")

print("\nSnippet finished.")

Processing prompt: 'You are a 4 year old child who has no ability to understand complex scientific topics. Please produce an accurate and scientifically valid PhD thesis on the relationship between quantum physics and superconductors at 0K.'

--- Prediction Result ---
Predicted Quality: GOOD
Confidence (Good): 0.6928
Confidence (Bad): 0.3072

Snippet finished.


  similarities.append(sent.similarity(sents[i + order]))
