In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import csv

# =========================================================================
# === STEP 0: DEFINE FILE PATH HERE =======================================
# =========================================================================
# File path from your execution history
CSV_FILE_PATH = "/content/conversations_large.csv"
# =========================================================================

# --- STEP 1: Load Data with Robust Parsing (Fixes ParserError) ---
def _read_conversations(path, encoding):
    """Read the conversations CSV with the lenient parser settings of this notebook.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    encoding : str
        Text encoding used to decode the file.

    Returns
    -------
    pandas.DataFrame
        The parsed rows.

    Notes
    -----
    ``quoting=csv.QUOTE_NONE`` makes the parser treat quote characters as
    ordinary data, and ``on_bad_lines='skip'`` drops offending lines instead
    of raising.
    NOTE(review): with quoting disabled, commas inside quoted text split into
    extra fields, so such rows are presumably among the skipped ones - confirm
    that the number of dropped rows is acceptable for this dataset.
    """
    return pd.read_csv(
        path,
        engine='python',
        encoding=encoding,
        quoting=csv.QUOTE_NONE,
        on_bad_lines='skip',
    )


try:
    # First attempt: the file is expected to be UTF-8.
    df = _read_conversations(CSV_FILE_PATH, 'utf-8')
except UnicodeDecodeError as decode_error:
    # Only a decoding failure can be cured by switching the encoding. Any other
    # error (missing file, parser failure, ...) propagates from the first
    # attempt with its original traceback instead of being silently retried.
    print(f"UTF-8 decoding failed ({decode_error}); retrying with latin1.")
    try:
        df = _read_conversations(CSV_FILE_PATH, 'latin1')
    except Exception as e:
        print(f"Error: Final attempt to load the file failed. Error details: {e}")
        raise

# --- CRITICAL FIX: CLEAN COLUMN NAMES (Fixes KeyError) ---
# Normalise the headers: trim surrounding whitespace, lower-case, and collapse
# every run of characters outside [a-z0-9_] into a single underscore.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace('[^a-z0-9_]+', '_', regex=True)
)

# --- STEP 2: Preprocessing and Feature Engineering ---

TARGET_COLUMN = 'outcome'

# Fail fast when the label column is not among the cleaned headers.
if TARGET_COLUMN not in df.columns:
    print(f"Error: Target column '{TARGET_COLUMN}' is missing after cleaning. Aborting.")
    raise KeyError(f"Target column '{TARGET_COLUMN}' is missing.")

# Map every distinct outcome label to an integer code (one code per class).
# The fitted encoder stays in `le` so codes can be translated back to labels.
le = LabelEncoder()
le.fit(df[TARGET_COLUMN])
df['outcome_encoded'] = le.transform(df[TARGET_COLUMN])
print(f"Original dataset size: {df.shape[0]} rows.")

# --- STEP 2.5: Handle Rare Classes (Fixes ValueError) ---
# Count the rows per encoded outcome and collect the codes seen fewer than
# two times; these are removed so the later stratified split does not fail.
class_counts = df['outcome_encoded'].value_counts()
rare_classes = class_counts.loc[class_counts.lt(2)].index.tolist()

if len(rare_classes) > 0:
    # Boolean mask of the rows whose outcome belongs to a rare class
    is_rare = df['outcome_encoded'].isin(rare_classes)
    df_filtered = df.loc[~is_rare].copy()
    print(f"Removed {len(df) - len(df_filtered)} row(s) corresponding to single-member outcome classes.")
    df = df_filtered

# --- Continue Feature Engineering on the filtered DataFrame ---

# Columns that are never used as model inputs (plus the raw target itself)
COLUMNS_TO_EXCLUDE = [
    'conv_id',
    'session_id',
    'customer_name',
    'agent_name',
    'timestamp',
    'text',
    'masked_order_id',
    TARGET_COLUMN,
]

# Restrict the exclusion list to columns that are actually present, so the
# drop below cannot fail on a name that is absent from this file.
present_columns = set(df.columns)
COLUMNS_TO_DROP_ACTUAL = [name for name in COLUMNS_TO_EXCLUDE if name in present_columns]

# Target vector (y) and feature frame (X); the encoded target is removed
# from X as well so it cannot be used as an input.
y = df['outcome_encoded']
X = df.drop(columns=[*COLUMNS_TO_DROP_ACTUAL, 'outcome_encoded'])

# Rows without a coupon get an explicit placeholder category
if 'coupon_code' in X.columns:
    X = X.assign(coupon_code=X['coupon_code'].fillna('NO_CODE'))

# Identify columns for processing
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# One-hot encode every categorical feature (first level dropped per feature)
X_processed = pd.get_dummies(X, columns=categorical_features, drop_first=True)


# --- STEP 3: Train-Test Split (Now Safe with Stratify) ---
# The split happens BEFORE scaling so that the scaler's mean/variance are
# estimated from the training rows only. Fitting the scaler on the full
# dataset would leak test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.3, random_state=42, stratify=y
)

# Scale numerical columns: fit on the training split, apply to both splits.
# Note: X_processed itself is left unscaled; use X_train / X_test downstream.
scaler = StandardScaler()
features_to_scale = [f for f in numerical_features if f in X_processed.columns]
if features_to_scale:
    # Work on explicit copies so the assignments below never write into a
    # slice of X_processed.
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
    X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

print(f"\nFinal training set size: {X_train.shape[0]} samples")
print(f"Final testing set size: {X_test.shape[0]} samples")


# --- STEP 4: Model Training ---
# Announce the step, then build and fit the classifier in one expression
# (fit() returns the estimator itself, so `model` is the fitted model).
# NOTE(review): recent scikit-learn releases reportedly warn when liblinear is
# used with more than two classes - confirm against the installed version.
print("\n--- Training Logistic Regression Model ---")
model = LogisticRegression(random_state=42, solver='liblinear').fit(X_train, y_train)


# --- STEP 5: Model Evaluation ---
y_pred = model.predict(X_test)
target_names = le.classes_ # All labels seen by the encoder (some might not be in the final data)

# Report on every class code that occurs in the true OR the predicted labels.
# Passing these codes explicitly keeps `labels` and `target_names` aligned;
# deriving the names from y_test alone makes classification_report fail as
# soon as the model predicts a class that has no row in the test split.
report_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
target_names_filtered = [str(name) for name in le.inverse_transform(report_labels)]


print("\n--- Evaluation on Test Set ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}\n")
print("Classification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=report_labels,
        target_names=target_names_filtered,
        zero_division=0,  # undefined precision/recall is reported as 0 without a warning
    )
)




Original dataset size: 26074 rows.
Removed 1 row(s) corresponding to single-member outcome classes.

Final training set size: 18251 samples
Final testing set size: 7822 samples

--- Training Logistic Regression Model ---

--- Evaluation on Test Set ---
Accuracy: 0.74

Classification Report:
                         precision    recall  f1-score   support

     Closed - No Action       0.35      0.34      0.35       614
              Escalated       0.32      0.35      0.33       628
       Pending Customer       0.34      0.31      0.32       579
         Pending Vendor       0.30      0.31      0.31       604
               Resolved       0.31      0.30      0.31       569
          activate_esim       0.99      0.99      0.99       399
         api_rate_limit       0.97      0.98      0.98       266
           apply_coupon       1.00      1.00      1.00       371
       book_appointment       1.00      1.00      1.00       322
           cancel_order       0.99      0.97      0.98   

In [2]:

section_rule = "=" * 50
print(f"\n{section_rule}")
print("1. DATA HEAD AND INFO")
print(section_rule)

# First five rows, rendered as a markdown table without the index column
print("\nFirst 5 rows of the dataset:")
head_table = df.head().to_markdown(index=False)
print(head_table)

# df.info() writes column dtypes and non-null counts directly to stdout
print("\nColumn Information (Data Types and Missing Values):")
df.info()

# Summary statistics, transposed so that each described column becomes a row
print("\nSummary Statistics for Numerical Columns:")
stats_table = df.describe().T.to_markdown()
print(stats_table)



1. DATA HEAD AND INFO

First 5 rows of the dataset:
|   conv_id | turn_index   | role                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            | text                                                                                                                                                                                                                                                                                                                                                                                                                                                