# In [6]:
# create_test_data.py
import pandas as pd
import pickle
import os
from sklearn.model_selection import train_test_split

def create_test_data(
    dataset_path="balanced_dataset/balanced_dataset.csv",
    output_dir="train_test_data",
    test_size=0.2,
    random_state=42,
):
    """Split the balanced dataset into stratified train and test CSV files.

    The label and text columns are auto-detected by looking for the first
    matching name in a fixed list of candidates. The split is stratified on
    the detected label column. The text column is only validated and
    reported; it does not influence the split.

    Args:
        dataset_path: CSV file to read the balanced dataset from.
        output_dir: Directory that receives ``train_data.csv`` and
            ``test_data.csv``; created if it does not exist.
        test_size: Passed through to ``train_test_split``.
        random_state: Seed passed through to ``train_test_split`` so the
            split is reproducible.

    Returns:
        True when both CSV files were written. False when the dataset could
        not be loaded, the columns could not be detected, or the stratified
        split could not be computed (a message is printed in each case).
    """
    # Load balanced dataset. A missing or empty file is reported the same way
    # as every other failure in this function: print a message, return False.
    try:
        balanced_df = pd.read_csv(dataset_path)
    except (FileNotFoundError, pd.errors.EmptyDataError) as exc:
        print(f"Could not load dataset from {dataset_path}: {exc}")
        return False

    # Auto-detect columns: candidates are listed in priority order and the
    # first one present in the frame wins.
    label_candidates = ('label', 'category', 'class', 'target', 'industry', 'industry_name')
    text_candidates = ('text', 'content', 'summary', 'description', 'full_summary')

    label_col = next((name for name in label_candidates if name in balanced_df.columns), None)
    text_col = next((name for name in text_candidates if name in balanced_df.columns), None)

    if label_col is None or text_col is None:
        print("Could not auto-detect columns")
        print(f"Available columns: {balanced_df.columns.tolist()}")
        return False

    print(f"Using label column: {label_col}")
    print(f"Using text column: {text_col}")

    # Split into train and test. train_test_split raises ValueError when the
    # stratified split is impossible (for example a label with a single row,
    # or an empty frame), so surface that as a regular failure.
    try:
        train_df, test_df = train_test_split(
            balanced_df,
            test_size=test_size,
            random_state=random_state,
            stratify=balanced_df[label_col],
        )
    except ValueError as exc:
        print(f"Could not create a stratified split: {exc}")
        return False

    # Create the output directory if it doesn't exist.
    os.makedirs(output_dir, exist_ok=True)

    # Forward slashes keep the printed paths identical on every platform.
    train_path = f"{output_dir}/train_data.csv"
    test_path = f"{output_dir}/test_data.csv"

    # Save train and test data. The input dataset is deliberately NOT written
    # back: re-serialising a file that was just read adds nothing and risks
    # truncating the source data if the write is interrupted.
    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)

    print(f"Train data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")
    print(f"Train data saved to: {train_path}")
    print(f"Test data saved to: {test_path}")

    return True

if __name__ == "__main__":
    # create_test_data() reports failure by returning False; turn that into a
    # non-zero exit status so a failed run is detectable by the caller.
    if not create_test_data():
        raise SystemExit(1)

# Output:
# Using label column: industry
# Using text column: full_summary
# Train data shape: (217, 12)
# Test data shape: (55, 12)
# Train data saved to: train_test_data/train_data.csv
# Test data saved to: train_test_data/test_data.csv
