## Company classifier notebook

This notebook is for experimenting with the company-fit classifier.
Visualizations and other exploratory analysis can go here.

In [3]:
# Scratch cell: prints a fixed greeting and defines nothing used by later cells.
print("hello world")

hello world


### Setup and Imports

In [5]:
import sys
import os
# Add the parent of the current working directory to the Python path so we can
# import company_classifier. This relies on the kernel's working directory at
# the time the cell runs; the directory added is whatever sits one level above it.
# The membership check keeps the cell idempotent: re-running it does not append
# the same entry to sys.path again.
_parent_dir = os.path.dirname(os.getcwd())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from company_classifier.classifier import (
    BAD_FIT,
    GOOD_FIT,
    NEED_MORE_INFO,
    CompanyClassifier,
)

### Load and Prepare Data

In [7]:
def load_real_data(csv_path: str) -> tuple[pd.DataFrame, np.ndarray]:
    """Load company ratings from a CSV file and split them into features and labels.

    Args:
        csv_path: Path of the CSV file to read. It must contain the feature
            columns listed in ``feature_columns`` below and a ``fit_category``
            column whose values are ``"good"``, ``"bad"`` or ``"needs_more_info"``.

    Returns:
        A tuple ``(X, y)``. ``X`` is a copy of the feature columns, in the order
        listed in ``feature_columns``. ``y`` is a NumPy array holding, for each
        row, the ``GOOD_FIT`` / ``BAD_FIT`` / ``NEED_MORE_INFO`` constant that
        corresponds to the row's ``fit_category``.

    Raises:
        ValueError: If ``fit_category`` contains a missing value or a label
            that is not one of the three recognised categories.
        KeyError: If any expected column is absent from the CSV (raised by pandas).
    """
    df = pd.read_csv(csv_path)

    # Map text categories to numeric values
    category_map = {"good": GOOD_FIT, "bad": BAD_FIT, "needs_more_info": NEED_MORE_INFO}

    # Features used by our classifier
    feature_columns = [
        "type",
        "total_comp",
        "base",
        "rsu",
        "bonus",
        "remote_policy",
        "eng_size",
        "total_size",
    ]

    # Series.map turns any label missing from category_map into NaN without
    # complaint, which would make y a float array and break label lookups later.
    # Detect that up front and fail with a message that names the bad values.
    labels = df["fit_category"]
    unrecognized = labels[~labels.isin(list(category_map))]
    if not unrecognized.empty:
        bad_values = sorted({repr(value) for value in unrecognized.unique()})
        raise ValueError(
            f"Unrecognized fit_category value(s) in {csv_path!r}: "
            f"{', '.join(bad_values)}; expected one of {sorted(category_map)}"
        )

    # Copy so the returned frame is independent of the full table read above.
    X = df[feature_columns].copy()

    # Convert fit categories to numeric values
    y = labels.map(category_map).to_numpy()

    return X, y

# Read the ratings CSV (path is relative to the current working directory)
X, y = load_real_data("../company_ratings.csv")

# Human-readable name for each numeric class constant
class_labels = {
    GOOD_FIT: "Good fit",
    BAD_FIT: "Bad fit",
    NEED_MORE_INFO: "Need more info",
}

# Report the number of rows, then one count line per class present in y
print(f"Dataset size: {len(X)} companies\n")
print("Class distribution:")
classes, counts = np.unique(y, return_counts=True)
for cls, count in zip(classes, counts):
    label = class_labels[cls]
    print(f"{label}: {count}")

Dataset size: 47 companies

Class distribution:
Bad fit: 9
Good fit: 22
Need more info: 16


### Train-Test Split and Initial Evaluation

### Cross-Validation Analysis

### Feature Importance Analysis
