# 1. Data cleaning and exploration

Use these cells to scan everything under `data/archive/` so we know what to clean and how to shape a loader later.

## Setup and helpers

- Reads each CSV with a row cap to stay memory-safe.
- Shows basic column profile (dtype, null %, unique counts).
- Plots a quick histogram of the first numeric column and the top categories of the first non-numeric column.

In [7]:
from pathlib import Path
from IPython.display import display

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Location of the raw CSV archive (a relative path, resolved against the
# current working directory).
archive_dir = Path('../data/archive')
# Row cap applied when sampling a CSV; adjust if you need more/less for big files.
sample_rows = 150_000

# Presentation defaults for the whole notebook: white-grid seaborn theme and
# up to 50 columns shown when a DataFrame is displayed.
sns.set_style('whitegrid')
pd.set_option('display.max_columns', 50)

def read_csv_sample(path: Path, nrows: 'int | None' = None) -> pd.DataFrame:
    """Read a CSV with a row limit to avoid blowing up memory.

    Args:
        path: CSV file to read.
        nrows: Maximum number of rows to load. When omitted (``None``), the
            current value of the notebook-level ``sample_rows`` is used.

    Returns:
        A DataFrame with at most ``nrows`` rows taken from the start of the
        file.
    """
    # Resolve the cap at call time instead of in the signature: a default of
    # ``nrows=sample_rows`` is evaluated once, when the ``def`` runs, so
    # reassigning ``sample_rows`` in a later cell would silently be ignored.
    if nrows is None:
        nrows = sample_rows
    return pd.read_csv(path, nrows=nrows)

def profile(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise each column of ``df``, columns with the most nulls first.

    The result is indexed by column name and has four fields:

    * ``dtype``    - the column's dtype
    * ``non_null`` - number of non-missing values
    * ``null_pct`` - percentage of missing values, rounded to 2 decimals
    * ``unique``   - distinct-value count from ``DataFrame.nunique``
    """
    present_counts = df.notna().sum()
    missing_share = df.isna().mean()
    column_summary = {
        'dtype': df.dtypes,
        'non_null': present_counts,
        'null_pct': (missing_share * 100).round(2),
        'unique': df.nunique(),
    }
    report = pd.DataFrame(column_summary)
    return report.sort_values(by='null_pct', ascending=False)

def plot_quick(df: pd.DataFrame, title: str) -> None:
    """Draw up to two quick-look charts for ``df``.

    * A 40-bin histogram of the first numeric column, nulls dropped.
    * A horizontal bar chart of the 15 most frequent values in the first
      non-numeric column.

    Either chart is skipped when ``df`` has no column of that kind.
    ``title`` is used as the prefix of each chart title.
    """
    numeric_names = df.select_dtypes(include='number').columns.tolist()
    other_names = df.select_dtypes(exclude='number').columns.tolist()

    if len(numeric_names) > 0:
        first_numeric = numeric_names[0]
        plt.figure(figsize=(8, 3))
        observed = df[first_numeric].dropna()
        sns.histplot(observed, bins=40)
        plt.title(f'{title} - {first_numeric}')
        plt.tight_layout()
        plt.show()

    if len(other_names) > 0:
        first_other = other_names[0]
        frequencies = df[first_other].value_counts()
        leaders = frequencies.head(15)
        plt.figure(figsize=(8, 4))
        # Counts run along x and labels along y, hence the horizontal orientation.
        sns.barplot(x=leaders.values, y=leaders.index, orient='h')
        plt.title(f'{title} - top {first_other}')
        plt.xlabel('count')
        plt.tight_layout()
        plt.show()


## List archive files

Quick inventory with file sizes to gauge what to sample.

In [9]:
# Gather every CSV below the archive root (recursive search), sorted by path.
csv_files = sorted(archive_dir.rglob('*.csv'))
print(f'Found {len(csv_files)} CSV files under {archive_dir}')

# One line per file: path relative to the archive root, then size in MB
# (1 MB = 10^6 bytes here).
for csv_path in csv_files:
    # Convert to str first: a Path object rejects the width format spec below.
    rel = str(csv_path.relative_to(archive_dir))
    size_mb = csv_path.stat().st_size / 1e6
    print(f'{rel:35} {size_mb:8.1f} MB')


Found 11 CSV files under ..\data\archive


TypeError: unsupported format string passed to WindowsPath.__format__

## Per-file quick profile

Loops through every CSV: sample rows, preview head, column profile, and quick plots. Increase `sample_rows` if needed.

In [None]:
# For each discovered CSV show, in order: a header line, the shape of the
# sample, the first rows, the column profile, and the quick-look plots.
for csv_path in csv_files:
    label = csv_path.relative_to(archive_dir)
    print(f"\n=== {label} ===")

    df = read_csv_sample(csv_path)
    print('shape (sample):', df.shape)

    preview = df.head()
    display(preview)

    column_report = profile(df)
    display(column_report)

    plot_quick(df, csv_path.stem)


## Key tables to feed loader.py

Load samples of the main tables we will likely join (`postings`, `companies`, `jobs` components, `mappings`) to inspect column names and keys for cleaning.

In [None]:
# Table name -> CSV path, built from locations relative to the archive root.
key_paths = {
    table: archive_dir / relative
    for table, relative in {
        'postings': 'postings.csv',
        'companies': 'companies/companies.csv',
        'company_industries': 'companies/company_industries.csv',
        'company_specialities': 'companies/company_specialities.csv',
        'jobs_benefits': 'jobs/benefits.csv',
        'jobs_industries': 'jobs/job_industries.csv',
        'jobs_skills': 'jobs/job_skills.csv',
        'jobs_salaries': 'jobs/salaries.csv',
        'map_industries': 'mappings/industries.csv',
        'map_skills': 'mappings/skills.csv',
    }.items()
}

# `postings` is read with the full sample cap; every other table is capped
# at 50,000 rows (or at `sample_rows`, if that is smaller).
secondary_cap = min(sample_rows, 50_000)

frames = {}
for table, csv_path in key_paths.items():
    if csv_path.exists():
        row_cap = sample_rows if table == 'postings' else secondary_cap
        loaded = read_csv_sample(csv_path, nrows=row_cap)
        frames[table] = loaded
        print(f'{table}: {loaded.shape} from {csv_path.name}')
        display(loaded.head())
        display(profile(loaded))
    else:
        # Report the missing file and carry on with the remaining tables.
        print(f'Missing: {table} at {csv_path}')


## Next steps for cleaning

- Decide join keys (e.g., `job_id`, `company_id`) from the previews above.
- Drop or fill high-null columns; deduplicate by ids.
- Standardize text fields (`job_title`, `description`, `skills`) for vectorization.
- Save a cleaned, joined table to `data/processed/clean_jobs.csv` for use in `src/loader.py`.