In [None]:
%pip install pandas tqdm pdfplumber python-docx

In [None]:
%pip install kaggle kagglehub pandas-datasets

In [3]:
class SchemaDriftError(Exception):
    """Exception type used to signal that schema drift was detected."""


In [7]:
import os
import json
import pandas as pd
from typing import List, Dict, Any
from kaggle.api.kaggle_api_extended import KaggleApi

def authenticate_kaggle():
    """Build a ``KaggleApi`` client, call ``authenticate()`` on it, and return it.

    Returns:
        The ``KaggleApi`` instance after ``authenticate()`` has been invoked.
    """
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    return kaggle_client

def download_kaggle_dataset(api, dataset_slug: str, download_dir: str):
    """Download a Kaggle dataset archive and extract it, reusing an earlier extraction.

    Args:
        api: Object exposing ``dataset_download_files(slug, path=..., quiet=...)``.
        dataset_slug: Dataset identifier; the text after the last ``/`` is used
            as both the archive name and the extraction folder name.
        download_dir: Directory that receives the archive and the extracted folder.
            It is created if it does not exist.

    Returns:
        Path of the extraction directory, ``<download_dir>/<dataset_name>``.

    Raises:
        Any exception raised by the download call or by ``zipfile``. When
        extraction fails, the partially written directory is removed first so
        the next call downloads again instead of reusing a broken copy.
    """
    import shutil
    import zipfile

    os.makedirs(download_dir, exist_ok=True)
    dataset_name = dataset_slug.split("/")[-1]
    # NOTE(review): assumes the API writes '<download_dir>/<dataset_name>.zip' — confirm
    # against the installed kaggle package.
    zip_path = os.path.join(download_dir, f"{dataset_name}.zip")
    extract_path = os.path.join(download_dir, dataset_name)

    # Only a non-empty directory counts as a finished extraction; an empty one is
    # what an interrupted earlier run leaves behind and must not block a retry.
    if os.path.isdir(extract_path) and os.listdir(extract_path):
        print(f"✅ Dataset already extracted at '{extract_path}', skipping download.")
        return extract_path

    print(f"⬇️ Downloading dataset '{dataset_slug}'...")
    api.dataset_download_files(dataset_slug, path=download_dir, quiet=False)

    print(f"📦 Extracting dataset '{zip_path}'...")
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    except BaseException:
        # Includes KeyboardInterrupt: a half-extracted folder would otherwise be
        # mistaken for a complete dataset on the next run.
        shutil.rmtree(extract_path, ignore_errors=True)
        raise
    print(f"✅ Extraction completed at '{extract_path}'.")

    return extract_path

def find_csv_file(root_path: str, target_csv: str) -> str:
    """Walk ``root_path`` and return the path of the first file named ``target_csv``.

    Args:
        root_path: Directory whose tree is searched with ``os.walk``.
        target_csv: Exact file name to look for.

    Returns:
        Full path of the first match in ``os.walk`` order.

    Raises:
        FileNotFoundError: If no directory under ``root_path`` contains the file.
    """
    candidates = (
        os.path.join(folder, target_csv)
        for folder, _subdirs, names in os.walk(root_path)
        if target_csv in names
    )
    located = next(candidates, None)
    if located is None:
        raise FileNotFoundError(f"❌ CSV file '{target_csv}' not found inside '{root_path}'.")
    print(f"✅ Found CSV at: {located}")
    return located

def load_csv_to_dataframe(csv_path: str) -> pd.DataFrame:
    """Read ``csv_path`` with ``pd.read_csv`` using default options and report its shape.

    Args:
        csv_path: Location of the CSV file.

    Returns:
        The parsed DataFrame.
    """
    print(f"📖 Loading CSV file '{csv_path}'...")
    frame = pd.read_csv(csv_path)
    rows_and_cols = frame.shape
    print(f"✅ CSV loaded successfully. Shape: {rows_and_cols}")
    return frame

def infer_schema(df: pd.DataFrame) -> Dict[str, Any]:
    """Map every column of ``df`` to a Python type: ``str``, ``int``, ``float`` or ``bool``.

    The dtype is taken from the column after dropping nulls and running
    ``infer_objects()``. The checks run in the order string, integer, float,
    bool; a dtype matching none of them is recorded as ``str``.

    Args:
        df: Frame whose columns are inspected.

    Returns:
        ``{column_name: python_type}`` in column order.
    """
    # Order matters: the first predicate that accepts the dtype wins.
    dtype_checks = (
        (pd.api.types.is_string_dtype, str),
        (pd.api.types.is_integer_dtype, int),
        (pd.api.types.is_float_dtype, float),
        (pd.api.types.is_bool_dtype, bool),
    )

    schema = {}
    for col in df.columns:
        observed = df[col].dropna().infer_objects().dtype
        schema[col] = next(
            (py_type for accepts, py_type in dtype_checks if accepts(observed)),
            str,  # fallback: treat unknown types as str
        )
    print(f"🔍 Inferred Schema: {schema}")
    return schema

def dataframe_to_json_records(df: pd.DataFrame) -> List[dict]:
    """Turn ``df`` into a list of row dicts, with every null replaced by ``''``.

    The caller's frame is not modified; ``fillna`` returns a new frame.

    Args:
        df: Frame to convert.

    Returns:
        One ``{column: value}`` dict per row.
    """
    rows = df.fillna('').to_dict(orient='records')
    print(f"✅ Converted DataFrame to {len(rows)} JSON records.")
    return rows

def validate_json_against_schema(records: List[dict], schema: Dict[str, Any]):
    """Check that every record has each schema field with a compatible value.

    A value is accepted when it is ``''`` or ``None``, when it is already an
    instance of the expected type, or when calling the expected type on it
    succeeds. Validation stops at the first problem.

    Args:
        records: Row dicts to check.
        schema: ``{field: expected_type}`` mapping.

    Raises:
        ValueError: A record lacks a schema field.
        TypeError: A value cannot be converted to the expected type.
    """
    for row_number, row in enumerate(records):
        for name, wanted in schema.items():
            if name not in row:
                raise ValueError(f"❌ Record {row_number}: Missing field '{name}'.")

            cell = row[name]

            # Empty string / None stand for a null and are always acceptable.
            is_blank = cell == '' or cell is None
            if is_blank or isinstance(cell, wanted):
                continue

            # CSV values may arrive as the wrong type (e.g. numbers as strings),
            # so a value passes as long as it can be cast.
            try:
                wanted(cell)
            except (ValueError, TypeError):
                raise TypeError(f"❌ Record {row_number}: Field '{name}' expected {wanted.__name__}, got {type(cell).__name__} with value '{cell}'.")
    print("✅ Schema validation passed for all records.")

def save_json(data: List[dict], filepath: str):
    """Write ``data`` to ``filepath`` as indented UTF-8 JSON, replacing any existing file.

    Args:
        data: Records to serialise with ``json.dump``.
        filepath: Destination file. Missing parent directories are created; a
            bare file name (no directory part) is written to the current
            working directory.
    """
    if os.path.exists(filepath):
        os.remove(filepath)
        print(f"🧹 Old JSON file '{filepath}' deleted.")

    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"✅ New JSON file saved to '{filepath}'.")

def download_and_load_kaggle_dataset(api, dataset_slug: str, download_dir: str = "datasets", csv_filename: str = None) -> pd.DataFrame:
    """Fetch one dataset, locate the requested CSV inside it, and load it.

    Args:
        api: Client forwarded to ``download_kaggle_dataset``.
        dataset_slug: Dataset identifier forwarded to ``download_kaggle_dataset``.
        download_dir: Directory used for the download and extraction.
        csv_filename: Name of the CSV file to search for; required despite the
            ``None`` default.

    Returns:
        The DataFrame read from the located CSV.

    Raises:
        ValueError: If ``csv_filename`` is ``None`` (checked before any download).
    """
    if csv_filename is None:
        raise ValueError("You must specify the CSV filename to load.")

    dataset_root = download_kaggle_dataset(api, dataset_slug, download_dir)
    return load_csv_to_dataframe(find_csv_file(dataset_root, csv_filename))

def download_and_load_multiple_datasets(dataset_info: list, download_dir: str = "datasets") -> dict:
    """Authenticate once, then download and load every dataset described in ``dataset_info``.

    Args:
        dataset_info: Dicts with a ``'slug'`` and a ``'csv'`` entry and an
            optional ``'key'``. Without ``'key'``, the part of the slug after
            the last ``/`` is used.
        download_dir: Directory shared by all downloads.

    Returns:
        ``{key: DataFrame}`` for every entry.
    """
    kaggle_api = authenticate_kaggle()
    frames_by_key = {}

    for entry in dataset_info:
        slug = entry['slug']
        print(f"\n🚀 Processing dataset: {slug}")
        frame = download_and_load_kaggle_dataset(kaggle_api, slug, download_dir, entry['csv'])
        frames_by_key[entry.get('key', slug.split('/')[-1])] = frame

    print("\n✅ All datasets loaded successfully.")
    return frames_by_key

def convert_and_save_datasets_with_schema(datasets: dict, output_dir: str = "json_outputs") -> dict:
    """Convert each DataFrame to JSON records, validate them, and write one file per dataset.

    For every ``key`` the schema is inferred from the frame, the frame is
    converted to records, the records are validated against that schema, and
    the result is saved as ``<output_dir>/<key>.json``.

    Args:
        datasets: ``{key: DataFrame}`` mapping.
        output_dir: Directory for the JSON files; created if missing.

    Returns:
        ``{key: records}`` for every dataset.
    """
    os.makedirs(output_dir, exist_ok=True)
    converted = {}

    for name, frame in datasets.items():
        print(f"\n🚀 Converting dataset '{name}' to JSON with schema validation...")
        inferred = infer_schema(frame)
        rows = dataframe_to_json_records(frame)
        validate_json_against_schema(rows, inferred)

        save_json(rows, os.path.join(output_dir, f"{name}.json"))
        converted[name] = rows

    print("\n✅ All datasets converted, validated with schema inference, and saved.")
    return converted

def save_schema(schema: Dict[str, Any], schema_path: str):
    """Write ``schema`` to ``schema_path`` as JSON, storing each type by its ``__name__``.

    Args:
        schema: ``{field: type}`` mapping.
        schema_path: Destination file. Missing parent directories are created;
            a bare file name is written to the current working directory.
    """
    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(schema_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    schema_serializable = {k: v.__name__ for k, v in schema.items()}
    with open(schema_path, 'w', encoding='utf-8') as f:
        json.dump(schema_serializable, f, indent=4)
    print(f"✅ Baseline schema saved at '{schema_path}'.")

def load_schema(schema_path: str) -> Dict[str, Any]:
    """Read a schema JSON file and turn each stored type name back into a type.

    Args:
        schema_path: JSON file mapping field names to type names such as
            ``"int"`` or ``"str"``.

    Returns:
        ``{field: type}`` mapping.

    Raises:
        ValueError: If a stored value is not the name of a builtin type.
    """
    import builtins

    with open(schema_path, 'r', encoding='utf-8') as f:
        schema_data = json.load(f)

    # SECURITY: this used to call eval() on every value, which executes any
    # expression placed in the schema file. Names are now only looked up among
    # the builtin types, which covers every builtin type name (str, int, float,
    # bool, ...) without running code.
    schema = {}
    for field, type_name in schema_data.items():
        resolved = getattr(builtins, type_name, None) if isinstance(type_name, str) else None
        if not isinstance(resolved, type):
            raise ValueError(
                f"❌ Schema file '{schema_path}': unsupported type name {type_name!r} for field '{field}'."
            )
        schema[field] = resolved
    return schema

def compare_schemas(baseline: Dict[str, Any], current: Dict[str, Any]) -> List[str]:
    """Describe every difference between a baseline schema and a current one.

    Args:
        baseline: ``{field: type}`` mapping treated as the reference.
        current: ``{field: type}`` mapping being checked.

    Returns:
        Messages in this order: one for fields missing from ``current`` (if
        any), one for fields only in ``current`` (if any), then one per shared
        field whose type differs. An empty list means the schemas match.
    """
    baseline_fields, current_fields = set(baseline), set(current)
    removed = baseline_fields - current_fields
    added = current_fields - baseline_fields

    report = []
    if removed:
        report.append(f"❌ Missing fields in new data: {removed}")
    if added:
        report.append(f"⚠️ Extra new fields in new data: {added}")

    report.extend(
        f"❌ Field '{name}' type mismatch: baseline {baseline[name].__name__}, current {current[name].__name__}"
        for name in baseline_fields & current_fields
        if baseline[name] != current[name]
    )
    return report


def convert_and_save_datasets_with_schema_and_drift_detection(
    datasets: dict, 
    output_dir: str = "json_outputs", 
    schema_dir: str = "schemas",
    exclude_fields: dict = None
) -> dict:
    """Convert datasets to JSON files, checking each against a stored baseline schema.

    Per dataset: optionally drop excluded columns, infer the schema, convert
    to records, then either compare with the baseline at
    ``<schema_dir>/<key>_schema.json`` or, when none exists, save the inferred
    schema as the new baseline. Records are then validated against the
    inferred schema and written to ``<output_dir>/<key>.json``.

    Args:
        datasets: ``{key: DataFrame}`` mapping.
        output_dir: Directory for JSON output; created if missing.
        schema_dir: Directory for baseline schemas; created if missing.
        exclude_fields: Optional ``{key: [column, ...]}``; listed columns are
            dropped before processing and unknown names are ignored.

    Returns:
        ``{key: records}`` for every processed dataset.

    Raises:
        SchemaDriftError: The inferred schema differs from the stored baseline.
            Processing stops at that dataset and its JSON is not written.
    """
    for folder in (output_dir, schema_dir):
        os.makedirs(folder, exist_ok=True)
    results = {}

    for name, frame in datasets.items():
        print(f"\n🚀 Converting dataset '{name}' to JSON with schema validation, drift detection, and field filtering...")

        # Drop the columns listed for this dataset, if any were given.
        if exclude_fields and name in exclude_fields:
            unwanted = exclude_fields[name]
            frame = frame.drop(columns=unwanted, errors='ignore')
            print(f"🧹 Excluded fields from '{name}': {unwanted}")

        inferred = infer_schema(frame)
        rows = dataframe_to_json_records(frame)
        baseline_file = os.path.join(schema_dir, f"{name}_schema.json")

        if not os.path.exists(baseline_file):
            # First run for this dataset: the inferred schema becomes the baseline.
            print(f"🆕 No baseline schema found for '{name}'. Saving new baseline.")
            save_schema(inferred, baseline_file)
        else:
            problems = compare_schemas(load_schema(baseline_file), inferred)
            if problems:
                for line in problems:
                    print(line)
                raise SchemaDriftError(f"❌ Schema drift detected in dataset '{name}'. Please review changes.")
            print(f"✅ No schema drift detected for dataset '{name}'.")

        validate_json_against_schema(rows, inferred)
        save_json(rows, os.path.join(output_dir, f"{name}.json"))
        results[name] = rows

    print("\n✅ All datasets converted, validated, checked for schema drift, and fields filtered.")
    return results



In [5]:
# Datasets to fetch. "slug" is the Kaggle dataset identifier, "csv" is the file
# name searched for inside the extracted archive, and "key" is the name under
# which the loaded DataFrame is stored in `datasets`.
dataset_info = [
    {
        "slug": "snehaanbhawal/resume-dataset",
        "csv": "Resume.csv",
        "key": "resume_data"
    },
    {
        "slug": "arshkon/linkedin-job-postings",
        "csv": "postings.csv",
        "key": "job_postings"
    }
]
# Download each dataset (or reuse a copy that is already extracted) and load its CSV.
datasets = download_and_load_multiple_datasets(dataset_info)



🚀 Processing dataset: snehaanbhawal/resume-dataset
✅ Dataset already extracted at 'datasets\resume-dataset', skipping download.
✅ Found CSV at: datasets\resume-dataset\Resume\Resume.csv
📖 Loading CSV file 'datasets\resume-dataset\Resume\Resume.csv'...
✅ CSV loaded successfully. Shape: (2484, 4)

🚀 Processing dataset: arshkon/linkedin-job-postings
✅ Dataset already extracted at 'datasets\linkedin-job-postings', skipping download.
✅ Found CSV at: datasets\linkedin-job-postings\postings.csv
📖 Loading CSV file 'datasets\linkedin-job-postings\postings.csv'...
✅ CSV loaded successfully. Shape: (123849, 31)

✅ All datasets loaded successfully.


In [9]:
# Alternative without drift detection or field filtering:
# json_structures = convert_and_save_datasets_with_schema(datasets)
# Columns to drop per dataset key before schema inference and JSON conversion.
exclude_fields = {
    "resume_data": ["Resume_html"],   # Exclude heavy HTML field only for resume dataset
    "job_postings": []                # Nothing to exclude for now in JD dataset
}
json_structures = convert_and_save_datasets_with_schema_and_drift_detection(datasets, exclude_fields=exclude_fields)


🚀 Converting dataset 'resume_data' to JSON with schema validation, drift detection, and field filtering...
🧹 Excluded fields from 'resume_data': ['Resume_html']
🔍 Inferred Schema: {'ID': <class 'int'>, 'Resume_str': <class 'str'>, 'Category': <class 'str'>}
✅ Converted DataFrame to 2484 JSON records.
🆕 No baseline schema found for 'resume_data'. Saving new baseline.
✅ Baseline schema saved at 'schemas\resume_data_schema.json'.
✅ Schema validation passed for all records.
🧹 Old JSON file 'json_outputs\resume_data.json' deleted.
✅ New JSON file saved to 'json_outputs\resume_data.json'.

🚀 Converting dataset 'job_postings' to JSON with schema validation, drift detection, and field filtering...
🧹 Excluded fields from 'job_postings': []
🔍 Inferred Schema: {'job_id': <class 'int'>, 'company_name': <class 'str'>, 'title': <class 'str'>, 'description': <class 'str'>, 'max_salary': <class 'float'>, 'pay_period': <class 'str'>, 'location': <class 'str'>, 'company_id': <class 'float'>, 'views': <