In [1]:
import pandas as pd
import numpy as np
import random

In [3]:
# Read the raw job dataset once and keep the untouched frame as the reference
# snapshot; `df` is the working copy handed to later cells.
df_original = pd.read_csv("airflow/data/job.csv")
df = df_original.copy()

print("Loaded dataset. Shape:", df.shape)
df.head()


Loaded dataset. Shape: (19158, 14)


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0


In [4]:
ERROR_RATE = 0.15   # identical to split_dataset.py

# error type -> (candidate columns, injected value, description written to the log).
# Insertion order matters: the error type is drawn with `choice` over this order.
_ERROR_SPECS = {
    "nan_value": (("city_development_index", "training_hours"), np.nan, "Inserted NaN"),
    "inf_value": (("city_development_index", "training_hours"), np.inf, "Inserted INF"),
    "bad_gender": (("gender",), "UnknownGender", "Bad gender value"),
    "bad_experience": (("relevent_experience",), "Experience??", "Bad experience category"),
    "negative_hours": (("training_hours",), -10, "Negative hours"),
    "string_in_numeric": (("training_hours",), "wrong_value", "String in numeric column"),
    "huge_outlier": (("city_development_index",), 1000, "Extreme outlier"),
}


def _write_cell(df, pos, col, value):
    """Set row position ``pos`` of column ``col`` to ``value``, widening the dtype first."""
    dtype = df[col].dtype
    if isinstance(value, str):
        # A string cannot live in a numeric column; convert explicitly instead of
        # relying on pandas' deprecated silent upcast during item assignment.
        if pd.api.types.is_numeric_dtype(dtype):
            df[col] = df[col].astype(object)
    elif isinstance(value, float) and pd.api.types.is_integer_dtype(dtype):
        # NaN / inf are floats and are not representable in an integer column.
        df[col] = df[col].astype("float64")
    # Positional write: never enlarges the frame and is independent of index labels.
    df.iloc[pos, df.columns.get_loc(col)] = value


def inject_errors_job_dataset(df_chunk, error_rate=None, seed=None):
    """
    Inject realistic bad values into a random subset of rows of the JOB dataset.

    One error type is drawn per selected row (NaN, inf, invalid category,
    negative hours, string in a numeric column, extreme outlier) and written
    into a single cell of that row. Rows that are not selected are left as-is.
    The input frame is never modified; a corrupted copy is returned.

    Parameters
    ----------
    df_chunk : pandas.DataFrame
        Rows to corrupt. Any index is supported; rows are selected by position.
    error_rate : float, optional
        Fraction of rows to corrupt, in [0, 1]. Defaults to ``ERROR_RATE``.
        At least one row is always selected when the frame is non-empty.
    seed : int, optional
        When given, a private ``random.Random(seed)`` is used so the result is
        reproducible. When omitted, the module-level ``random`` state is used.

    Returns
    -------
    (pandas.DataFrame, list of tuple)
        The corrupted copy and a log of ``(index_label, error_type, column,
        description)`` entries. A selected row produces no entry when the
        column chosen for it is absent from the frame.

    Raises
    ------
    ValueError
        If the effective error rate is outside [0, 1].
    """
    df = df_chunk.copy()
    n = len(df)

    if n == 0:
        return df, []

    rate = ERROR_RATE if error_rate is None else error_rate
    if not 0 <= rate <= 1:
        raise ValueError(f"error_rate must be within [0, 1], got {rate!r}")

    # The `random` module exposes the same sample/choice API as a Random instance.
    rng = random if seed is None else random.Random(seed)

    rows_to_corrupt = rng.sample(range(n), max(1, int(n * rate)))
    error_types = list(_ERROR_SPECS)
    errors_log = []

    for pos in rows_to_corrupt:
        error_type = rng.choice(error_types)
        columns, value, description = _ERROR_SPECS[error_type]

        # Only draw a column when there is a real choice to make.
        col = columns[0] if len(columns) == 1 else rng.choice(columns)
        if col not in df.columns:
            continue

        _write_cell(df, pos, col, value)
        # Log the index label (equal to the position for a default RangeIndex)
        # so the entry can be used directly with `.loc` on the caller's frames.
        errors_log.append((df.index[pos], error_type, col, description))

    return df, errors_log


In [5]:
# Seed the stdlib RNG first: the injector draws its rows and error types from
# the `random` module, so without a seed every run corrupts different rows and
# the tables shown below cannot be reproduced after Restart & Run All.
random.seed(42)

df_errors, errors_log = inject_errors_job_dataset(df)

print("Injected errors into", len(errors_log), "rows.")
pd.DataFrame(errors_log, columns=["RowIndex", "ErrorType", "Column", "Description"])


  df.loc[idx, "training_hours"] = "wrong_value"


Injected errors into 2873 rows.


Unnamed: 0,RowIndex,ErrorType,Column,Description
0,3588,bad_experience,relevent_experience,Bad experience category
1,4187,bad_gender,gender,Bad gender value
2,19142,nan_value,training_hours,Inserted NaN
3,2619,bad_gender,gender,Bad gender value
4,10119,inf_value,city_development_index,Inserted INF
...,...,...,...,...
2868,14100,inf_value,city_development_index,Inserted INF
2869,6441,huge_outlier,city_development_index,Extreme outlier
2870,13787,negative_hours,training_hours,Negative hours
2871,404,bad_gender,gender,Bad gender value


In [6]:
# Unique row labels that received an injected error, in ascending order.
corrupted_indices = sorted({entry[0] for entry in errors_log})

# Place the pristine and the corrupted version of each affected row side by
# side; column prefixes tell the two halves apart.
comparison = pd.concat(
    [
        frame.loc[corrupted_indices].add_prefix(prefix)
        for prefix, frame in (("original_", df_original), ("corrupted_", df_errors))
    ],
    axis="columns",
)

comparison.head(20)


Unnamed: 0,original_enrollee_id,original_city,original_city_development_index,original_gender,original_relevent_experience,original_enrolled_university,original_education_level,original_major_discipline,original_experience,original_company_size,...,corrupted_relevent_experience,corrupted_enrolled_university,corrupted_education_level,corrupted_major_discipline,corrupted_experience,corrupted_company_size,corrupted_company_type,corrupted_last_new_job,corrupted_training_hours,corrupted_target
6,28806,city_160,0.92,Male,Has relevent experience,no_enrollment,High School,,5,50-99,...,Has relevent experience,no_enrollment,High School,,5,50-99,Funded Startup,1,-10,0
19,11399,city_13,0.827,Female,Has relevent experience,no_enrollment,Graduate,Arts,4,,...,Has relevent experience,no_enrollment,Graduate,Arts,4,,,1,132.0,1
20,31972,city_159,0.843,Male,Has relevent experience,no_enrollment,Masters,STEM,11,100-500,...,Has relevent experience,no_enrollment,Masters,STEM,11,100-500,Pvt Ltd,1,wrong_value,0
38,8612,city_103,0.92,,No relevent experience,no_enrollment,Graduate,STEM,12,,...,No relevent experience,no_enrollment,Graduate,STEM,12,,,4,50.0,0
40,2547,city_114,0.926,Female,Has relevent experience,Full time course,Masters,STEM,16,1000-4999,...,Experience??,Full time course,Masters,STEM,16,1000-4999,Public Sector,2,14.0,0
56,25296,city_73,0.754,Male,Has relevent experience,Full time course,Graduate,STEM,2,50-99,...,Has relevent experience,Full time course,Graduate,STEM,2,50-99,Early Stage Startup,2,52.0,0
62,24690,city_41,0.827,,Has relevent experience,,Masters,STEM,13,<10,...,Has relevent experience,,Masters,STEM,13,<10,,1,15.0,0
63,8433,city_100,0.887,Male,Has relevent experience,no_enrollment,Masters,Humanities,>20,100-500,...,Has relevent experience,no_enrollment,Masters,Humanities,>20,100-500,Pvt Ltd,>4,inf,0
83,25413,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,5,Oct-49,...,Has relevent experience,no_enrollment,Graduate,STEM,5,Oct-49,,1,27.0,0
87,28751,city_103,0.92,,No relevent experience,Full time course,High School,,4,,...,No relevent experience,Full time course,High School,,4,,,3,75.0,0


In [7]:
# Preview the first 20 rows of the frame returned by the error injector.
df_errors.head(20)


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36.0,1
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47.0,0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83.0,0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52.0,1
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8.0,0
5,21651,city_176,0.764,,Has relevent experience,Part time course,Graduate,STEM,11,,,1,24.0,1
6,28806,city_160,0.92,Male,Has relevent experience,no_enrollment,High School,,5,50-99,Funded Startup,1,-10.0,0
7,402,city_46,0.762,Male,Has relevent experience,no_enrollment,Graduate,STEM,13,<10,Pvt Ltd,>4,18.0,1
8,27107,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,7,50-99,Pvt Ltd,1,46.0,1
9,699,city_103,0.92,,Has relevent experience,no_enrollment,Graduate,STEM,17,10000+,Pvt Ltd,>4,123.0,0
