🔎 Step 1 — Load & basic inspection

In [None]:
import pandas as pd
from pathlib import Path
%run ./hidden.ipynb

# Read the raw Sri Lanka job dataset.
# NOTE(review): `datasets_folder_path` is not defined in this cell — presumably it
# is provided by the `%run ./hidden.ipynb` line above; confirm.
df_sl = pd.read_csv(
    datasets_folder_path/"raw/IT_Job_Dataset_SriLanka_20000 (1).csv",
    low_memory=False,
)

# Structural overview: dimensions, column names, and per-column dtypes
print("Shape:", df_sl.shape)
print("Columns:", list(df_sl.columns))
print("\nData types:\n", df_sl.dtypes)

# Per-column null count and null fraction (rounded to 4 places), most-missing first
null_counts = df_sl.isna().sum()
missing_summary_sl = pd.DataFrame(
    {
        "num_missing": null_counts,
        "pct_missing": (null_counts / len(df_sl)).round(4),
    }
).sort_values("pct_missing", ascending=False)
print("\n=== Missingness summary ===")
print(missing_summary_sl.to_string())

# Rows that repeat an earlier row across every column
num_dups_full = df_sl.duplicated().sum()
print("\nFully duplicated rows:", num_dups_full)

Shape: (20000, 5)
Columns: ['Job Role', 'Job Title', 'Student Qualification', 'Student Skills', 'Job Description']

Data types:
 Job Role                 object
Job Title                object
Student Qualification    object
Student Skills           object
Job Description          object
dtype: object

=== Missingness summary ===
                       num_missing  pct_missing
Job Role                         0          0.0
Job Title                        0          0.0
Student Qualification            0          0.0
Student Skills                   0          0.0
Job Description                  0          0.0

Fully duplicated rows: 31


🔎 Step 2.1 — Unique value distributions

For useful categoricals in Sri Lanka dataset:
- Job Title
- Job Role

In [3]:
# Step 2.1 — Unique distributions
# For each categorical column: number of distinct values, the 20 most frequent
# values, and how many values occur exactly once.
target_cols = ["Job Title", "Job Role"]

for col in target_cols:
    if col not in df_sl.columns:
        # Report the gap (same style as the Step 2.2 check) instead of skipping silently
        print(f"[WARN] Column not found, skipping: {col}")
        continue
    raw = df_sl[col]
    # astype(str) would turn missing values into the literal string "nan", which
    # would then be counted as a real category; re-mask them so they stay missing
    # and the dropna= arguments below actually take effect.
    s = raw.astype(str).str.strip().mask(raw.isna())
    n_unique = s.nunique(dropna=True)
    print(f"\n=== {col} ===")
    print(f"Unique categories: {n_unique}")

    # dropna=False keeps a separate row for missing values, if there are any
    vc = s.value_counts(dropna=False)
    print("\nTop 20 frequencies:")
    print(vc.head(20).to_string())
    print(f"\nSingleton categories (appear once): {(vc == 1).sum()}")



=== Job Title ===
Unique categories: 25

Top 20 frequencies:
Job Title
Cloud Infrastructure Engineer    1053
AI Engineer                      1040
Information Security Analyst     1028
System Analyst                   1009
SOC Analyst                      1008
CI/CD Engineer                   1006
Help Desk Technician              989
Technical Support Engineer        986
IT Business Analyst               976
NLP Engineer                      970
Full Stack Engineer               726
Data Visualization Expert         721
Business Intelligence Analyst     693
Android Developer                 686
System Administrator              682
React Developer                   670
Flutter Developer                 659
UI Developer                      658
Frontend Developer                658
iOS Developer                     657

Singleton categories (appear once): 0

=== Job Role ===
Unique categories: 10

Top 20 frequencies:
Job Role
DevOps Engineer              2059
Data Analyst             

🔎 Step 2.2 — Duplicate check (signal columns)

Here, the most important “signal” columns are:
['Job Role', 'Job Title', 'Student Skills']

In [4]:
# Step 2.2 — Duplicates on signal columns
# Measures how many rows share the same (Job Role, Job Title, Student Skills)
# triplet and how many rows would remain after keeping one row per triplet.
signal_cols_sl = ["Job Role", "Job Title", "Student Skills"]

missing = [c for c in signal_cols_sl if c not in df_sl.columns]
if not missing:
    total_rows = len(df_sl)

    # keep=False flags every member of a duplicate group, including the first one
    dup_mask = df_sl.duplicated(subset=signal_cols_sl, keep=False)
    num_dup_rows = dup_mask.sum()

    # keep="first" flags only the repeats, i.e. the rows a dedup would remove
    repeat_mask = df_sl.duplicated(subset=signal_cols_sl, keep="first")
    collapsed_rows = (~repeat_mask).sum()
    collapsed_pct = round(100 * repeat_mask.sum() / total_rows, 2)

    print("\n=== Duplicate Analysis (Job Role, Job Title, Student Skills) ===")
    print(f"Total rows: {total_rows}")
    print(f"Duplicate rows (same triplet): {num_dup_rows}")
    print(f"Rows after dropping duplicates: {collapsed_rows}")
    print(f"Percent rows removed by collapsing: {collapsed_pct}%")
else:
    print(f"[WARN] Missing signal columns: {missing}")


=== Duplicate Analysis (Job Role, Job Title, Student Skills) ===
Total rows: 20000
Duplicate rows (same triplet): 334
Rows after dropping duplicates: 19831
Percent rows removed by collapsing: 0.84%


🔎 Step 2.3 — Text length sanity

Columns worth checking: Student Skills, Job Description.

In [5]:
# Step 2.3 — Text length stats
# Free-text columns whose string lengths are profiled by text_len_report below.
text_cols_sl = ["Student Skills", "Job Description"]

def text_len_report(frame, cols):
    """Summarise string lengths and emptiness for the given text columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing the columns to profile.
    cols : iterable of str
        Names of the columns to profile; each must exist in ``frame``.

    Returns
    -------
    pandas.DataFrame
        One row per column with ``num_rows``, ``num_missing``, ``num_empty``,
        ``min_len``, ``max_len``, ``mean_len``, ``median_len`` and ``pct_empty``,
        sorted by ``pct_empty`` descending. Missing values are treated as empty
        strings (length 0), so they count towards ``num_empty``/``pct_empty``.
        For a frame with zero rows all length and percentage stats are 0.
    """
    out = {}
    for col in cols:
        raw = frame[col]
        is_missing = raw.isna()
        # Blank out missing values AFTER the string cast: astype(str) alone turns
        # NaN/None into the literal "nan"/"None", which a later fillna("") cannot
        # undo and which would be measured as 3-4 character strings.
        s = raw.astype(str).mask(is_missing, "")
        lens = s.str.len()
        # Whitespace-only strings count as empty; strip() also covers the plain "" case
        is_empty = s.str.strip().eq("")
        has_rows = len(s) > 0  # min/max/mean of an empty Series are NaN; int(NaN) raises
        out[col] = {
            "num_rows": len(s),
            "num_missing": int(is_missing.sum()),
            "num_empty": int(is_empty.sum()),
            "min_len": int(lens.min()) if has_rows else 0,
            "max_len": int(lens.max()) if has_rows else 0,
            "mean_len": float(lens.mean()) if has_rows else 0.0,
            "median_len": float(lens.median()) if has_rows else 0.0,
            "pct_empty": round(100 * float(is_empty.mean()), 2) if has_rows else 0.0,
        }
    return pd.DataFrame(out).T.sort_values("pct_empty", ascending=False)

# Profile the selected text columns and print the table under its heading
text_stats_sl = text_len_report(frame=df_sl, cols=text_cols_sl)
print(
    "\n=== Text Column Length Stats (Sri Lanka) ===",
    text_stats_sl.to_string(),
    sep="\n",
)


=== Text Column Length Stats (Sri Lanka) ===
                 num_rows  num_missing  num_empty  min_len  max_len  mean_len  median_len  pct_empty
Student Skills    20000.0          0.0        0.0     12.0     58.0  31.75085        32.0        0.0
Job Description   20000.0          0.0        0.0     59.0     77.0  68.89020        70.0        0.0


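Saving Cleaned Files

Rows are treated as duplicates when they share the same Job Role, Job Title and Student Skills. Two CSV files are written: one with every row that belongs to a duplicate group, and one with the dataset after keeping only the first row of each group.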

In [7]:
import pandas as pd

# Columns that define a duplicate: two rows are duplicates when all three match
signal_cols_sl = ["Job Role", "Job Title", "Student Skills"]

# --- Detect duplicate rows (all members of duplicate groups) ---
# keep=False flags every row in a duplicate group, including its first occurrence,
# so this frame is for inspection and is NOT the set of rows removed below.
dup_mask_all = df_sl.duplicated(subset=signal_cols_sl, keep=False)
df_sl_dups = df_sl.loc[dup_mask_all].copy()

# --- Deduplicate (keep first occurrence) ---
df_sl_dedup = df_sl.drop_duplicates(subset=signal_cols_sl, keep="first").reset_index(drop=True)

# Invariant: no signal triplet may appear twice after deduplication
assert not df_sl_dedup.duplicated(subset=signal_cols_sl).any()

print("Original shape:", df_sl.shape)
print("Duplicates shape:", df_sl_dups.shape)
print("Deduped shape:", df_sl_dedup.shape)

# --- Save results ---
# NOTE(review): `datasets_folder_path` is not defined in this cell; it is assumed
# to be a pathlib.Path set up earlier in the notebook — confirm.
output_dir = datasets_folder_path/"prepared"
output_dir.mkdir(parents=True, exist_ok=True)

# Write through output_dir so the directory created above and the write targets
# cannot drift apart.
df_sl_dups.to_csv(output_dir/"srilanka_duplicates.csv", index=False)
df_sl_dedup.to_csv(output_dir/"srilanka_deduped.csv", index=False)

# Report the folder actually written to (previously printed 'processed')
print(f"Files saved in '{output_dir.name}' folder:")
print("- srilanka_duplicates.csv (all rows belonging to a duplicate group)")
print("- srilanka_deduped.csv (first occurrence of each signal triplet kept)")


Original shape: (20000, 5)
Duplicates shape: (334, 5)
Deduped shape: (19831, 5)
Files saved in 'processed' folder:
- srilanka_duplicates.csv (only duplicate rows)
- srilanka_deduped.csv (original minus duplicates)
