1. Loading Dataset

In [2]:
import pandas as pd
from pathlib import Path
%run ./hidden.ipynb


# Load your dataset
df_d = pd.read_csv(datasets_folder_path/"raw/dice_com-job_us_sample.csv")
print("Dataset loaded successfully.")

# Display only column names
print("\nColumn list:")
print(list(df_d.columns), "\n")

# Quick preview
print(df_d.head())    

Dataset loaded successfully.

Column list:
['company', 'employmenttype_jobstatus', 'jobdescription', 'joblocation_address', 'jobtitle', 'postdate', 'skills'] 

           company                          employmenttype_jobstatus  \
0  JetBlue Airways                                         Full Time   
1  JetBlue Airways                                         Full Time   
2     Infosoft Inc  Full Time, Contract Independent, C2H Independent   
3     Infosoft Inc  Full Time, Contract Independent, C2H Independent   
4     Calsoft Labs                            Contract W2, 4+ Months   

                                      jobdescription joblocation_address  \
0  ______________________________________________...          Queens, NY   
1  ______________________________________________...          Queens, NY   
2  ______________________________________________...          Gilroy, CA   
3  ______________________________________________...            Any City   
4  _______________Pls share

2. Basic Inspection for Dataset

Step 2.0 Dataframe Shape & Missing Summary

In [3]:
# Shape of the DataFrame
print("Shape (rows, cols):", df_d.shape, "\n")

# Data types of each column
print("Data types:")
print(df_d.dtypes, "\n")

# Missingness summary (count + percent)
missing_summary = (
    df_d.isna()
      .sum()
      .to_frame(name="num_missing")
      .assign(pct_missing=lambda x: (x["num_missing"] / len(df_d)).round(4))
      .sort_values("pct_missing", ascending=False)
)
print("Missingness summary (top 12):")
print(missing_summary.head(12), "\n")

# Fully duplicate rows
print("Fully duplicated rows:", df_d.duplicated().sum())

Shape (rows, cols): (22000, 7) 

Data types:
company                     object
employmenttype_jobstatus    object
jobdescription              object
joblocation_address         object
jobtitle                    object
postdate                    object
skills                      object
dtype: object 

Missingness summary (top 12):
                          num_missing  pct_missing
employmenttype_jobstatus          230       0.0105
company                            50       0.0023
skills                             43       0.0020
joblocation_address                 3       0.0001
jobdescription                      0       0.0000
jobtitle                            0       0.0000
postdate                            0       0.0000 

Fully duplicated rows: 646


Step 2.1 — Unique values distribution (jobdescription, jobtitle, skills)

In [4]:
# STEP 2.1 — Unique value distributions for Dice.com dataset
target_cols = ["jobtitle", "employmenttype_jobstatus", "company"]

for col in target_cols:
    if col not in df_d.columns:
        print(f"[WARN] Column missing: {col}")
        continue
    
    s = df_d[col].astype(str).str.strip()
    n_unique = s.nunique(dropna=True)
    print(f"\n=== {col} ===")
    print(f"Unique categories: {n_unique}")
    
    # Frequency table (top 20)
    vc = s.value_counts(dropna=False)
    print("\nTop 20 frequencies:")
    print(vc.head(20).to_string())
    
    # Singleton categories
    singletons = (vc == 1).sum()
    print(f"\nSingleton categories (appear exactly once): {singletons}")


=== jobtitle ===
Unique categories: 15242

Top 20 frequencies:
jobtitle
Java Developer                                 174
Project Manager                                145
Network Engineer                               128
Software Engineer                              118
Business Analyst                               117
.Net Developer                                  70
DevOps Engineer                                 60
Systems Engineer                                55
Systems Administrator                           54
Web Developer                                   53
Senior Software Engineer                        51
Technical Writer                                49
Business Systems Analyst                        46
Software Developer                              45
Android Developer                               42
Senior Network Engineer                         42
Senior Java Developer                           41
Data Analyst                                    41
Robert Ha

Step 2.2 — Duplicates in “signal columns” (jobdescription, jobtitle, skills)

1. detect duplicate rows where all three (jobdescription, jobtitle, skills) match,
2. show a readable sample of duplicate groups, and
3. remove those duplicates from df_d.

0) Setup (load once)

In [5]:
import pandas as pd

df_d = pd.read_csv(datasets_folder_path / "raw" / "dice_com-job_us_sample.csv", low_memory=False)

signal_cols = ["jobdescription", "jobtitle", "skills"]

# Build normalized keys for duplicate detection (trim only; keep case to avoid over-merging)
key_df = df_d[signal_cols].astype(str).apply(lambda s: s.str.strip())
print(key_df.head())

                                      jobdescription  \
0  ______________________________________________...   
1  ______________________________________________...   
2  ______________________________________________...   
3  ______________________________________________...   
4  _______________Pls share resume at vinod.kumar...   

                                            jobtitle  \
0                                Lead Java Developer   
1                                Lead Java Developer   
2           Staffing - Business Development Managers   
3  Recruiters for US Staffing (Work from Home any...   
4                                    Program Analyst   

                                              skills  
0  Lead, RESTful, JSON, XML, SOAP web service, J2...  
1  Lead, RESTful, JSON, XML, SOAP web service, J2...  
2  Recruiter MSP VMS Staffing Sourcing Sourcer Re...  
3  Recruiter MSP VMS Staffing Sourcing Sourcer Re...  
4  AtTask or MS Excel (macros, charting and pivot..

1) Detect duplicates (counts + groups)

In [6]:
# Mark all members of duplicate groups (not just the second occurrence)
dup_mask_all = key_df.duplicated(keep=False)

total_rows = len(df_d)
dup_rows = int(dup_mask_all.sum())
collapsed_rows = total_rows - (key_df.duplicated(keep="first").sum())
collapsed_pct = round(100 * (total_rows - collapsed_rows) / total_rows, 2)

print("=== Duplicate Analysis (jobdescription, jobtitle, skills) ===")
print(f"Total rows: {total_rows}")
print(f"Duplicate rows (same triplet): {dup_rows}")
print(f"Rows after dropping duplicates: {collapsed_rows}")
print(f"Percent rows removed by collapsing: {collapsed_pct}%")

# All duplicate rows (for audit)
df_dups = df_d.loc[dup_mask_all].copy()
print(f"Total duplicate rows extracted: {len(df_dups)}")
print(df_dups.head(10))

=== Duplicate Analysis (jobdescription, jobtitle, skills) ===
Total rows: 22000
Duplicate rows (same triplet): 2694
Rows after dropping duplicates: 20594
Percent rows removed by collapsing: 6.39%
Total duplicate rows extracted: 2694
                      company                       employmenttype_jobstatus  \
0             JetBlue Airways                                      Full Time   
1             JetBlue Airways                                      Full Time   
20  Capital Markets Placement                           Full Time, Full Time   
21  Capital Markets Placement                           Full Time, Full Time   
37                  MSYS Inc.  Contract Corp-To-Corp, Contract W2, Long Term   
38                  MSYS Inc.  Contract Corp-To-Corp, Contract W2, Long Term   
56         TechLink Resources                          Contract W2, contract   
57         TechLink Resources                          Contract W2, contract   
64           Adroit Resources             Contr

2) Display a clean, readable sample of duplicate groups (no row inflation)

In [7]:
# Helper to truncate long text
def trunc(s, n=120):
    s = "" if pd.isna(s) else str(s)
    s = " ".join(s.split())  # collapse whitespace
    return (s[: n-1] + "…") if len(s) > n else s

# Which duplicate groups (by the triplet) are largest?
group_sizes = (
    df_dups.groupby(signal_cols, dropna=False)
           .size()
           .sort_values(ascending=False)
)
K = 5   # show top K duplicate groups
M = 3   # show up to M rows per group
top_keys = group_sizes.head(K).index

# Build a sample from those groups only (max M rows per group)
sample_list = []
for k in top_keys:
    cond = (key_df[signal_cols[0]].eq(k[0]) &
            key_df[signal_cols[1]].eq(k[1]) &
            key_df[signal_cols[2]].eq(k[2]))
    grp_rows = df_d.loc[cond].head(M).copy()
    sample_list.append(grp_rows)

sample = pd.concat(sample_list, ignore_index=True) if sample_list else pd.DataFrame(columns=df_d.columns)

# Tidy/truncate for display
for col in ["jobdescription", "skills"]:
    if col in sample.columns:
        sample[col] = sample[col].map(lambda x: trunc(x, 120))

display_cols = [c for c in [
    "jobtitle", "skills", "company", "employmenttype_jobstatus",
    "joblocation_address", "postdate", "jobdescription"
] if c in sample.columns]

print("=== Sample duplicate groups (formatted, truncated) ===\n")
print(sample[display_cols].to_string(index=False))
# How many times does each duplicate triplet occur?
dup_counts = (
    df_dups.groupby(signal_cols, dropna=False)
              .size()
                .reset_index(name="count")
                .sort_values("count", ascending=False)
                .reset_index(drop=True)
                .head(10)
)
dup_counts["jobdescription"] = dup_counts["jobdescription"].map(lambda x: trunc(x, 80))
dup_counts["skills"] = dup_counts["skills"].map(lambda x: trunc(x, 80))
dup_counts["jobtitle"] = dup_counts["jobtitle"].map(lambda x: trunc(x, 40))
dup_counts = dup_counts[["count"] + signal_cols]
# Show top 10 duplicate triplets by frequency
print("\n=== Duplicate triplet counts ===")
print(dup_counts.head(10))

=== Sample duplicate groups (formatted, truncated) ===

                                 jobtitle                                                                                                                   skills                                  company                                                                                                        employmenttype_jobstatus joblocation_address    postdate                                                                                                           jobdescription
                     SAP Ariba Consultant                        SAP Ariba Consultant, SAP Ariba Consulting, SAP Ariba Implementation, Senior SAP Ariba Consultant                               CCP Global Full Time, Contract Corp-To-Corp, Contract Independent, Contract W2, C2H Corp-To-Corp, C2H Independent, C2H W2, Part Time, Perm       Vancouver, BC  4 days ago Join the CCP Global team and experience the difference an innovative boutique Ariba consulting 

3) Remove duplicates (keep the first occurrence), no row increase

In [8]:
# Create a deduplicated dataframe (does not modify df_d unless you assign back)
df_d_dedup = df_d.drop_duplicates(subset=signal_cols, keep="first").reset_index(drop=True)

print("\n=== Deduped shape ===")
print(df_d_dedup.shape)


=== Deduped shape ===
(20598, 7)


Saving cleaned files    

In [12]:
import pandas as pd

# Load dataset
df_d = pd.read_csv(datasets_folder_path / "raw/dice_com-job_us_sample.csv", low_memory=False)

signal_cols = ["jobdescription", "jobtitle", "skills"]

# --- Detect duplicate rows ---
# All members of duplicate groups
dup_mask_all = df_d.duplicated(subset=signal_cols, keep=False)
df_dups = df_d.loc[dup_mask_all].copy()

# Deduplicated dataframe (keep first occurrence)
df_d_dedup = df_d.drop_duplicates(subset=signal_cols, keep="first").reset_index(drop=True)

print("Original shape:", df_d.shape)
print("Duplicates shape:", df_dups.shape)
print("Deduped shape:", df_d_dedup.shape)

# --- Save to files ---
df_dups.to_csv(datasets_folder_path/"prepared/dice_com_duplicates.csv", index=False)
df_d_dedup.to_csv(datasets_folder_path/"prepared/dice_com_deduped.csv", index=False)

print("Files saved in 'prepared' folder:")
print("- dice_com_duplicates.csv (only duplicate rows)")
print("- dice_com_deduped.csv (original minus duplicates)")

Original shape: (22000, 7)
Duplicates shape: (2686, 7)
Deduped shape: (20598, 7)
Files saved in 'prepared' folder:
- dice_com_duplicates.csv (only duplicate rows)
- dice_com_deduped.csv (original minus duplicates)


🔎 Step 2.3 — Textual column length sanity checks

Core text fields here: skills, jobdescription.

In [10]:
# STEP 2.3 — Text column length stats for Dice.com dataset
text_cols = ["skills", "jobdescription"]

def text_len_report(frame, cols):
    out = {}
    for col in cols:
        s = frame[col].astype(str).fillna("")
        lens = s.str.len()
        out[col] = {
            "num_rows": len(s),
            "num_missing": frame[col].isna().sum(),
            "num_empty": int((s.eq("") | s.str.strip().eq("")).sum()),
            "min_len": int(lens.min()),
            "max_len": int(lens.max()),
            "mean_len": float(lens.mean()),
            "median_len": float(lens.median()),
            "pct_empty": round(100 * (s.eq("") | s.str.strip().eq("")).mean(), 2),
        }
    return pd.DataFrame(out).T.sort_values("pct_empty", ascending=False)

text_stats_d = text_len_report(df_d, text_cols)
print("=== Text Column Length Stats (Dice.com) ===")
print(text_stats_d.to_string())


=== Text Column Length Stats (Dice.com) ===
                num_rows  num_missing  num_empty  min_len  max_len     mean_len  median_len  pct_empty
skills           22000.0         43.0        0.0      1.0   3544.0    74.692136        55.0        0.0
jobdescription   22000.0          0.0        0.0      4.0  24382.0  2295.370864      2012.5        0.0


🔎 Step 2.4 — Numeric sanity check (salary not present)

In [11]:
# STEP 2.4 — Numeric salary check (Dice.com)
if "Salary_Numeric" in df_d.columns:
    print("Salary_Numeric exists, run numeric stats like Morocco.")
else:
    print("No Salary_Numeric column in Dice.com dataset → skipping salary stats.")

No Salary_Numeric column in Dice.com dataset → skipping salary stats.
