In [None]:
import pandas as pd
import numpy as np

# Load the raw hiring dataset into a DataFrame. The path is relative, so this
# presumably expects the notebook to run from the project root — TODO confirm.
# The cleaning steps below read these columns: Age, Employed, HaveWorkedWith,
# EdLevel, Gender, PreviousSalary, YearsCode, YearsCodePro, Accessibility,
# MentalHealth and MainBranch.
df = pd.read_csv('datasets/bias_detection_in_hiring.csv')

# Data Cleaning Steps:

# 1. Handle Age Column
def process_age(age):
    """Encode an age-bracket label as a numeric category.

    Returns 1 for '<35' (under 35), 2 for '>35' (over 35) and NaN for any
    other value.
    """
    bracket_codes = (('<35', 1), ('>35', 2))
    for label, code in bracket_codes:
        if age == label:
            return code
    # Anything that is not one of the two known labels is treated as missing.
    return np.nan

# Encode every respondent's age bracket with process_age (1, 2 or NaN).
age_codes = df['Age'].apply(process_age)
df['Age_Category'] = age_codes

# 2. Process Employment Column
# Translate the 0/1 flag into a readable label; any other value maps to NaN.
employment_labels = {0: 'No', 1: 'Yes'}
df['Is_Employed'] = df['Employed'].map(employment_labels)

# 3. Split and Process Skills Columns
def process_skills(skills_string):
    """Split a semicolon-separated skills string into a list of skill names.

    Missing values (None/NaN) and blank strings yield an empty list. Each
    skill is stripped of surrounding whitespace, and empty fragments (from
    leading, trailing or doubled semicolons) are dropped so they are not
    counted as skills.
    """
    if pd.isna(skills_string):
        return []
    stripped = (fragment.strip() for fragment in skills_string.split(';'))
    # ''.split(';') returns [''], so filter out empty fragments explicitly.
    return [skill for skill in stripped if skill]

# Turn each row's semicolon-separated skills string into a list of skills.
skills_per_row = df['HaveWorkedWith'].apply(process_skills)
df['Skills_List'] = skills_per_row

# 4. Count Number of Skills
df['Skills_Count'] = skills_per_row.apply(len)

# 5. Create Categorical Columns
# Education level and gender are stored as pandas categoricals alongside the
# raw columns they are derived from.
for categorical_name, raw_name in (
    ('Education_Level', 'EdLevel'),
    ('Gender_Category', 'Gender'),
):
    df[categorical_name] = pd.Categorical(df[raw_name])

# 6. Convert Salary to Numeric
# 7. Years of Coding Experience Cleaning
# Values that cannot be parsed as numbers become NaN (errors='coerce').
for numeric_name, raw_name in (
    ('Previous_Salary', 'PreviousSalary'),
    ('Years_Coding', 'YearsCode'),
    ('Years_Professional_Coding', 'YearsCodePro'),
):
    df[numeric_name] = pd.to_numeric(df[raw_name], errors='coerce')

# 8. One-Hot Encode Categorical Variables
# The encoded frame is kept separately; df itself is left unchanged.
categorical_columns = ['Accessibility', 'MentalHealth', 'MainBranch']
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# 9. Summarize Skills
def most_common_skills(skills_list, top_n=10):
    """Return the most frequent skills across all rows.

    Args:
        skills_list: Iterable of per-row skill lists (for example the
            ``Skills_List`` column).
        top_n: Number of skills to keep; defaults to 10.

    Returns:
        A pandas Series indexed by skill name holding the number of
        occurrences, ordered from most to least frequent and truncated to
        at most ``top_n`` entries.
    """
    # Flatten the per-row lists into one list of skill occurrences.
    all_skills = [skill for sublist in skills_list for skill in sublist]
    # value_counts sorts by descending frequency, so head() keeps the top ones.
    return pd.Series(all_skills).value_counts().head(top_n)

top_skills = most_common_skills(df['Skills_List'])

# 10. Basic Descriptive Statistics
# Share of each employment label among rows that have a label (NaN excluded).
employment_shares = df['Is_Employed'].value_counts(normalize=True)

summary_stats = {
    'Total_Records': len(df),
    'Average_Salary': df['Previous_Salary'].mean(),
    'Average_Skills_Count': df['Skills_Count'].mean(),
    # .get() avoids a KeyError when no row is labelled 'Yes'.
    'Employed_Percentage': employment_shares.get('Yes', 0.0) * 100,
    # Plain ints keep the printed summary free of np.int64(...) wrappers.
    'Top_Skills': {skill: int(count) for skill, count in top_skills.items()},
}

# Print Summary
print("Dataset Cleaning Summary:")
for key, value in summary_stats.items():
    print(f"{key}: {value}")

# Optional: Save cleaned dataset
# NOTE(review): this writes df, not df_encoded, so the one-hot columns from
# step 8 are not part of the saved file, and the Skills_List column is written
# as the string form of each list — confirm this is intended.
df.to_csv('datasets/cleaned_dataset.csv', index=False)

Dataset Cleaning Summary:
Total_Records: 73462
Average_Salary: 67750.2606109281
Average_Skills_Count: 13.428221393373445
Employed_Percentage: 53.62228090713567
Top_Skills: {'JavaScript': np.int64(49347), 'Docker': np.int64(40224), 'HTML/CSS': np.int64(40207), 'SQL': np.int64(38361), 'Git': np.int64(35911), 'AWS': np.int64(31590), 'Python': np.int64(31159), 'PostgreSQL': np.int64(30102), 'MySQL': np.int64(29490), 'TypeScript': np.int64(27533)}
