# Data Inspection

- Exploring missing values
- Making sure the columns and datatypes are correct


In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv("bias_detection_in_hiring.csv")

# View the first few rows
print(data.head())

# Get a summary of the data.
# DataFrame.info() prints the summary itself and returns None, so it is
# called directly; wrapping it in print() would also emit a stray "None".
data.info()

# Check for missing values
print(data.isnull().sum())


   S.No  Age Accessibility        EdLevel  Employment Gender MentalHealth  \
0     0  <35            No         Master           1    Man           No   
1     1  <35            No  Undergraduate           1    Man           No   
2     2  <35            No         Master           1    Man           No   
3     3  <35            No  Undergraduate           1    Man           No   
4     4  >35            No            PhD           0    Man           No   

  MainBranch  YearsCode  YearsCodePro    Country  PreviousSalary  \
0        Dev          7             4     Sweden         51552.0   
1        Dev         12             5      Spain         46482.0   
2        Dev         15             6    Germany         77290.0   
3        Dev          9             6     Canada         46135.0   
4     NotDev         40            30  Singapore        160932.0   

                                              skills  ComputerSkills  Employed  
0                          C++;Python;Git;Postg

The 'Age' column contains only the bracket labels ['<35', '>35'] rather than numeric ages.
To make it numeric, we replace each label with a single representative value. Since we don't have any further information about the actual ages,
we'll assume '<35' to be 30 and '>35' to be 40.

In [2]:
# The Age column holds bracket labels instead of numbers, so convert it here

# Show the distinct labels before any conversion
print(data['Age'].unique())

# Stand-in values: the '<35' bracket becomes '30' and the '>35' bracket becomes '40'
age_bracket_values = {'<35': '30', '>35': '40'}
age_labels = data['Age'].replace(age_bracket_values)
data['Age'] = age_labels

print(data['Age'].unique())

# Cast to a numeric dtype; anything that cannot be parsed becomes NaN
age_numeric = pd.to_numeric(data['Age'], errors='coerce')

# Fill any NaN left by the cast with the median of the parsed ages
data['Age'] = age_numeric.fillna(age_numeric.median())

print(data.head())

['<35' '>35']
['30' '40']
   S.No  Age Accessibility        EdLevel  Employment Gender MentalHealth  \
0     0   30            No         Master           1    Man           No   
1     1   30            No  Undergraduate           1    Man           No   
2     2   30            No         Master           1    Man           No   
3     3   30            No  Undergraduate           1    Man           No   
4     4   40            No            PhD           0    Man           No   

  MainBranch  YearsCode  YearsCodePro    Country  PreviousSalary  \
0        Dev          7             4     Sweden         51552.0   
1        Dev         12             5      Spain         46482.0   
2        Dev         15             6    Germany         77290.0   
3        Dev          9             6     Canada         46135.0   
4     NotDev         40            30  Singapore        160932.0   

                                              skills  ComputerSkills  Employed  
0                    

Cleaning object columns - recoding the 'Gender' values 'Man', 'Woman' and 'NonBinary' to the short codes 'm', 'f' and 'n' respectively


In [3]:
# Show the distinct Gender labels before recoding
print(data['Gender'].unique())

# Lower-case the labels, then shorten each one to a single-letter code
gender_codes = {'man': 'm', 'woman': 'f', 'nonbinary': 'n'}
gender_lower = data['Gender'].str.lower()
data['Gender'] = gender_lower.replace(gender_codes)

print(data.head())


['Man' 'Woman' 'NonBinary']
   S.No  Age Accessibility        EdLevel  Employment Gender MentalHealth  \
0     0   30            No         Master           1      m           No   
1     1   30            No  Undergraduate           1      m           No   
2     2   30            No         Master           1      m           No   
3     3   30            No  Undergraduate           1      m           No   
4     4   40            No            PhD           0      m           No   

  MainBranch  YearsCode  YearsCodePro    Country  PreviousSalary  \
0        Dev          7             4     Sweden         51552.0   
1        Dev         12             5      Spain         46482.0   
2        Dev         15             6    Germany         77290.0   
3        Dev          9             6     Canada         46135.0   
4     NotDev         40            30  Singapore        160932.0   

                                              skills  ComputerSkills  Employed  
0                  

The 'skills' column packs a lot of values into one string separated by ';', so we'll split it into a list and add one indicator column per skill

In [4]:
# Split the skills column
def split_skills(skills_string):
    """Split a ';'-separated skills string into a list of skill names.

    Parameters
    ----------
    skills_string : str or missing value
        The raw cell value, e.g. "C++;Python;Git". Missing values
        (anything pd.isna() reports as missing) are accepted.

    Returns
    -------
    list of str
        The skill names with surrounding whitespace stripped. Missing
        input, an empty string, or a string made only of separators
        gives an empty list.
    """
    if pd.isna(skills_string):
        return []
    # Strip each piece and drop blanks, so that an empty string, a trailing
    # ';' or a doubled ';;' never produces an empty skill name.
    stripped = (piece.strip() for piece in skills_string.split(';'))
    return [skill for skill in stripped if skill]

# Create a new column with a list of skills
data['skills_list'] = data['skills'].apply(split_skills)

# Collect every distinct skill across the dataset, then sort once so the
# column order and the printed listing use the same sequence
all_skills = set()
for skills in data['skills_list']:
    all_skills.update(skills)
sorted_skills = sorted(all_skills)

# Build one boolean indicator Series per skill, keyed by its column name
skill_columns = {}
for skill in sorted_skills:
    # '.' and '/' are replaced with '_' in the column name
    safe_skill = skill.replace('.', '_').replace('/', '_')
    # Bind the current skill as a default argument so the lambda does not
    # depend on the loop variable's later values
    skill_columns[f'skill_{safe_skill}'] = data['skills_list'].apply(
        lambda x, skill=skill: skill in x
    )

# Add skill columns all at once using pd.concat. Indicator columns left over
# from an earlier run of this cell are dropped first, so re-running the cell
# does not append duplicate skill_* columns.
skill_data = pd.DataFrame(skill_columns)
data = pd.concat(
    [data.drop(columns=skill_data.columns, errors='ignore'), skill_data],
    axis=1,
)

# Print out some useful information
print(f"Total unique skills found: {len(all_skills)}")
print("Skills list:")
for skill in sorted_skills:
    print(f"- {skill}")

Total unique skills found: 116
Skills list:
- APL
- ASP.NET
- ASP.NET Core
- AWS
- Angular
- Angular.js
- Ansible
- Assembly
- Bash/Shell
- Blazor
- C
- C#
- C++
- COBOL
- Cassandra
- Chef
- Clojure
- Cloud Firestore
- Colocation
- CouchDB
- Couchbase
- Crystal
- Dart
- Delphi
- Deno
- DigitalOcean
- Django
- Docker
- Drupal
- DynamoDB
- Elasticsearch
- Elixir
- Erlang
- Express
- F#
- FastAPI
- Fastify
- Firebase
- Firebase Realtime Database
- Flask
- Flow
- Fortran
- Gatsby
- Git
- Go
- Google Cloud
- Google Cloud Platform
- Groovy
- HTML/CSS
- Haskell
- Heroku
- Homebrew
- IBM Cloud or Watson
- IBM DB2
- Java
- JavaScript
- Julia
- Kotlin
- Kubernetes
- LISP
- Laravel
- Linode
- Lua
- MATLAB
- Managed Hosting
- MariaDB
- Matlab
- Microsoft Azure
- Microsoft SQL Server
- MongoDB
- MySQL
- Neo4j
- Next.js
- Node.js
- Nuxt.js
- OCaml
- OVH
- Objective-C
- OpenStack
- Oracle
- Oracle Cloud Infrastructure
- PHP
- Perl
- Phoenix
- Play Framework
- PostgreSQL
- PowerShell
- Pulumi
- Puppet

Since we now have a 'skills_list' column, we will drop the original 'skills' column

In [6]:
# Drop the raw 'skills' text column. errors='ignore' keeps this cell safe to
# re-run after the column has already been removed (no KeyError).
data = data.drop(columns=['skills'], errors='ignore')

In [7]:
# Write the cleaned frame to CSV, leaving the row index out of the file
cleaned_csv_path = "cleaned_dataset.csv"
data.to_csv(cleaned_csv_path, index=False)
