# Data Inspection

- Exploring missing values
- Making sure the columns and datatypes are correct


In [10]:
import pandas as pd

# Load the dataset
data = pd.read_csv("bias_detection_in_hiring.csv")

# View the first few rows
print(data.head())

# Get a summary of the data
print(data.info())

# Check for missing values
print(data.isnull().sum())


   S.No  Age Accessibility        EdLevel  Employment Gender MentalHealth  \
0     0  <35            No         Master           1    Man           No   
1     1  <35            No  Undergraduate           1    Man           No   
2     2  <35            No         Master           1    Man           No   
3     3  <35            No  Undergraduate           1    Man           No   
4     4  >35            No            PhD           0    Man           No   

  MainBranch  YearsCode  YearsCodePro    Country  PreviousSalary  \
0        Dev          7             4     Sweden         51552.0   
1        Dev         12             5      Spain         46482.0   
2        Dev         15             6    Germany         77290.0   
3        Dev          9             6     Canada         46135.0   
4     NotDev         40            30  Singapore        160932.0   

                                      HaveWorkedWith  ComputerSkills  Employed  
0                          C++;Python;Git;Postg

In [6]:
# Fill missing numerical values with median
data['Age'] = data['Age'].fillna(data['Age'].median())
data['yearsofexperience'] = data['yearsofexperience'].fillna(data['yearsofexperience'].median())

# Fill missing categorical values with mode
data['sex'] = data['sex'].fillna(data['sex'].mode()[0])
data['skills'] = data['skills'].fillna(data['skills'].mode()[0])


TypeError: Cannot convert ['<35' '<35' '<35' ... '<35' '<35' '<35'] to numeric

The 'Age' column contains value ['<35', '>35'] so to solve this,
we'll replace those with median values and since we don't have enough information regarding this
We'll assume <35 to be 30 and >35 to be 40

In [16]:
# age columns are a problem so fixing that

# Check unique values in the Age column
print(data['Age'].unique())

# Replace '<35' with 34 and '>35' with 36
data['Age'] = data['Age'].replace({'<35': '30', '>35': '40'})


print(data['Age'].unique())

# converting to numeric
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

# fill missing values with median age
data['Age'] = data['Age'].fillna(data['Age'].median())

print(data.head())

[30 40]
[30 40]
   S.No  Age Accessibility        EdLevel  Employment Gender MentalHealth  \
0     0   30            No         Master           1    Man           No   
1     1   30            No  Undergraduate           1    Man           No   
2     2   30            No         Master           1    Man           No   
3     3   30            No  Undergraduate           1    Man           No   
4     4   40            No            PhD           0    Man           No   

  MainBranch  YearsCode  YearsCodePro    Country  PreviousSalary  \
0        Dev          7             4     Sweden         51552.0   
1        Dev         12             5      Spain         46482.0   
2        Dev         15             6    Germany         77290.0   
3        Dev          9             6     Canada         46135.0   
4     NotDev         40            30  Singapore        160932.0   

                                      HaveWorkedWith  ComputerSkills  Employed  
0                          C++;

Cleaning object columns - converting datatype 'man', 'woman' to m and f for male and female and n for non binary


In [20]:
# Check unique values
print(data['Gender'].unique())

# Replace inconsistent entries (e.g., 'male', 'Male', 'M' -> 'Male')
data['Gender'] = data['Gender'].str.lower().replace({
    'man': 'm',
    'woman': 'f',
    'nonbinary': 'n'
})

print(data.head())


['man' 'woman' 'nonbinary']
   S.No  Age Accessibility        EdLevel  Employment Gender MentalHealth  \
0     0   30            No         Master           1      m           No   
1     1   30            No  Undergraduate           1      m           No   
2     2   30            No         Master           1      m           No   
3     3   30            No  Undergraduate           1      m           No   
4     4   40            No            PhD           0      m           No   

  MainBranch  YearsCode  YearsCodePro    Country  PreviousSalary  \
0        Dev          7             4     Sweden         51552.0   
1        Dev         12             5      Spain         46482.0   
2        Dev         15             6    Germany         77290.0   
3        Dev          9             6     Canada         46135.0   
4     NotDev         40            30  Singapore        160932.0   

                                      HaveWorkedWith  ComputerSkills  Employed  
0                  

In [21]:
data.to_csv("cleaned_dataset.csv", index=False)
