In [1]:
import pandas as pd

# Load dataset
file_path = "D:/dm_assignment/datasets/error_dataset.csv"
df = pd.read_csv(file_path)

print("Original Dataset:")
print(df)

Original Dataset:
   ID     Name     Age   Salary Department
0   1    Alice      25  50000.0         HR
1   2      Bob  thirty  60000.0    Finance
2   3  Charlie      28      NaN         IT
3   4    David      42  75000.0    Finance
4   5      Eva      35 -45000.0         HR
5   6    Frank      29  65000.0         IT
6   7    Grace      28  62000.0    Finance
7   8     Hank     NaN  58000.0         HR
8   9      Ivy      31  68000.0  Marketing
9  10    Grace      28  62000.0    Finance


In [2]:
# Convert Age to numeric (invalid values become NaN)
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# Convert Salary to numeric (invalid values become NaN)
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

# Detect missing values
print("\nMissing Values per Column:")
print(df.isnull().sum())

# Detect negative salaries
print("\nNegative Salaries:")
print(df[df['Salary'] < 0])

# Detect duplicate rows
print("\nDuplicate Rows:")
print(df[df.duplicated()])


Missing Values per Column:
ID            0
Name          0
Age           2
Salary        1
Department    0
dtype: int64

Negative Salaries:
   ID Name   Age   Salary Department
4   5  Eva  35.0 -45000.0         HR

Duplicate Rows:
Empty DataFrame
Columns: [ID, Name, Age, Salary, Department]
Index: []


In [3]:
# Fill missing ages with mean
df['Age'].fillna(df['Age'].mean(), inplace=True)

# Replace negative salaries with absolute values
df['Salary'] = df['Salary'].apply(lambda x: abs(x) if pd.notnull(x) and x < 0 else x)

# Fill missing salaries with median
df['Salary'].fillna(df['Salary'].median(), inplace=True)

# Drop duplicate rows
df = df.drop_duplicates()

print("\nCleaned Dataset:")
print(df)


Cleaned Dataset:
   ID     Name    Age   Salary Department
0   1    Alice  25.00  50000.0         HR
1   2      Bob  30.75  60000.0    Finance
2   3  Charlie  28.00  62000.0         IT
3   4    David  42.00  75000.0    Finance
4   5      Eva  35.00  45000.0         HR
5   6    Frank  29.00  65000.0         IT
6   7    Grace  28.00  62000.0    Finance
7   8     Hank  30.75  58000.0         HR
8   9      Ivy  31.00  68000.0  Marketing
9  10    Grace  28.00  62000.0    Finance


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)
