In [1]:
import pandas as pd
import numpy as np

# Sample dataset
data = {
    'ID': [1, 2, 2, 3, 4, 5, 6, 7, 8, 9],
    'Name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eve', None, 'George', 'Hannah', 'Ian'],
    'Age': [25, 30, 30, np.nan, 45, 50, 22, 23, 120, -5],  # Outliers and missing value
    'Salary': [50000, 60000, 60000, 70000, None, 90000, 40000, 30000, 20000, 100000],  # Missing value
    'JoinDate': ['2022-01-15', '2021-06-20', '2021-06-20', '2020-03-10', '2019-12-05', None, '2023-05-18', '2022-07-21', '2021-09-30', '2018-11-11']
}

# Create a DataFrame
df = pd.DataFrame(data)

# 1. Handling Missing Values
df['Age'].fillna(df['Age'].median(), inplace=True)  # Replace NaN in Age with median
df['Salary'].fillna(df['Salary'].mean(), inplace=True)  # Replace NaN in Salary with mean
df['Name'].fillna('Unknown', inplace=True)  # Replace missing names with 'Unknown'

# 2. Removing Duplicates
df.drop_duplicates(inplace=True)

# 3. Converting Data Types
df['JoinDate'] = pd.to_datetime(df['JoinDate'], errors='coerce')  # Convert to datetime

# 4. Handling Outliers in Age (Assuming valid age range is 18-100)
df.loc[df['Age'] > 100, 'Age'] = df['Age'].median()
df.loc[df['Age'] < 18, 'Age'] = df['Age'].median()

# 5. Removing Negative or Invalid Salary
df = df[df['Salary'] > 0]

# Display cleaned data
print(df)

   ID     Name   Age         Salary   JoinDate
0   1    Alice  25.0   50000.000000 2022-01-15
1   2      Bob  30.0   60000.000000 2021-06-20
3   3  Charlie  30.0   70000.000000 2020-03-10
4   4    David  45.0   57777.777778 2019-12-05
5   5      Eve  50.0   90000.000000        NaT
6   6  Unknown  22.0   40000.000000 2023-05-18
7   7   George  23.0   30000.000000 2022-07-21
8   8   Hannah  30.0   20000.000000 2021-09-30
9   9      Ian  30.0  100000.000000 2018-11-11


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)  # Replace NaN in Age with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)  # Replace NaN in Salary with mean
The behavior will change in pandas 3.0. This inplace meth

Error:

> For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


In [2]:
import pandas as pd
import numpy as np

# Sample dataset
data = {
    'ID': [1, 2, 2, 3, 4, 5, 6, 7, 8, 9],
    'Name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eve', None, 'George', 'Hannah', 'Ian'],
    'Age': [25, 30, 30, np.nan, 45, 50, 22, 23, 120, -5],  # Outliers and missing value
    'Salary': [50000, 60000, 60000, 70000, None, 90000, 40000, 30000, 20000, 100000],  # Missing value
    'JoinDate': ['2022-01-15', '2021-06-20', '2021-06-20', '2020-03-10', '2019-12-05', None, '2023-05-18', '2022-07-21', '2021-09-30', '2018-11-11']
}

# Create a DataFrame
df = pd.DataFrame(data)

# 1. Handling Missing Values
df.fillna({'Age': df['Age'].median()}, inplace=True)  # Replace NaN in Age with median
df.fillna({'Salary': df['Salary'].mean()}, inplace=True)  # Replace NaN in Salary with mean
df.fillna({'Name': 'Unknown'}, inplace=True)  # Replace missing names with 'Unknown'

# 2. Removing Duplicates
df.drop_duplicates(inplace=True)

# 3. Converting Data Types
df['JoinDate'] = pd.to_datetime(df['JoinDate'], errors='coerce')  # Convert to datetime

# 4. Handling Outliers in Age (Assuming valid age range is 18-100)
df.loc[df['Age'] > 100, 'Age'] = df['Age'].median()
df.loc[df['Age'] < 18, 'Age'] = df['Age'].median()

# 5. Removing Negative or Invalid Salary
df = df[df['Salary'] > 0]

# Display cleaned data
print(df)

   ID     Name   Age         Salary   JoinDate
0   1    Alice  25.0   50000.000000 2022-01-15
1   2      Bob  30.0   60000.000000 2021-06-20
3   3  Charlie  30.0   70000.000000 2020-03-10
4   4    David  45.0   57777.777778 2019-12-05
5   5      Eve  50.0   90000.000000        NaT
6   6  Unknown  22.0   40000.000000 2023-05-18
7   7   George  23.0   30000.000000 2022-07-21
8   8   Hannah  30.0   20000.000000 2021-09-30
9   9      Ian  30.0  100000.000000 2018-11-11


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Name'].fillna('Unknown', inplace=True)  # Replace missing names with 'Unknown'
