In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
# Sample dataset with missing values
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
    "Age": [25, np.nan, 30, 35, np.nan],
    "Salary": [50000, 60000, 75000, np.nan, 90000],
    "City": ["New York", "Los Angeles", "New York", "Chicago", "Los Angeles"]
}

In [3]:
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Salary,City
0,Alice,25.0,50000.0,New York
1,Bob,,60000.0,Los Angeles
2,Charlie,30.0,75000.0,New York
3,David,35.0,,Chicago
4,Eve,,90000.0,Los Angeles


# 1️⃣ Handling Missing Data

In [4]:
df["Age"].fillna(df["Age"].mean(), inplace=True)  # Replace NaN with mean
df["Salary"].fillna(df["Salary"].median(), inplace=True)  # Replace NaN with median


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)  # Replace NaN with mean
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Salary"].fillna(df["Salary"].median(), inplace=True)  # Replace NaN with median


# 2️⃣ Removing Duplicates (if any)

In [5]:
df.drop_duplicates(inplace=True)

# 3️⃣ Feature Scaling

In [6]:
scaler = MinMaxScaler()

df[["Age", "Salary"]] = scaler.fit_transform(df[["Age", "Salary"]])

In [7]:
df

Unnamed: 0,Name,Age,Salary,City
0,Alice,0.0,0.0,New York
1,Bob,0.5,0.25,Los Angeles
2,Charlie,0.5,0.625,New York
3,David,1.0,0.4375,Chicago
4,Eve,0.5,1.0,Los Angeles


# 4️⃣ Encoding Categorical Data

In [9]:
encoder = LabelEncoder()

df["City"] = encoder.fit_transform(df["City"]) 

In [10]:
print("Processed Data:\n", df)

Processed Data:
       Name  Age  Salary  City
0    Alice  0.0  0.0000     2
1      Bob  0.5  0.2500     1
2  Charlie  0.5  0.6250     2
3    David  1.0  0.4375     0
4      Eve  0.5  1.0000     1
