# Data Cleaning in Pandas

In [1]:
import pandas as pd

**Loading the data into a DataFrame.**

In [2]:
# Read the raw dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv('day1_data.csv')
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,Name,Age,Salary,Department
0,John,25.0,50000.0,IT
1,,30.0,60000.0,HR
2,Alice,21.0,-45000.0,Finance
3,Bob,28.0,,IT
4,Eve,,55000.0,HR
5,Charlie,35.0,65000.0,Finance
6,Alice,31.0,30000.0,IT


**Defining Functions**

In [10]:
from pandas.api.types import is_numeric_dtype
def convert_numeric(df, cols):
    """Return a copy of ``df`` with the given columns coerced to a numeric dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left unmodified.
    cols : str or iterable of str
        Column label(s) to convert. A single label may be passed as a bare
        string.

    Returns
    -------
    pandas.DataFrame
        New frame in which every column named in ``cols`` has been passed
        through ``pd.to_numeric(..., errors='coerce')``; values that cannot be
        parsed as numbers become NaN.

    Raises
    ------
    KeyError
        If a label in ``cols`` is not a column of ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(cols, str):
        cols = [cols]

    # Work on a copy so that re-running the cell (or the pipeline) never
    # alters the frame the caller passed in.
    result = df.copy()
    for col in cols:
        result[col] = pd.to_numeric(result[col], errors='coerce')
    return result



def fill_missing(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    Numeric columns (as reported by ``is_numeric_dtype``) are filled with the
    column mean; every other column is filled with its most frequent value
    (the first entry of ``Series.mode()``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``df``. A non-numeric
        column that contains no values at all has no mode and is returned
        unchanged instead of raising.
    """
    # Work on a copy so the caller's frame keeps its original missing values.
    result = df.copy()
    for col in result.columns:
        series = result[col]
        if is_numeric_dtype(series):
            # numeric -> mean (NaNs are skipped when computing the mean)
            result[col] = series.fillna(series.mean())
        else:
            # categorical -> mode; mode() is empty when every value is missing,
            # in which case there is nothing sensible to fill with.
            modes = series.mode()
            if not modes.empty:
                result[col] = series.fillna(modes.iloc[0])
    return result


def remove_invalid_salary(df):
    """Keep only the rows whose 'Salary' value is strictly greater than zero.

    Rows with a zero, negative, or missing salary are dropped (NaN does not
    compare as greater than zero). The surviving rows keep their original
    index labels.
    """
    has_positive_salary = df['Salary'].gt(0)
    return df.loc[has_positive_salary]


**Functional Pipeline**

In [11]:
# Run the cleaning steps as a single chain. Starting from a copy guarantees
# the raw `df` displayed earlier is left unchanged, regardless of how the
# individual steps treat the frame they receive.
# NOTE(review): fill_missing runs before remove_invalid_salary, so the mean
# used to impute a missing Salary still includes the negative salary that is
# dropped afterwards — confirm this ordering is intended.
df_cleaned = (
    df.copy()
    .pipe(convert_numeric, ["Age", "Salary"])
    .pipe(fill_missing)
    .pipe(remove_invalid_salary)
)
df_cleaned

Unnamed: 0,Name,Age,Salary,Department
0,John,25.0,50000.0,IT
1,Alice,30.0,60000.0,HR
3,Bob,28.0,35833.333333,IT
4,Eve,28.333333,55000.0,HR
5,Charlie,35.0,65000.0,Finance
6,Alice,31.0,30000.0,IT
