In [None]:
# handling missing values
# data analysis with python
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the animal dataset; the relative path means the CSV must sit in the
# notebook's working directory before this cell is run.
df = pd.read_csv('animal_dataset.csv') # upload the CSV file first, then read it
# A bare name as the last line of a cell makes the notebook display it.
df

In [None]:
# report how many entries are missing in each column
# (isna() flags the missing cells, sum() counts the flags column by column)
print(df.isna().sum())

In [None]:
# remove every row that contains at least one missing value
# (returns a new DataFrame; df itself is left unchanged)
df.dropna(axis='index')

In [None]:
# remove every column that contains at least one missing value
# (returns a new DataFrame; df itself is left unchanged)
df.dropna(axis='columns')

In [None]:
# pick out the columns in which exactly one entry is missing ...
drop_columns = df.columns[df.isna().sum().eq(1)]
# ... and build a new frame without them, previewing its first rows
df_cleaned = df.drop(drop_columns, axis='columns')
df_cleaned.head()

Imputation: Filling Missing Data with Pandas
- Instead of dropping missing values, fill them in

In [None]:
def fill_missing_values_pandas(df):
    """Return a copy of ``df`` with missing values imputed using pandas.

    Each column is filled with its own strategy:

    - ``Age``: mean of the column
    - ``Weight_kg``: median of the column
    - ``Habitat``: most frequent value (mode) of the column
    - ``Endangered``: the boolean constant ``True``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Age``, ``Weight_kg``, ``Habitat`` and
        ``Endangered``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the gaps of those four columns are filled.

    Raises
    ------
    KeyError
        If one of the four columns is absent from ``df``.
    """
    # work on a copy so the caller's frame stays untouched
    df1 = df.copy()

    # numerical 'Age' column: fill gaps with the column mean
    df1['Age'] = df1['Age'].fillna(df1['Age'].mean())

    # numerical 'Weight_kg' column: fill gaps with the column median
    df1['Weight_kg'] = df1['Weight_kg'].fillna(df1['Weight_kg'].median())

    # categorical 'Habitat' column: fill gaps with the most frequent value.
    # mode() is empty when every entry is missing, so guard the lookup
    # instead of raising IndexError; the column is then left as it is.
    habitat_modes = df1['Habitat'].mode()
    if not habitat_modes.empty:
        df1['Habitat'] = df1['Habitat'].fillna(habitat_modes.iloc[0])

    # 'Endangered' column: fill gaps with the boolean True rather than the
    # string 'True', so the column does not end up mixing bools and strings
    # (assumes the column holds booleans -- TODO confirm against the CSV).
    df1['Endangered'] = df1['Endangered'].fillna(True)
    return df1

# Apply the pandas-based imputation to the loaded data; the returned copy is
# the cell's last expression, so the notebook displays it.
fill_missing_values_pandas(df)



In [None]:
def fill_missing_values_with_sklearn(df):
    """Return a copy of ``df`` with missing values imputed by scikit-learn.

    One ``SimpleImputer`` is fitted per column:

    - ``Age``: ``strategy='mean'``
    - ``Weight_kg``: ``strategy='median'``
    - ``Habitat``: ``strategy='most_frequent'``
    - ``Endangered``: ``strategy='constant'`` with ``fill_value=True``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Age``, ``Weight_kg``, ``Habitat`` and
        ``Endangered``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which those four columns hold the imputed values.
    """
    # work on a copy so the caller's frame stays untouched
    df2 = df.copy()

    # create imputers for different strategies
    mean_imputer = SimpleImputer(strategy='mean')
    median_imputer = SimpleImputer(strategy='median')
    mode_imputer = SimpleImputer(strategy='most_frequent')
    constant_imputer = SimpleImputer(strategy='constant', fill_value=True)

    # fill missing values using the appropriate imputer; the double brackets
    # pass a one-column DataFrame (2-D input) to fit_transform
    df2[['Age']] = mean_imputer.fit_transform(df[['Age']])
    df2[['Weight_kg']] = median_imputer.fit_transform(df[['Weight_kg']])
    df2[['Habitat']] = mode_imputer.fit_transform(df[['Habitat']])
    df2[['Endangered']] = constant_imputer.fit_transform(df[['Endangered']])
    return df2


# Backward-compatible alias: the function used to be defined under this
# misspelled name while the call below used the correct spelling, which
# raised NameError on a fresh kernel.
fill_missinf_values_with_sklearn = fill_missing_values_with_sklearn

fill_missing_values_with_sklearn(df)

Why Use .copy()?

- Avoid Unintended Changes: If you want to experiment with or modify the data without altering the original DataFrame, using .copy() ensures that the original data remains unchanged.

- Data Integrity: It helps maintain the integrity of your original data, especially when performing operations that might be destructive or irreversible.