In [1]:
# Title: Data Cleaning using Pandas
# Description: Check for missing values and handle them by imputing the median.

In [2]:
import seaborn as sns
import pandas as pd

# Load the Iris dataset (which has no missing values for demonstration purposes)
iris = sns.load_dataset('iris')

# Introduce some artificial missing values for demonstration
import numpy as np
np.random.seed(42)
indices = np.random.choice(iris.index, size=20, replace=False)
iris.loc[indices, 'sepal_length'] = np.nan
indices = np.random.choice(iris.index, size=15, replace=False)
iris.loc[indices, 'sepal_width'] = np.nan

# Check for missing values
print("Number of missing values before imputation:")
print(iris.isnull().sum())
print("\n" + "="*50 + "\n")

# Impute missing values in 'sepal_length' with the median of that column
median_sepal_length = iris['sepal_length'].median()
iris['sepal_length'].fillna(median_sepal_length, inplace=True)

# Impute missing values in 'sepal_width' with the median of that column
median_sepal_width = iris['sepal_width'].median()
iris['sepal_width'].fillna(median_sepal_width, inplace=True)

# Verify that missing values have been handled
print("Number of missing values after imputation:")
print(iris.isnull().sum())
print("\n" + "="*50 + "\n")

# Display the first few rows after imputation
print("First few rows of the Iris dataset after imputation:")
print(iris.head())

Number of missing values before imputation:
sepal_length    20
sepal_width     15
petal_length     0
petal_width      0
species          0
dtype: int64


Number of missing values after imputation:
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64


First few rows of the Iris dataset after imputation:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  iris['sepal_length'].fillna(median_sepal_length, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  iris['sepal_width'].fillna(median_sepal_width, inplace=True)
