In [1]:
# Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.



In [2]:
import pandas as pd
import numpy as np

# Sample DataFrame with missing values
data = {'category': ['A', 'B', 'A', 'B', 'A', 'B'],
        'value': [10, np.nan, 20, np.nan, 30, np.nan],
        'indicator': [True, False, True, False, False, True]}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Task: Fill missing 'value' with 0 if the corresponding 'indicator' is False

df['value'] = np.where(df['indicator'] == False, df['value'].fillna(0), df['value'])

print("\nDataFrame after conditional filling:")
print(df)

Original DataFrame:
  category  value  indicator
0        A   10.0       True
1        B    NaN      False
2        A   20.0       True
3        B    NaN      False
4        A   30.0      False
5        B    NaN       True

DataFrame after conditional filling:
  category  value  indicator
0        A   10.0       True
1        B    0.0      False
2        A   20.0       True
3        B    0.0      False
4        A   30.0      False
5        B    NaN       True


In [3]:
# Question 2: Removing Outliers by Rescaling
# Description: Remove outliers by standardizing a numerical column using z-scores.



In [4]:
import pandas as pd
import numpy as np
from scipy import stats

# Sample DataFrame with potential outliers
data = {'value': [20, 22, 25, 23, 21, 150, 24, 26, 22, -10]}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# 1. Calculate Z-scores
df['z_score'] = np.abs(stats.zscore(df['value']))

# 2. Identify Outliers based on a threshold (e.g., |z-score| > 3)
threshold = 2  # You can adjust this threshold
outliers = df[df['z_score'] > threshold]['value'].tolist()
print(f"\nOutliers (based on |z-score| > {threshold}): {outliers}")

# 3. Rescale the column (e.g., using standardization again, but now aware of outliers)
# One common approach is to winsorize or cap the outliers instead of direct removal for rescaling.
# However, the question asks for removal by rescaling using z-scores, so we'll demonstrate
# a way to create a new column where outliers have less extreme z-scores.

# Option 1: Create a new column with original values, but mark outliers
df['is_outlier'] = df['z_score'] > threshold

# Option 2: Replace outlier values with a boundary (winsorizing at the threshold)
df_winsorized = df.copy()
upper_bound = df['value'].mean() + threshold * df['value'].std()
lower_bound = df['value'].mean() - threshold * df['value'].std()
df_winsorized.loc[df_winsorized['value'] > upper_bound, 'value_winsorized'] = upper_bound
df_winsorized.loc[df_winsorized['value'] < lower_bound, 'value_winsorized'] = lower_bound
df_winsorized['z_score_winsorized'] = np.abs(stats.zscore(df_winsorized['value_winsorized']))

print("\nDataFrame with Z-scores and Outlier Identification:")
print(df)

print("\nDataFrame with Winsorized Values and their Z-scores:")
print(df_winsorized[['value', 'value_winsorized', 'z_score', 'z_score_winsorized']])

# Note: Directly "removing by rescaling using z-scores" is a bit ambiguous.
# Z-scores help identify outliers. To reduce their impact through rescaling,
# we often use techniques like winsorizing or robust scaling (e.g., using median and IQR).

# Demonstration of Robust Scaling (another way to reduce outlier impact):
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
df['value_robust_scaled'] = scaler.fit_transform(df[['value']])

print("\nDataFrame with Robust Scaled Values:")
print(df[['value', 'z_score', 'value_robust_scaled']])

Original DataFrame:
   value
0     20
1     22
2     25
3     23
4     21
5    150
6     24
7     26
8     22
9    -10

Outliers (based on |z-score| > 2): [150]

DataFrame with Z-scores and Outlier Identification:
   value   z_score  is_outlier
0     20  0.303893       False
1     22  0.254479       False
2     25  0.180359       False
3     23  0.229773       False
4     21  0.279186       False
5    150  2.907982        True
6     24  0.205066       False
7     26  0.155652       False
8     22  0.254479       False
9    -10  1.045095       False

DataFrame with Winsorized Values and their Z-scores:
   value  value_winsorized   z_score  z_score_winsorized
0     20               NaN  0.303893                 NaN
1     22               NaN  0.254479                 NaN
2     25               NaN  0.180359                 NaN
3     23               NaN  0.229773                 NaN
4     21               NaN  0.279186                 NaN
5    150        117.628385  2.907982             

In [5]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.



In [6]:
import pandas as pd
import numpy as np

# Sample DataFrame with missing age values and incorrect data type
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Age': ['25.5', np.nan, '30', '40.0', '22'],
        'City': ['New York', 'London', 'Paris', 'Tokyo', 'Sydney']}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)
print(f"\nData type of 'Age' column: {df['Age'].dtype}")

# 1. Fill Missing Values in 'Age'
# For demonstration, let's fill missing 'Age' with the mean (you might use a different strategy)
df['Age_float'] = pd.to_numeric(df['Age'], errors='coerce') # Convert to numeric first to calculate mean
mean_age = df['Age_float'].mean()
df['Age'].fillna(mean_age, inplace=True)

print("\nDataFrame after filling missing 'Age' values:")
print(df)
print(f"\nData type of 'Age' column after filling: {df['Age'].dtype}")

# 2. Convert 'Age' column to integers
# First, ensure the 'Age' column is of a numeric type (float or int)
df['Age'] = pd.to_numeric(df['Age'], errors='raise') # Raise error if conversion fails

# Now, convert to integer
df['Age'] = df['Age'].astype(int)

print("\nDataFrame after converting 'Age' column to integers:")
print(df)
print(f"\nData type of 'Age' column after conversion: {df['Age'].dtype}")

Original DataFrame:
      Name   Age      City
0    Alice  25.5  New York
1      Bob   NaN    London
2  Charlie    30     Paris
3    David  40.0     Tokyo
4      Eve    22    Sydney

Data type of 'Age' column: object

DataFrame after filling missing 'Age' values:
      Name     Age      City  Age_float
0    Alice    25.5  New York       25.5
1      Bob  29.375    London        NaN
2  Charlie      30     Paris       30.0
3    David    40.0     Tokyo       40.0
4      Eve      22    Sydney       22.0

Data type of 'Age' column after filling: object

DataFrame after converting 'Age' column to integers:
      Name  Age      City  Age_float
0    Alice   25  New York       25.5
1      Bob   29    London        NaN
2  Charlie   30     Paris       30.0
3    David   40     Tokyo       40.0
4      Eve   22    Sydney       22.0

Data type of 'Age' column after conversion: int64


In [7]:
# Question 4: Automating Data Cleaning with Functions
# Description: Create a function that automates the process of filling missing values, removing duplicates, and standardizing column names.



In [8]:
import pandas as pd
import numpy as np

def automate_data_cleaning(df, missing_value_strategy=None, columns_to_fill=None, fill_value=None,
                           columns_to_impute_mean=None, columns_to_impute_median=None,
                           remove_duplicate_subset=None, standardize_column_names=True):
    """
    Automates common data cleaning tasks: filling missing values, removing duplicates,
    and standardizing column names.

    Args:
        df (pd.DataFrame): The input DataFrame to be cleaned.
        missing_value_strategy (str, optional): Strategy for handling missing values.
            Options: 'fill_value', 'mean', 'median'. Defaults to None.
        columns_to_fill (list, optional): List of columns to fill with a specific value.
            Required if missing_value_strategy='fill_value'. Defaults to None.
        fill_value (any, optional): The value to fill missing values with.
            Required if missing_value_strategy='fill_value'. Defaults to None.
        columns_to_impute_mean (list, optional): List of columns to fill missing values
            with the mean. Required if missing_value_strategy='mean'. Defaults to None.
        columns_to_impute_median (list, optional): List of columns to fill missing values
            with the median. Required if missing_value_strategy='median'. Defaults to None.
        remove_duplicate_subset (list, optional): List of columns to consider when
            identifying duplicate rows. If None, all columns are considered.
            Defaults to None.
        standardize_column_names (bool, optional): Whether to standardize column names
            (lowercase and replace spaces with underscores). Defaults to True.

    Returns:
        pd.DataFrame: The cleaned DataFrame.
    """
    cleaned_df = df.copy()

    # Handle Missing Values
    if missing_value_strategy == 'fill_value':
        if columns_to_fill is None or fill_value is None:
            print("Warning: 'columns_to_fill' and 'fill_value' must be specified for 'fill_value' strategy.")
        else:
            for col in columns_to_fill:
                if col in cleaned_df.columns:
                    cleaned_df[col].fillna(fill_value, inplace=True)
                else:
                    print(f"Warning: Column '{col}' not found in DataFrame for fillna.")
    elif missing_value_strategy == 'mean':
        if columns_to_impute_mean is None:
            print("Warning: 'columns_to_impute_mean' must be specified for 'mean' imputation.")
        else:
            for col in columns_to_impute_mean:
                if col in cleaned_df.columns:
                    cleaned_df[col].fillna(cleaned_df[col].mean(), inplace=True)
                else:
                    print(f"Warning: Column '{col}' not found in DataFrame for mean imputation.")
    elif missing_value_strategy == 'median':
        if columns_to_impute_median is None:
            print("Warning: 'columns_to_impute_median' must be specified for 'median' imputation.")
        else:
            for col in columns_to_impute_median:
                if col in cleaned_df.columns:
                    cleaned_df[col].fillna(cleaned_df[col].median(), inplace=True)
                else:
                    print(f"Warning: Column '{col}' not found in DataFrame for median imputation.")
    elif missing_value_strategy is not None:
        print(f"Warning: Unknown missing_value_strategy: '{missing_value_strategy}'. Skipping missing value handling.")

    # Remove Duplicates
    cleaned_df.drop_duplicates(subset=remove_duplicate_subset, inplace=True)

    # Standardize Column Names
    if standardize_column_names:
        cleaned_df.columns = cleaned_df.columns.str.lower().str.replace(' ', '_')

    return cleaned_df

# Sample DataFrame with missing values and duplicates
data = {'Name': ['Alice', 'Bob', 'Charlie', 'Bob', 'David', 'Eve', 'Alice'],
        'Age': [25, np.nan, 30, np.nan, 40, 22, 25],
        'City': ['New York', 'London', 'Paris', 'London', 'Tokyo', 'Sydney', 'New York'],
        'Salary': [50000, 60000, np.nan, 60000, 70000, 45000, 50000],
        'Data Column': [1, 2, 3, 2, 4, 5, 1]}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Clean the DataFrame
cleaned_df = automate_data_cleaning(
    df.copy(),
    missing_value_strategy='mean',
    columns_to_impute_mean=['Age', 'Salary'],
    remove_duplicate_subset=['Name', 'City'],
    standardize_column_names=True
)

print("\nCleaned DataFrame:")
print(cleaned_df)

# Example with different missing value strategy
cleaned_df_fill = automate_data_cleaning(
    df.copy(),
    missing_value_strategy='fill_value',
    columns_to_fill=['Age', 'Salary'],
    fill_value=0,
    remove_duplicate_subset=['Name', 'City'],
    standardize_column_names=True
)

print("\nCleaned DataFrame (filled with 0):")
print(cleaned_df_fill)

# Example with median imputation
cleaned_df_median = automate_data_cleaning(
    df.copy(),
    missing_value_strategy='median',
    columns_to_impute_median=['Age', 'Salary'],
    remove_duplicate_subset=['Name', 'City'],
    standardize_column_names=True
)

print("\nCleaned DataFrame (filled with median):")
print(cleaned_df_median)

# Example without standardizing column names
cleaned_df_no_std_cols = automate_data_cleaning(
    df.copy(),
    missing_value_strategy='mean',
    columns_to_impute_mean=['Age', 'Salary'],
    remove_duplicate_subset=['Name', 'City'],
    standardize_column_names=False
)

print("\nCleaned DataFrame (without standardized column names):")
print(cleaned_df_no_std_cols)

Original DataFrame:
      Name   Age      City   Salary  Data Column
0    Alice  25.0  New York  50000.0            1
1      Bob   NaN    London  60000.0            2
2  Charlie  30.0     Paris      NaN            3
3      Bob   NaN    London  60000.0            2
4    David  40.0     Tokyo  70000.0            4
5      Eve  22.0    Sydney  45000.0            5
6    Alice  25.0  New York  50000.0            1

Cleaned DataFrame:
      name   age      city        salary  data_column
0    Alice  25.0  New York  50000.000000            1
1      Bob  28.4    London  60000.000000            2
2  Charlie  30.0     Paris  55833.333333            3
4    David  40.0     Tokyo  70000.000000            4
5      Eve  22.0    Sydney  45000.000000            5

Cleaned DataFrame (filled with 0):
      name   age      city   salary  data_column
0    Alice  25.0  New York  50000.0            1
1      Bob   0.0    London  60000.0            2
2  Charlie  30.0     Paris      0.0            3
4    David  

In [9]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.



In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Sample DataFrame with a numeric column to normalize
data = {'Product': ['A', 'B', 'C', 'D', 'E'],
        'Price': [100, 250, 150, 300, 200]}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# 1. Initialize the MinMaxScaler
scaler = MinMaxScaler()

# 2. Fit the scaler to the data and transform the column
# MinMaxScaler expects a 2D array, so we reshape the 'Price' column
df['Price_Normalized'] = scaler.fit_transform(df[['Price']])

print("\nDataFrame with Min-Max Normalized 'Price' column:")
print(df)

# To understand the scaling, let's look at the min and max of the original and normalized columns
print(f"\nMinimum Price (Original): {df['Price'].min()}")
print(f"Maximum Price (Original): {df['Price'].max()}")
print(f"Minimum Price (Normalized): {df['Price_Normalized'].min()}")
print(f"Maximum Price (Normalized): {df['Price_Normalized'].max()}")

# You can also specify a different target range if needed
target_range_scaler = MinMaxScaler(feature_range=(0, 10)) # Normalize to a range between 0 and 10
df['Price_Normalized_0_10'] = target_range_scaler.fit_transform(df[['Price']])

print("\nDataFrame with Min-Max Normalized 'Price' column (range 0-10):")
print(df)
print(f"\nMinimum Price (Normalized 0-10): {df['Price_Normalized_0_10'].min()}")
print(f"Maximum Price (Normalized 0-10): {df['Price_Normalized_0_10'].max()}")

Original DataFrame:
  Product  Price
0       A    100
1       B    250
2       C    150
3       D    300
4       E    200

DataFrame with Min-Max Normalized 'Price' column:
  Product  Price  Price_Normalized
0       A    100              0.00
1       B    250              0.75
2       C    150              0.25
3       D    300              1.00
4       E    200              0.50

Minimum Price (Original): 100
Maximum Price (Original): 300
Minimum Price (Normalized): 0.0
Maximum Price (Normalized): 1.0

DataFrame with Min-Max Normalized 'Price' column (range 0-10):
  Product  Price  Price_Normalized  Price_Normalized_0_10
0       A    100              0.00                    0.0
1       B    250              0.75                    7.5
2       C    150              0.25                    2.5
3       D    300              1.00                   10.0
4       E    200              0.50                    5.0

Minimum Price (Normalized 0-10): 0.0
Maximum Price (Normalized 0-10): 10.0
