In [None]:
# Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.



In [None]:
# Question 2: Removing Outliers by Rescaling
# Description: Remove outliers by standardizing a numerical column to z-scores and dropping rows whose absolute z-score exceeds a threshold.



In [None]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.



In [None]:
# Question 4: Automating Data Cleaning with Functions
# Description: Create a function that automates the process of filling missing values, removing duplicates, and standardizing column names.



In [None]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to the [0, 1] range using min-max scaling.



In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Sample DataFrame for demonstration.
# 'Category', 'Value', 'Amount' and 'Age' each contain one missing entry so
# that every cleaning step below has something to act on.
data = {
    'Category': ['A', 'B', 'A', 'C', 'B', 'A', None, 'C'],
    'Value': [10, 20, np.nan, 40, 25, 15, 30, 45],
    'Amount': [100, 200, 150, 300, 220, 180, np.nan, 350],
    'Age': [25, 30, np.nan, 40, 35, 28, 45, 32.5],
    'Score': [0.8, 0.9, 0.75, 0.95, 0.85, 0.7, 1.0, 0.65]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

print("\n--- Question 1: Handling Missing Values with Conditional Filling ---")
# Fill a missing 'Value' with the mean 'Value' of the row's own 'Category'.
# transform('mean') returns one group mean per row, aligned on the index, so
# it can be handed straight to fillna().
category_mean_value = df.groupby('Category')['Value'].transform('mean')
df['Value_Filled'] = df['Value'].fillna(category_mean_value)
print("\nDataFrame after conditional filling of 'Value':")
print(df)

print("\n--- Question 2: Removing Outliers by Rescaling ---")
# Absolute z-score of 'Amount': distance from the mean in standard deviations.
Z_SCORE_THRESHOLD = 3  # rows further than this many standard deviations are outliers
amount_deviation = df['Amount'] - df['Amount'].mean()
df['Amount_ZScore'] = (amount_deviation / df['Amount'].std()).abs()
# A missing 'Amount' gives a NaN z-score, and `NaN <= threshold` is False.
# Keep those rows explicitly so missing data is not silently discarded as if
# it were an outlier.
is_not_outlier = df['Amount_ZScore'].isna() | (df['Amount_ZScore'] <= Z_SCORE_THRESHOLD)
df_no_outliers = df[is_not_outlier].copy()  # copy() so later column assignments do not touch df
print("\nDataFrame after removing 'Amount' outliers (based on z-score <= 3):")
print(df_no_outliers)

print("\n--- Question 3: Applying Data Type Conversion ---")
# Fill missing 'Age' with the mean, then convert to integer.
# round() comes before astype(int) because astype(int) truncates toward zero
# (a mean of 33.6 would otherwise become 33 instead of 34).
age_mean = df_no_outliers['Age'].mean()
df_no_outliers['Age_Filled'] = df_no_outliers['Age'].fillna(age_mean).round().astype(int)
print("\nDataFrame after filling missing 'Age' and converting to integer:")
print(df_no_outliers[['Age', 'Age_Filled']])

print("\n--- Question 4: Automating Data Cleaning with Functions ---")
def _standardize_label(label):
    """Lowercase a string column label and replace spaces with underscores; other labels pass through."""
    if isinstance(label, str):
        return label.lower().replace(' ', '_')
    return label


def clean_data(df):
    """
    Automates filling missing values, removing duplicates, and standardizing column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is copied first, so the caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which:
        * missing values in numeric columns are replaced by the column mean,
        * missing values in object columns are replaced by the column's first mode,
        * fully duplicated rows are dropped (checked after filling),
        * string column labels are lowercased with spaces replaced by underscores.
    """
    df_cleaned = df.copy()

    # Numeric columns: fill with the mean. For an all-missing column the mean
    # is NaN, so the fill is a no-op and the column stays missing.
    for col in df_cleaned.select_dtypes(include=np.number).columns:
        df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mean())

    # Object columns: fill with the most frequent value.
    # NOTE(review): this selects only object-dtype columns; columns stored with
    # a dedicated string dtype may not be picked up - confirm for the pandas
    # version in use.
    for col in df_cleaned.select_dtypes(include='object').columns:
        modes = df_cleaned[col].mode()
        if modes.empty:
            # No non-missing value exists, so there is nothing to fill with;
            # leave the column as is rather than calling fillna(None).
            continue
        df_cleaned[col] = df_cleaned[col].fillna(modes.iloc[0])

    # Remove duplicates
    df_cleaned = df_cleaned.drop_duplicates()

    # Standardize column names label by label so that non-string labels
    # (e.g. integer column names) do not break the cleaning step.
    df_cleaned = df_cleaned.rename(columns=_standardize_label)

    return df_cleaned

# Run the automated cleaning on the original columns only. By this point df
# also carries the helper columns added for Questions 1 and 2 ('Value_Filled',
# 'Amount_ZScore'); cleaning those as if they were raw data would, for example,
# mean-fill a z-score that is missing only because 'Amount' is missing.
raw_columns = list(data)
df_cleaned_automated = clean_data(df[raw_columns])
print("\nDataFrame after automated cleaning:")
print(df_cleaned_automated)

print("\n--- Question 5: Complex Data Normalization ---")
# Normalize 'Score' column using Min-Max scaling (the scaler's default output
# range is used).
scaler = MinMaxScaler()
# fit_transform takes a 2-D input (hence the double brackets) and returns an
# (n_rows, 1) array; flatten it to 1-D before storing it as a single column.
score_scaled = scaler.fit_transform(df_no_outliers[['Score']]).ravel()
df_no_outliers['Score_Normalized'] = score_scaled
print("\nDataFrame with 'Score' column normalized using Min-Max scaling:")
print(df_no_outliers[['Score', 'Score_Normalized']])

Original DataFrame:
  Category  Value  Amount   Age  Score
0        A   10.0   100.0  25.0   0.80
1        B   20.0   200.0  30.0   0.90
2        A    NaN   150.0   NaN   0.75
3        C   40.0   300.0  40.0   0.95
4        B   25.0   220.0  35.0   0.85
5        A   15.0   180.0  28.0   0.70
6     None   30.0     NaN  45.0   1.00
7        C   45.0   350.0  32.5   0.65

--- Question 1: Handling Missing Values with Conditional Filling ---

DataFrame after conditional filling of 'Value':
  Category  Value  Amount   Age  Score  Value_Filled
0        A   10.0   100.0  25.0   0.80          10.0
1        B   20.0   200.0  30.0   0.90          20.0
2        A    NaN   150.0   NaN   0.75          12.5
3        C   40.0   300.0  40.0   0.95          40.0
4        B   25.0   220.0  35.0   0.85          25.0
5        A   15.0   180.0  28.0   0.70          15.0
6     None   30.0     NaN  45.0   1.00          30.0
7        C   45.0   350.0  32.5   0.65          45.0

--- Question 2: Removing Outlier