In [12]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

# 1. Load and Initial Inspection
# Read the housing CSV into `df`; the steps below all derive from this frame.
df = pd.read_csv("/content/housing.csv")

print("--- Initial Inspection ---")
# First rows, to eyeball the column layout.
print("\ndf.head():", df.head(), sep="\n")
# info() writes its report to stdout itself, so it is not wrapped in print().
print("\ndf.info():")
df.info(verbose=True)
# Summary statistics as produced by describe() with its defaults.
print("\ndf.describe():", df.describe(), sep="\n")
# Number of missing values per column.
print("\ndf.isna().sum():", df.isna().sum(), sep="\n")
print("\ndf.shape:", df.shape)

# 2. Simple dropna
# Keep only the rows that have no missing value in any column; `df` itself
# is left untouched because dropna() returns a new frame.
new_df = df.dropna(axis=0, how="any")

print("\n--- Simple Dropna ---")
print("\nnew_df.head():", new_df.head(), sep="\n")
print("\nnew_df.shape:", new_df.shape)

# 3. SimpleImputer (Mean for numeric, Mode for categorical)
# Work on a copy so the raw frame `df` keeps its missing values.
df_imputed = df.copy()
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include='object').columns

# Numeric columns: learn each column's mean, then fill the gaps with it.
num_imputer = SimpleImputer(strategy='mean')
num_imputer.fit(df[num_cols])
df_imputed[num_cols] = num_imputer.transform(df[num_cols])

# Categorical columns: learn each column's most frequent value (mode),
# then fill the gaps with it.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(df[cat_cols])
df_imputed[cat_cols] = cat_imputer.transform(df[cat_cols])

# Every count printed here should be zero once both imputers have run.
print("\n--- Imputation Check ---")
print("\ndf_imputed.isna().sum():", df_imputed.isna().sum(), sep="\n")

# 4. IQR Outlier Detection and Removal
print("\n--- IQR Outlier Detection ---")
numeric_cols = df_imputed.select_dtypes(include=['int64', 'float64']).columns
# Row-wise flag: turns True once a row is an outlier in at least one column.
outlier_mask = pd.Series(False, index=df_imputed.index)

for col in numeric_cols:
    Q1 = df_imputed[col].quantile(0.25)
    Q3 = df_imputed[col].quantile(0.75)
    IQR = Q3 - Q1
    # A value is an outlier when it lies more than 1.5 * IQR outside the
    # first or third quartile.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Evaluate the condition once per column and reuse it for both the
    # combined mask and the count; no filtered DataFrame is built just to
    # measure its length.
    col_is_outlier = (df_imputed[col] < lower_bound) | (df_imputed[col] > upper_bound)
    outlier_mask = outlier_mask | col_is_outlier
    print(f"{col}: {int(col_is_outlier.sum())} outliers")

# Correct IQR Outlier Removal: remove rows that are outliers in *any* numeric column
# The surviving rows keep their original index labels (no reset here).
df_iqr_removed = df_imputed[~outlier_mask].copy()
print("Shape after IQR outlier removal (rows with any numeric outlier dropped):", df_iqr_removed.shape)

# 5. Categorical Column Check (using the corrected DataFrame df_iqr_removed)
# Object-dtype columns of the outlier-free frame; the encoding steps below
# iterate over this Index.
cat_cols_cleaned = df_iqr_removed.select_dtypes(include=['object']).columns

print("\n--- Categorical Columns in Cleaned Data ---")
print("Categorical columns:\n", cat_cols_cleaned.tolist())

# 6. Label Encoding
df_label_encoded = df_iqr_removed.copy()

# One fitted encoder per column. Re-fitting a single shared encoder would
# overwrite its learned classes on every iteration, leaving only the last
# column's label <-> integer mapping available afterwards.
label_encoders = {}
# `le` stays bound for any later code that refers to it: after the loop it is
# the encoder of the last categorical column, or this unfitted instance when
# there are no categorical columns.
le = LabelEncoder()

for col in cat_cols_cleaned:
    le = LabelEncoder()
    df_label_encoded[col] = le.fit_transform(df_label_encoded[col])
    label_encoders[col] = le

print("\n--- Label Encoding ---")
print("\ndf_label_encoded.head():")
print(df_label_encoded.head())

# 7. One-Hot Encoding
# Expand every categorical column into indicator columns; drop_first=True
# omits the first level of each column.
df_one_hot = pd.get_dummies(
    df_iqr_removed,
    columns=cat_cols_cleaned,
    drop_first=True,
)
# Convert boolean columns created by get_dummies to int (0 or 1)
bool_cols = df_one_hot.select_dtypes(include='bool').columns
df_one_hot = df_one_hot.astype({flag_col: int for flag_col in bool_cols})

print("\n--- One-Hot Encoding ---")
print("\nOne-hot encoded dataset shape:", df_one_hot.shape)
print("\nPreview of encoded data:", df_one_hot.head(), sep="\n")

# 8. Data Scaling (MinMax, Z-score, Decimal)
# Define numeric_df from the IQR-removed dataframe for scaling
numeric_df = df_iqr_removed.select_dtypes(include=['int64', 'float64'])

# The scalers return plain arrays, so the row labels must be re-attached
# explicitly. numeric_df still carries the gapped index left by the outlier
# removal; without index= the scaled frames would be renumbered 0..n-1 and
# would no longer line up row-for-row with df_iqr_removed on a join/concat.

# a. MinMax Scaling
min_max_scaler = MinMaxScaler()
min_max_scaled = pd.DataFrame(
    min_max_scaler.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index,
)

print("\n--- MinMax Scaling ---")
print("\nmin_max_scaled.head():")
print(min_max_scaled.head())

# b. Z-score Scaling
zscore_scaler = StandardScaler()
zscore_scaled = pd.DataFrame(
    zscore_scaler.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index,
)

print("\n--- Z-score Scaling ---")
print("\nzscore_scaled.head():")
print(zscore_scaled.head())

# c. Decimal Scaling
def decimal_scaling(df_to_scale):
    """Return a copy of *df_to_scale* with each column divided by a power of ten.

    For every column the divisor is ``10 ** j``, where ``j`` is the number of
    digits in the integer part of the column's largest absolute value (``j``
    is 0 when that value is below 1, so the column is divided by 1).

    Columns whose largest absolute value is missing or not finite (for
    example an empty or all-NaN column, or one containing infinity) are left
    unchanged, because no digit count can be derived for them.

    Args:
        df_to_scale: DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame with the same index and columns; the input is not
        modified.
    """
    scaled_df = df_to_scale.copy()
    for col in scaled_df.columns:
        # Use abs().max() to find the largest absolute value
        max_abs = scaled_df[col].abs().max()
        # int() below raises on NaN/inf, so skip columns without a usable
        # maximum instead of aborting the whole scaling run.
        if pd.isna(max_abs) or not np.isfinite(max_abs):
            continue
        # Find the number of digits (j)
        j = len(str(int(max_abs))) if max_abs >= 1 else 0
        scaled_df[col] = scaled_df[col] / (10 ** j)
    return scaled_df

# Apply decimal scaling to the same numeric frame used by the other scalers.
decimal_scaled = decimal_scaling(numeric_df)

print("\n--- Decimal Scaling ---")
print("\ndecimal_scaled.head():", decimal_scaled.head(), sep="\n")

--- Initial Inspection ---

df.head():
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

df.info():
<class 'pandas.core.fram