In [8]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

# Load both raw datasets. The absolute paths expect the CSV files to sit at
# the filesystem root of the runtime.
adult_df = pd.read_csv("/adult.csv")
diabetes_df = pd.read_csv("/diabetes.csv")

print("=== Adult Dataset: Raw ===")
print(adult_df.head(), "\n")

# Missing entries in this file are written as a literal "?"; turn them into
# NaN so the imputation steps below can detect them.
adult_df.replace("?", np.nan, inplace=True)

# Separate numerical and categorical columns
adult_num_cols = adult_df.select_dtypes(include=np.number).columns
adult_cat_cols = adult_df.select_dtypes(include='object').columns

print("Adult Numerical Columns:", list(adult_num_cols))
print("Adult Categorical Columns:", list(adult_cat_cols), "\n")

# Fill numerical missing values with the per-column median
adult_df[adult_num_cols] = adult_df[adult_num_cols].fillna(adult_df[adult_num_cols].median())

# Fill categorical missing values with the per-column mode
# (.iloc[0] picks the first mode when a column has several)
adult_df[adult_cat_cols] = adult_df[adult_cat_cols].fillna(adult_df[adult_cat_cols].mode().iloc[0])

print("=== Adult Dataset After Missing Value Handling ===")
print(adult_df.head(), "\n")

# Outlier handling: clip every numerical column to the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
for col in adult_num_cols:
    Q1 = adult_df[col].quantile(0.25)
    Q3 = adult_df[col].quantile(0.75)
    IQR = Q3 - Q1

    # With a zero IQR both fences collapse onto Q1, so clipping would
    # overwrite the entire column with that single value (this happens for
    # columns dominated by one value). Leave such columns untouched.
    if IQR == 0:
        continue

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    adult_df[col] = np.clip(adult_df[col], lower, upper)

print("=== Adult Dataset After Outlier Clipping ===")
print(adult_df[adult_num_cols].describe(), "\n")

# Label Encoding: each categorical column is replaced by integer codes.
# The single encoder is refit per column, so after the loop it only holds the
# classes of the last column it saw.
le = LabelEncoder()
for col in adult_cat_cols:
    adult_df[col] = le.fit_transform(adult_df[col])

print("=== Adult Dataset After Label Encoding ===")
print(adult_df.head(), "\n")

# Scaling: both scalers are fit on the full frame, i.e. on the numerical
# columns and on the label-encoded categorical columns alike.
# NOTE(review): confirm that scaling the encoded columns is intended.
minmax_scaler = MinMaxScaler()
adult_minmax = minmax_scaler.fit_transform(adult_df)

standard_scaler = StandardScaler()
adult_standard = standard_scaler.fit_transform(adult_df)

print("Adult MinMax Scaled Shape:", adult_minmax.shape)
print("Adult MinMax Sample:\n", adult_minmax[:5], "\n")

print("Adult Standard Scaled Shape:", adult_standard.shape)
print("Adult Standard Sample:\n", adult_standard[:5], "\n")


# ===================== Diabetes =====================
# Same pipeline as for the Adult data: impute, clip outliers, label-encode,
# then scale. Reuses the encoder and scaler objects created earlier.

print("=== Diabetes Dataset: Raw ===")
print(diabetes_df.head(), "\n")

diabetes_num_cols = diabetes_df.select_dtypes(include=np.number).columns
diabetes_cat_cols = diabetes_df.select_dtypes(include='object').columns

print("Diabetes Numerical Columns:", list(diabetes_num_cols))
print("Diabetes Categorical Columns:", list(diabetes_cat_cols), "\n")

# Numerical -> per-column median
diabetes_df[diabetes_num_cols] = diabetes_df[diabetes_num_cols].fillna(
    diabetes_df[diabetes_num_cols].median()
)

# Categorical -> per-column mode.
# Guarded: when the frame has no object columns, mode() of the empty
# selection has no first row and `.iloc[0]` can fail, so skip the step.
if len(diabetes_cat_cols) > 0:
    diabetes_df[diabetes_cat_cols] = diabetes_df[diabetes_cat_cols].fillna(
        diabetes_df[diabetes_cat_cols].mode().iloc[0]
    )

print("=== Diabetes Dataset After Missing Value Handling ===")
print(diabetes_df.head(), "\n")

# Outlier handling: clip every numerical column to the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
for col in diabetes_num_cols:
    Q1 = diabetes_df[col].quantile(0.25)
    Q3 = diabetes_df[col].quantile(0.75)
    IQR = Q3 - Q1

    # A zero IQR makes both fences equal to Q1; clipping would then flatten
    # the whole column to one value, so such columns are left as they are.
    if IQR == 0:
        continue

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    diabetes_df[col] = np.clip(diabetes_df[col], lower, upper)

print("=== Diabetes Dataset After Outlier Clipping ===")
print(diabetes_df[diabetes_num_cols].describe(), "\n")

# Label Encoding (no-op when there are no categorical columns)
for col in diabetes_cat_cols:
    diabetes_df[col] = le.fit_transform(diabetes_df[col])

print("=== Diabetes Dataset After Label Encoding ===")
print(diabetes_df.head(), "\n")

# Scaling: fit_transform refits the shared scaler objects on this frame,
# replacing the statistics they learned from the Adult data.
diabetes_minmax = minmax_scaler.fit_transform(diabetes_df)
diabetes_standard = standard_scaler.fit_transform(diabetes_df)

print("Diabetes MinMax Scaled Shape:", diabetes_minmax.shape)
print("Diabetes MinMax Sample:\n", diabetes_minmax[:5], "\n")

print("Diabetes Standard Scaled Shape:", diabetes_standard.shape)
print("Diabetes Standard Sample:\n", diabetes_standard[:5])


=== Adult Dataset: Raw ===
   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-pe