In [None]:
# Cell 1: imports
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures


In [None]:
# Cell 2: read the raw (uncleaned) CSV into the working DataFrame
df = pd.read_csv(
    'Task4/messy_dataset.csv',  # supply your file
)
# Printed summary of the frame (dtypes, non-null counts) to spot gaps early.
df.info()
# Last expression of the cell: rendered as a preview of the first rows.
df.head()


In [None]:
# Cell 3: per-column count of missing values, largest first
missing = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
)
# Display only the columns that actually contain missing values.
missing.loc[missing.gt(0)]


In [None]:
# Cell 4: basic imputation
# Numeric columns get the column median; object (string) columns get the
# literal placeholder 'missing'. `num_cols` / `cat_cols` are reused by later cells.
num_cols = df.select_dtypes(include=np.number).columns.tolist()
cat_cols = df.select_dtypes(include='object').columns.tolist()

# A numeric column with no observed value has no median. SimpleImputer discards
# such columns at fit time, so its output would have fewer columns than
# `num_cols` and the assignment below would raise. Exclude them up front and
# say so, rather than failing with an opaque shape error.
empty_num_cols = [c for c in num_cols if df[c].isna().all()]
if empty_num_cols:
    print(f"Skipping all-NaN numeric columns (no median to impute): {empty_num_cols}")
    num_cols = [c for c in num_cols if c not in empty_num_cols]

# numeric: median
imp_num = SimpleImputer(strategy='median')
if num_cols:  # fit_transform rejects input with zero feature columns
    df[num_cols] = imp_num.fit_transform(df[num_cols])

# categorical: fill with 'missing'
if cat_cols:  # nothing to fill when the frame has no object columns
    df[cat_cols] = df[cat_cols].fillna('missing')


In [None]:
# Cell 5: outlier handling (IQR fences) for numeric columns
def remove_outliers_iqr(df, cols, factor=1.5):
    """Clip values lying outside the IQR fences of each listed column.

    Despite the name, no rows are removed: for every column in ``cols`` the
    values below ``q1 - factor * iqr`` or above ``q3 + factor * iqr`` are
    replaced by the corresponding fence, so the row count is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the work is done on a copy.
    cols : iterable
        Labels of the columns to process.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns clipped.
    """
    df2 = df.copy()
    for c in cols:
        q1 = df2[c].quantile(0.25)
        q3 = df2[c].quantile(0.75)
        iqr = q3 - q1
        # When q1 == q3 (e.g. a sparse 0/1 flag that is mostly zeros) both
        # fences collapse onto one value and clipping would overwrite the whole
        # column with that constant. Leave such columns untouched. The `not >`
        # form also skips a NaN IQR (column with no observed values).
        if not iqr > 0:
            continue
        low = q1 - factor * iqr
        high = q3 + factor * iqr
        # Clip rather than drop, so no rows are lost.
        df2[c] = df2[c].clip(low, high)
    return df2

# Apply the IQR treatment to every numeric column, using the default factor.
df = remove_outliers_iqr(df, cols=num_cols)


In [None]:
# Cell 6: feature engineering - degree-2 polynomial & interaction terms
X_num = df[num_cols]
poly = PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False,
)
X_poly = poly.fit_transform(X_num)
poly_feature_names = poly.get_feature_names_out(num_cols)
# Rebuild a labelled frame on the original row index so it lines up with `df`.
df_poly = pd.DataFrame(data=X_poly, index=df.index, columns=poly_feature_names)

# Put the categorical columns next to the expanded numeric features.
# No collinear/constant-column pruning happens here; drop such columns as needed.
df_model = pd.concat([df_poly, df[cat_cols]], axis='columns')
# Persist the result; the row index is not written to the file.
df_model.to_csv('Task4/cleaned_dataset.csv', index=False)
