In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



# Read the housing table from the working directory and preview its first two rows.
df = pd.read_csv('housing.csv')
print(df.iloc[:2])

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# ---- 1. choose the feature to treat --------------------------------------
col = 'total_rooms'
# `train` is a one-column copy taken from the whole of `df`; no train/test
# split happens in this cell, so the cut-offs and scaler below are fitted on
# every row that `df` holds.
train = df.loc[:, [col]].copy()

# ---- 2. empirical cut-offs: the 10th and 90th percentiles ----------------
lo, hi = train[col].quantile(q=[0.10, 0.90])

# ---- 3. winsorize: pull values outside [lo, hi] back to the nearest bound -
train[col + '_w'] = train[col].clip(lower=lo, upper=hi)

# ---- 4. (optional) rescale the capped column to [-1, 1] ------------------
# The fitted `scaler` is kept at notebook level so later cells can reuse it.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(train[[col + '_w']])
train[col + '_scaled'] = scaler.transform(train[[col + '_w']])

print(train.head())

   total_rooms  total_rooms_w  total_rooms_scaled
0        880.0          941.0           -1.000000
1       7099.0         4651.5            1.000000
2       1467.0         1467.0           -0.716480
3       1274.0         1274.0           -0.820509
4       1627.0         1627.0           -0.630239


In [None]:
# ---- 5. reuse the same transform on any future data ----------------------
def apply_winsor_scale(new_df, column=None, lower=None, upper=None, fitted_scaler=None):
    """Winsorize one column of ``new_df`` and rescale it with a fitted scaler.

    Called with only ``new_df`` this uses the notebook-level ``col``, ``lo``,
    ``hi`` and ``scaler``. Passing the optional arguments makes the
    dependencies explicit, so the same function can be reused for another
    column, other cut-offs or another scaler without touching global state.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Frame containing the column to transform. It is not modified.
    column : str, optional
        Name of the column to transform. Defaults to the global ``col``.
    lower, upper : float, optional
        Clipping bounds. Default to the globals ``lo`` and ``hi``. ``None``
        always means "use the global", not "leave this side unbounded".
    fitted_scaler : object with a ``transform`` method, optional
        Already-fitted scaler applied to the clipped column. Defaults to the
        global ``scaler``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``new_df`` with two added columns: ``<column>_w`` (clipped
        values) and ``<column>_scaled`` (scaler output for the clipped values).

    Raises
    ------
    KeyError
        If ``column`` is not present in ``new_df``.
    NameError
        If an argument is omitted and the matching notebook-level variable
        has not been defined yet.
    """
    # Resolve each dependency lazily: a global is only read when the caller
    # did not supply the corresponding argument.
    column = col if column is None else column
    lower = lo if lower is None else lower
    upper = hi if upper is None else upper
    fitted_scaler = scaler if fitted_scaler is None else fitted_scaler

    # Work on a copy so the caller's frame is left untouched.
    result = new_df.copy()
    result[column + '_w'] = result[column].clip(lower, upper)
    # Pass a one-column DataFrame (double brackets): the scaler receives 2-D
    # input under the same column name that is built here.
    result[column + '_scaled'] = fitted_scaler.transform(result[[column + '_w']])
    return result

# Apply the stored cut-offs and scaler to the full dataset. Selecting with
# `col` instead of repeating the literal column name keeps this call in step
# with the column that `apply_winsor_scale` reads. No explicit copy is needed
# because the function copies its input before adding columns.
w_df = apply_winsor_scale(df[[col]])
print(w_df.head(5))

   total_rooms  total_rooms_w  total_rooms_scaled
0        880.0          941.0           -1.000000
1       7099.0         4651.5            1.000000
2       1467.0         1467.0           -0.716480
3       1274.0         1274.0           -0.820509
4       1627.0         1627.0           -0.630239
