In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import json
from tqdm.notebook import tqdm

In [2]:
with open("feature_dict.json", "r") as json_file:
    feature_dict = json.load(json_file)

In [3]:
df = pd.read_csv('train_prepared_for_cv.csv')

In [4]:
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 80, dtype: object

In [5]:
sum(df.isna().sum())

357

In [6]:
X = df.drop(columns=['Id', 'SalePrice'])
y = df['SalePrice']

In [7]:
numeric_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
numeric_cols.remove("MSSubClass")
categorical_cols = [col for col in X.columns if col not in numeric_cols]
ordinal_encode_cols = ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtExposure", "HeatingQC", "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC", "Street", "Alley", "PavedDrive"]
label_encode_cols = [col for col in categorical_cols if col not in ordinal_encode_cols]
ohe_encode_cols = label_encode_cols

In [8]:
print(numeric_cols)

['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


In [9]:
print(categorical_cols)

['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [10]:
print(ordinal_encode_cols)

['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'Street', 'Alley', 'PavedDrive']


In [11]:
print(label_encode_cols)

['MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtFinType1', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'GarageType', 'GarageFinish', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [12]:
print(len(categorical_cols))
print(len(ordinal_encode_cols), len(label_encode_cols), len(ohe_encode_cols))
print(len(ordinal_encode_cols) + len(label_encode_cols))
print(len(numeric_cols))
print(len(numeric_cols) + len(categorical_cols))

43
14 29 29
43
35
78


In [13]:
ordinal_mappings = {}

for col in ordinal_encode_cols:
    if col in X.columns and col in feature_dict:
        all_values = list(feature_dict[col]["values"].keys())  # Extract and preserve the order of values from the dict
        mapping = {category: rank for rank, category in enumerate(reversed(all_values), start=1)}  # Encode in the dict order
        mapping["No_item"] = -1
        if "NA" in mapping.keys():
            del mapping["NA"]
        ordinal_mappings[col] = mapping
        
def safe_map(value, mapping):
    if value not in mapping:
        raise ValueError(f"Value '{value}' not found in the mapping!")
    return mapping[value]

def ordinal_encode_column(X, mappings):
    encoded = X.copy()
    for col in X.columns:
        if col in mappings:
            encoded[col] = X.apply(lambda x: safe_map(x, mappings[col]))
    return encoded

print(ordinal_mappings)

{'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'No_item': -1}, 'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'No_item': -1}, 'BsmtQual': {'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6, 'No_item': -1}, 'BsmtCond': {'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6, 'No_item': -1}, 'BsmtExposure': {'No': 2, 'Mn': 3, 'Av': 4, 'Gd': 5, 'No_item': -1}, 'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'No_item': -1}, 'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'No_item': -1}, 'FireplaceQu': {'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6, 'No_item': -1}, 'GarageQual': {'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6, 'No_item': -1}, 'GarageCond': {'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6, 'No_item': -1}, 'PoolQC': {'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'No_item': -1}, 'Street': {'Grvl': 1, 'Pave': 2, 'No_item': -1}, 'Alley': {'Grvl': 2, 'Pave': 3, 'No_item': -1}, 'PavedDrive': {'N': 1, 'P': 2, 'Y': 3, 'No_item': -1}}


In [14]:
label_mappings = {}

for col in label_encode_cols:
    if col in X.columns and col in feature_dict:
        all_values = list(feature_dict[col]["values"].keys())  # Extract and preserve the order of values from the dict
        mapping = {category: rank for rank, category in enumerate(reversed(all_values), start=1)}  # Encode in the dict order
        mapping["No_item"] = -1
        if "NA" in mapping.keys():
            del mapping["NA"]
        label_mappings[col] = mapping
        
def safe_map(value, mapping):
    if value not in mapping:
        raise ValueError(f"Value '{value}' not found in the mapping!")
    return mapping[value]

def label_encode_column(X, mappings):
    encoded = X.copy()
    for col in X.columns:
        if col in mappings:
            encoded[col] = df[col].apply(lambda x: safe_map(x, mappings[col]))
    return encoded

print(label_mappings)

{'MSSubClass': {'190': 1, '180': 2, '160': 3, '150': 4, '120': 5, '90': 6, '85': 7, '80': 8, '75': 9, '70': 10, '60': 11, '50': 12, '45': 13, '40': 14, '30': 15, '20': 16, 'No_item': -1}, 'MSZoning': {'RM': 1, 'RP': 2, 'RL': 3, 'RH': 4, 'I': 5, 'FV': 6, 'C': 7, 'A': 8, 'No_item': -1}, 'LotShape': {'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg': 4, 'No_item': -1}, 'LandContour': {'Low': 1, 'HLS': 2, 'Bnk': 3, 'Lvl': 4, 'No_item': -1}, 'Utilities': {'ELO': 1, 'NoSeWa': 2, 'NoSewr': 3, 'AllPub': 4, 'No_item': -1}, 'LotConfig': {'FR3': 1, 'FR2': 2, 'CulDSac': 3, 'Corner': 4, 'Inside': 5, 'No_item': -1}, 'LandSlope': {'Sev': 1, 'Mod': 2, 'Gtl': 3, 'No_item': -1}, 'Neighborhood': {'Veenker': 1, 'Timber': 2, 'StoneBr': 3, 'Somerst': 4, 'SawyerW': 5, 'Sawyer': 6, 'SWISU': 7, 'OldTown': 8, 'NWAmes': 9, 'NridgHt': 10, 'NPkVill': 11, 'NoRidge': 12, 'Names': 13, 'Mitchel': 14, 'MeadowV': 15, 'IDOTRR': 16, 'Gilbert': 17, 'Edwards': 18, 'Crawfor': 19, 'CollgCr': 20, 'ClearCr': 21, 'BrkSide': 22, 'BrDale': 23, 

In [15]:
class DataFramePreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, numeric_cols, ordinal_cols, ohe_cols, label_cols, ordinal_mappings, label_mappings, scale_features_except_ohe=False):
        self.numeric_cols = numeric_cols
        self.ordinal_cols = ordinal_cols
        self.ohe_cols = ohe_cols
        self.label_cols = label_cols
        self.ordinal_mappings = ordinal_mappings
        self.label_mappings = label_mappings
        self.ohe_columns_names = None
        self.medians = {}
        self.modes = {}
        self.scaler = StandardScaler()
        self.scale_features_except_ohe = scale_features_except_ohe

    def fit(self, X, y=None):
        self.medians = X[self.numeric_cols].median()
        self.modes = X[self.ordinal_cols + self.ohe_cols + self.label_cols].mode().iloc[0]
        if self.ohe_cols:
            self.ohe_columns_names = []
            for col in self.ohe_cols:
                if col in feature_dict:
                    categories = feature_dict[col]["values"].keys()
                    for category in categories:
                        self.ohe_columns_names.append(f"{col}_{category}")
                else:
                    raise KeyError(f"Feature {col} not found in feature dictionary")
        return self

    def transform(self, X):
        X = X.copy()
        # filling NA
        for col in self.numeric_cols:
            X[col] = X[col].fillna(self.medians[col])
    
        if X.columns.duplicated().any():
                raise ValueError(f"Duplicate column names found: {X.columns[X.columns.duplicated()].tolist()}")

        for col in self.ordinal_cols + self.ohe_cols + self.label_cols:
            X[col] = X[col].fillna(self.modes[col])
        
        for col in self.ordinal_cols:
            if col in self.ordinal_mappings:
                X[col] = X[col].map(self.ordinal_mappings[col]).fillna(-1)
            else:
                raise KeyError
        
        for col in self.label_cols:
            if col in self.label_mappings:
                X[col] = X[col].map(self.label_mappings[col]).fillna(-1)
            else:
                raise KeyError
        
        if self.scale_features_except_ohe == True:
            X[self.label_cols + self.ordinal_cols +self.numeric_cols] = self.scaler.fit_transform(X[self.label_cols + self.ordinal_cols +self.numeric_cols])

        
        if self.ohe_cols:
            ohe_encoded = pd.DataFrame(0, index=X.index, columns=self.ohe_columns_names)
            for col in self.ohe_cols:
                if col in feature_dict:
                    categories = feature_dict[col]["values"].keys()
                    for category in categories:
                        ohe_col_name = f"{col}_{category}"
                        ohe_encoded[ohe_col_name] = (X[col] == category).astype(int)
                else:
                    raise KeyError(f"Feature {col} not found in feature dictionary")
            X = X.drop(columns=self.ohe_cols)
            X = pd.concat([X, ohe_encoded], axis=1)
            
        return X

In [16]:
preprocessor = DataFramePreprocessor(
    numeric_cols=numeric_cols,
    ordinal_cols=ordinal_encode_cols,
    ohe_cols=ohe_encode_cols,
    label_cols=[],
    ordinal_mappings=ordinal_mappings,
    label_mappings=label_mappings,
    # scale_features_except_ohe=True
)

In [17]:
# testing preprocessing
processed_data = preprocessor.fit_transform(X)

In [18]:
processed_data

Unnamed: 0,LotFrontage,LotArea,Street,Alley,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,SaleType_ConLw,SaleType_ConLI,SaleType_ConLD,SaleType_Oth,SaleCondition_Normal,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Partial
0,65.0,8450,2,-1,7,5,2003,2003,196.0,4,...,0,0,0,0,1,0,0,0,0,0
1,80.0,9600,2,-1,6,8,1976,1976,0.0,3,...,0,0,0,0,1,0,0,0,0,0
2,68.0,11250,2,-1,7,5,2001,2002,162.0,4,...,0,0,0,0,1,0,0,0,0,0
3,60.0,9550,2,-1,7,5,1915,1970,0.0,3,...,0,0,0,0,0,1,0,0,0,0
4,84.0,14260,2,-1,8,5,2000,2000,350.0,4,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,62.0,7917,2,-1,6,5,1999,2000,0.0,3,...,0,0,0,0,1,0,0,0,0,0
1452,85.0,13175,2,-1,6,6,1978,1988,119.0,3,...,0,0,0,0,1,0,0,0,0,0
1453,66.0,9042,2,-1,7,9,1941,2006,0.0,5,...,0,0,0,0,1,0,0,0,0,0
1454,68.0,9717,2,-1,5,6,1950,1996,0.0,3,...,0,0,0,0,1,0,0,0,0,0


Label encoding for cat features

In [190]:
preprocessor = DataFramePreprocessor(
    numeric_cols=numeric_cols,
    ordinal_cols=ordinal_encode_cols,
    ohe_cols=[],
    label_cols=label_encode_cols,
    ordinal_mappings=ordinal_mappings,
    label_mappings=label_mappings
)

In [21]:
X.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'Open

In [None]:
model = RandomForestRegressor(random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

r2 = {"train": [], "test": []}
mae = {"train": [], "test": []}
feature_importances = pd.DataFrame(index=X.columns)

for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    preprocessor.fit(X_train)
    X_train_preprocessed = preprocessor.transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)
    
    model.fit(X_train_preprocessed, y_train)
    
    y_train_pred = model.predict(X_train_preprocessed)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_r2 = model.score(X_train_preprocessed, y_train)
    
    y_test_pred = model.predict(X_test_preprocessed)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = model.score(X_test_preprocessed, y_test)
    
    r2["train"].append(train_r2)
    r2["test"].append(test_r2)
    mae["train"].append(train_mae)
    mae["test"].append(test_mae)

    
    fold_importances = pd.Series(model.feature_importances_, index=X.columns)
    feature_importances = pd.concat([feature_importances, fold_importances], axis=1)

feature_importances['mean_importance'] = feature_importances.mean(axis=1)
feature_importances = feature_importances[['mean_importance']].sort_values(by='mean_importance', ascending=False)

In [None]:
pd.DataFrame.from_dict(mae)

In [None]:
pd.DataFrame.from_dict(r2)

In [None]:
feature_importances.sort_values(by="mean_importance", ascending=False)

In [39]:
preprocessor_lasso_label = DataFramePreprocessor(
    numeric_cols=numeric_cols,
    ordinal_cols=ordinal_encode_cols,
    ohe_cols=[],
    label_cols=label_encode_cols,
    ordinal_mappings=ordinal_mappings,
    label_mappings=label_mappings,
    scale_features_except_ohe=True
)

In [26]:
alpha_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

kf = KFold(n_splits=9, shuffle=True, random_state=42)

r2 = {"train": [], "test": []}
mae = {"train": [], "test": []}
best_alphas = []
feature_importances = pd.DataFrame(index=X.columns)

for fold, (train_index, test_index) in tqdm(enumerate(kf.split(X), 1)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    preprocessor_lasso.fit(X_train)
    X_train_preprocessed = preprocessor_lasso.transform(X_train)
    X_test_preprocessed = preprocessor_lasso.transform(X_test)
    
    lasso = Lasso(random_state=42)
    grid_search = GridSearchCV(estimator=lasso, param_grid=alpha_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train_preprocessed, y_train)
    
    best_alpha = grid_search.best_params_['alpha']
    best_alphas.append(best_alpha)
    
    model = Lasso(alpha=best_alpha, random_state=42)
    model.fit(X_train_preprocessed, y_train)
    
    y_train_pred = model.predict(X_train_preprocessed)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_r2 = model.score(X_train_preprocessed, y_train)
    
    y_test_pred = model.predict(X_test_preprocessed)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = model.score(X_test_preprocessed, y_test)
    
    r2["train"].append(train_r2)
    r2["test"].append(test_r2)
    mae["train"].append(train_mae)
    mae["test"].append(test_mae)
    
    fold_importances = pd.Series(np.abs(model.coef_), index=X.columns)
    feature_importances = pd.concat([feature_importances, fold_importances], axis=1)

feature_importances['mean_importance'] = feature_importances.mean(axis=1)
feature_importances = feature_importances[['mean_importance']].sort_values(by='mean_importance', ascending=False)


0it [00:00, ?it/s]

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [31]:
best_alphas

[100, 100, 100, 100, 1000, 100, 100, 100, 100]

In [38]:
pd.DataFrame(r2).mean()

train    0.899974
test     0.874307
dtype: float64

In [35]:
pd.DataFrame(mae).mean()

train    17048.802976
test     19522.998889
dtype: float64

In [29]:
feature_importances

Unnamed: 0,mean_importance
GrLivArea,25617.443412
OverallQual,14329.834333
TotalBsmtSF,9172.804663
BsmtFinSF1,8669.111080
MasVnrArea,7712.261305
...,...
Heating,48.671948
GarageQual,39.777333
MiscFeature,21.619976
BsmtUnfSF,0.000000


In [None]:
# kind of works but clearly with this encoding does not capture the important categorical features such as neighborhood

In [40]:
preprocessor_lasso_ohe = DataFramePreprocessor(
    numeric_cols=numeric_cols,
    ordinal_cols=ordinal_encode_cols,
    ohe_cols=label_encode_cols,
    label_cols=[],
    ordinal_mappings=ordinal_mappings,
    label_mappings=label_mappings,
    scale_features_except_ohe=True
)

In [48]:
alpha_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

kf = KFold(n_splits=9, shuffle=True, random_state=42)

r2 = {"train": [], "test": []}
mae = {"train": [], "test": []}
best_alphas = []
feature_importances = pd.DataFrame(index=X.columns)

for fold, (train_index, test_index) in tqdm(enumerate(kf.split(X), 1)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    preprocessor_lasso_ohe.fit(X_train)
    X_train_preprocessed = preprocessor_lasso_ohe.transform(X_train)
    X_test_preprocessed = preprocessor_lasso_ohe.transform(X_test)
    
    lasso = Lasso(random_state=42)
    grid_search = GridSearchCV(estimator=lasso, param_grid=alpha_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train_preprocessed, y_train)
    
    best_alpha = grid_search.best_params_['alpha']
    best_alphas.append(best_alpha)
    
    model = Lasso(alpha=best_alpha, random_state=42)
    model.fit(X_train_preprocessed, y_train)
    
    y_train_pred = model.predict(X_train_preprocessed)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_r2 = model.score(X_train_preprocessed, y_train)
    
    y_test_pred = model.predict(X_test_preprocessed)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = model.score(X_test_preprocessed, y_test)
    
    r2["train"].append(train_r2)
    r2["test"].append(test_r2)
    mae["train"].append(train_mae)
    mae["test"].append(test_mae)
    
    fold_importances = pd.Series(np.abs(model.coef_), index=X_train_preprocessed.columns)
    feature_importances = pd.concat([feature_importances, fold_importances], axis=1)

feature_importances['mean_importance'] = feature_importances.mean(axis=1)
feature_importances = feature_importances[['mean_importance']].sort_values(by='mean_importance', ascending=False)

0it [00:00, ?it/s]

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [49]:
best_alphas

[100, 100, 100, 100, 100, 100, 100, 100, 100]

In [51]:
pd.DataFrame(r2).mean()

train    0.91981
test     0.89023
dtype: float64

In [50]:
print(pd.DataFrame(mae).mean())

train    15154.029586
test     18055.670719
dtype: float64


In [52]:
feature_importances

Unnamed: 0,mean_importance
Neighborhood_StoneBr,31102.281295
GrLivArea,24535.007980
Neighborhood_NridgHt,24360.619929
Neighborhood_Crawfor,16746.789738
Neighborhood_NoRidge,16164.901021
...,...
GarageFinish,
Fence,
MiscFeature,
SaleType,
