# PipeLine and Model Training

In [40]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()

        df = df.drop(columns=['Id', 'GarageConstructionYear', 'SaleMonth', 'FenceQuality', 'AdditionalFeatureValue', 'AdditionalFeature'])

        building_category_mapping = {60: 'A', 20: 'B', 70: 'C', 50: 'D', 190: 'E', 45: 'F', 90: 'G',
                                     120: 'H', 30: 'I', 85: 'J', 80: 'K', 160: 'L', 75: 'M', 180: 'N', 40: 'O'}
        df['BuildingCategory'] = df['BuildingCategory'].map(building_category_mapping)

        median_street_length = df.groupby('District')['StreetLineLength'].transform('median')
        df['StreetLineLength'] = df['StreetLineLength'].fillna(median_street_length)
        df['AlleyAccessType'] = df['AlleyAccessType'].fillna('None').astype('category')
        df['MasonrySize'] = df['MasonrySize'].fillna(0)
        df['MasonryType'] = np.where(df['MasonrySize'] == 0, 'Na', df['MasonryType'])

        basement_cols = ['BasementHeight', 'BasementCondition', 'BasementAccess', 'BasementFinish1',
                         'BasementFinishedArea1', 'BasementFinish2', 'BasementFinishedArea2', 'BasementUnfinishedArea']
        df.loc[df['TotalBasementArea'] == 0, basement_cols] = df.loc[df['TotalBasementArea'] == 0, basement_cols].fillna(
            {'BasementHeight': 'Na', 'BasementCondition': 'Na', 'BasementAccess': 'Na', 'BasementFinish1': 'Na',
             'BasementFinishedArea1': 0, 'BasementFinish2': 'Na', 'BasementFinishedArea2': 0, 'BasementUnfinishedArea': 0})

        garage_cols = ['GarageInterior', 'GarageLocation', 'GarageQuality', 'GarageCondition']
        df.loc[df['GarageSize'] == 0, garage_cols] = df.loc[df['GarageSize'] == 0, garage_cols].fillna(
            {'GarageInterior': 'Na', 'GarageLocation': 'Na', 'GarageQuality': 'Na', 'GarageCondition': 'Na'})

        df['PoolQuality'] = np.where(df['PoolSize'] == 0, 'Na', df['PoolQuality'])
        df['FireplaceQuality'] = np.where(df['FireplaceCount'] == 0, 'Na', df['FireplaceQuality'])
    

        df['TotalBasementBathrooms'] = df['BasementFullBathrooms'] + df['BasementHalfBathrooms']
        df = df.drop(columns=['BasementFullBathrooms', 'BasementHalfBathrooms'])

        df['TotalBathrooms'] = df['FullBathrooms'] + df['HalfBathrooms']
        df = df.drop(columns=['FullBathrooms', 'HalfBathrooms'])

        df['PropertyAge'] = df['SaleYear'] - df['ConstructionYear']
        df['RenovationTime'] = df['SaleYear'] - df['RenovationYear']
        df = df.drop(columns=['SaleYear', 'ConstructionYear', 'RenovationYear'])

        return df

categorical_features = [
    'BuildingCategory', 'ZoningClassification', 'RoadAccessType', 'AlleyAccessType', 'ParcelShape',
    'TerrainFlatness', 'UtilityAvailability', 'ParcelSettings', 'District', 'RoadProximity1',
    'RoadProximity2', 'DwellingType', 'DwellingStyle', 'RoofType', 'RoofMaterial', 'ExteriorCladding1',
    'ExteriorCladding2', 'MasonryType', 'ExteriorQuality', 'ExteriorCondition', 'FoundationType',
    'BasementHeight', 'BasementCondition', 'BasementAccess', 'BasementFinish1', 'BasementFinish2',
    'HeatingType', 'HeatingQuality', 'AirConditioning', 'ElectricalSystem', 'KitchenQuality',
    'FunctionalityRating', 'FireplaceQuality', 'GarageLocation', 'GarageInterior', 'GarageQuality',
    'GarageCondition', 'DrivewayType', 'PoolQuality', 'SaleType', 'SaleCondition'
]
ordinal_columns = [
    'TerrainSlope', 'KitchenQuality', 'ExteriorQuality', 'HeatingQuality',
    'FunctionalityRating', 'FireplaceQuality', 'ExteriorCondition',
    'BasementHeight', 'BasementCondition', 'GarageQuality', 'GarageCondition',
    'PoolQuality'
]

numerical_features = [
    'StreetLineLength', 'ParcelSize', 'MaterialQuality', 'ConditionRating', 'RenovationTime',
    'MasonrySize', 'BasementFinishedArea1', 'BasementFinishedArea2', 'BasementUnfinishedArea',
    'TotalBasementArea', 'GroundFloorArea', 'UpperFloorArea', 'LowQualityArea', 'LivingArea',
    'TotalBasementBathrooms', 'TotalBathrooms', 'BedroomAbvGr', 'KitchenAbvGr', 'TotalRooms',
    'FireplaceCount', 'GarageCapacity', 'GarageSize', 'WoodDeckArea', 'OpenPorchArea', 'EnclosedPorchArea',
    'ThreeSeasonPorchArea', 'ScreenPorchArea', 'PoolSize', 'PropertyAge'
]

ordinal_mapping = {
    'TerrainSlope': ['Sev', 'Mod', 'Gtl'],
    'KitchenQuality': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'ExteriorQuality': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'HeatingQuality': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'FunctionalityRating': ['Sev', 'Maj2', 'Maj1', 'Mod', 'Min1', 'Min2', 'Typ'],
    'FireplaceQuality': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'ExteriorCondition': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BasementHeight': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BasementCondition': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageQuality': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageCondition': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'PoolQuality': ['Na', 'Fa', 'Gd', 'Ex'],
}


# Create imputers for categorical and numerical columns
categorical_imputer = SimpleImputer(strategy='most_frequent')
numerical_imputer = SimpleImputer(strategy='median')

# Create pipelines for each type of feature
numerical_pipeline = Pipeline([
    ('imputer', numerical_imputer),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', categorical_imputer),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

ordinal_pipeline = Pipeline([
    ('imputer', categorical_imputer),
    ('encoder', OrdinalEncoder(categories=[ordinal_mapping[col] for col in ordinal_columns]))
])

# ColumnTransformer to apply the transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, [col for col in categorical_features if col not in ordinal_columns]),
        ('ord', ordinal_pipeline, ordinal_columns)
    ])

# Full pipeline
pipeline = Pipeline(steps=[
    ('custom', CustomTransformer()),
    ('preprocessor', preprocessor)
])

In [41]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score

X = df.drop('OutcomeVariable', axis=1)
y = df['OutcomeVariable'].copy()

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Process train and test sets separately
X_train_processed = pipeline.fit_transform(X_train)
X_test_processed = pipeline.transform(X_test)

In [42]:
X_processed = pipeline.fit_transform(X)

## LinearRegression

In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Linear Regression
base_model = LinearRegression()
base_model.fit(X_train_processed, y_train)
y_pred_base = base_model.predict(X_test_processed)
print("Base Model R2 Score:", r2_score(y_test, y_pred_base))
print("Base Model RMSE Score:", mean_squared_error(y_test, y_pred_base, squared=False))

Base Model R2 Score: 0.8837259211174341
Base Model RMSE Score: 29864.027097320424


## Ridge

In [None]:
from sklearn.linear_model import Ridge

# Train Ridge Regression
r = Ridge()
r.fit(X_train_processed, y_train)
y_pred = r.predict(X_test_processed)
current_r2_score = r2_score(y_test, y_pred)

print("Ridge R2 Score:", current_r2_score)

Ridge R2 Score: 0.8831355558883749


## Lasso

In [None]:
from sklearn.linear_model import Lasso

# Train Lasso Regression
l = Lasso()
l.fit(X_train_processed, y_train)
y_pred = r.predict(X_test_processed)
current_r2_score = r2_score(y_test, y_pred)

print("Lasso R2 Score:", current_r2_score)

Lasso R2 Score: 0.8831355558883749


  model = cd_fast.sparse_enet_coordinate_descent(


## DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Initialize the RandomForestRegressor
tree_reg = DecisionTreeRegressor(random_state= 42)

# Fit the model to the training data
tree_reg.fit(X_train_processed, y_train)

# Predict the target variable on the test set
y_pred = tree_reg.predict(X_test_processed)

# Compute RMSE and R2 scores
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print("Random Forest RMSE:", rmse)
print("Random Forest R2 Score:", r2)

Random Forest RMSE: 34185.51046095447
Random Forest R2 Score: 0.8476402314700777


## RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
forest_reg.fit(X_train_processed, y_train)

# Predict the target variable on the test set
y_pred = forest_reg.predict(X_test_processed)

# Compute RMSE and R2 scores
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print("Random Forest RMSE:", rmse)
print("Random Forest R2 Score:", r2)

Random Forest RMSE: 28311.93151457296
Random Forest R2 Score: 0.8954978648320329


## RandomForestRegressor w CrossVal

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Initialize the RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)

# Perform cross-validation with R-squared scoring
forest_scores = cross_val_score(forest_reg, X_processed, y, scoring='r2', cv=10)

# Print the scores
print("Cross-validated R-squared scores:", forest_scores)
print("Mean R-squared:", forest_scores.mean())
print("Standard deviation of R-squared:", forest_scores.std())

Cross-validated R-squared scores: [0.86403803 0.89211303 0.92036809 0.77215807 0.88166662 0.88707923
 0.88732046 0.88372802 0.80909046 0.85875455]
Mean R-squared: 0.865631656695653
Standard deviation of R-squared: 0.04146853400660071


## SVR

In [None]:
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline

# Parameters provided
params = {
    'svr__C': 100,
    'svr__degree': 2,
    'svr__epsilon': 1,
    'svr__gamma': 'scale',
    'svr__kernel': 'linear'
}

# Create the SVR model
svr = SVR(C=params['svr__C'],
          degree=params['svr__degree'],
          epsilon=params['svr__epsilon'],
          gamma=params['svr__gamma'],
          kernel=params['svr__kernel'])

# Alternatively, if not using a pipeline, just use the SVR instance directly
svr.fit(X_train_processed, y_train)


# Make predictions
y_pred = svr.predict(X_test_processed)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate R² score
r2 = r2_score(y_test, y_pred)

# Print the scores
print(f"RMSE: {rmse}")
print(f"R²: {r2}")


RMSE: 35450.152820756455
R²: 0.8361590757590672


## GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Train Gradient Boosting Regression
model = GradientBoostingRegressor()
model.fit(X_train_processed, y_train)
y_pred = model.predict(X_test_processed)
current_r2_score = r2_score(y_test, y_pred)

# Compute RMSE and R2 scores
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print("Best Model RMSE:", rmse)
print("Best Model R2 Score:", r2)

Best Model RMSE: 26400.46011609426
Best Model R2 Score: 0.9091323790520921


## xgboost with HP Tuning

In [48]:
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Define the search spaces for hyperparameters
search_spaces = {
    'max_depth': (3, 15),
    'n_estimators': (50, 500),
    'learning_rate': (0.01, 0.5, 'log-uniform'),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0)
}

# Initialize the XGBoost regressor
xgb_reg = XGBRegressor(random_state=42)

# Perform Bayesian hyperparameter optimization
bayes_search = BayesSearchCV(xgb_reg, search_spaces, n_iter=100, cv=None, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
bayes_search.fit(X_train_processed, y_train)

# Get the best model from the search
best_xgb_reg = bayes_search.best_estimator_

# Predict on the test set using the best model
y_pred = best_xgb_reg.predict(X_test_processed)

# Compute RMSE and R2 scores
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print("Best Model Parameters:", best_xgb_reg.get_params())
print("Best Model RMSE:", rmse)
print("Best Model R2 Score:", r2)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [50]:
best_params = best_xgb_reg.get_params()
new_xgb_reg = XGBRegressor(**best_params)

new_xgb_reg.fit(X_processed, y)

y_pred = new_xgb_reg.predict(X_processed)

rmse = mean_squared_error(y, y_pred, squared=False)
r2 = r2_score(y, y_pred)

print("Best Model RMSE:", rmse)
print("Best Model R2 Score:", r2)

Best Model RMSE: 10773.78556777796
Best Model R2 Score: 0.981595315287704


In [51]:
import joblib

# Assuming best_svr is the best model you want to save
joblib.dump(new_xgb_reg, 'final_best_xgb_reg_bsht_hw.pkl')

['final_best_xgb_reg_bsht_hw.pkl']

In [4]:
results = pd.DataFrame(columns=['Model', 'R2', 'Improvement', 'Reason'])

# Function to add a row to the DataFrame
def add_result(model_name, r2, improvement, reason):
    global results  # Ensure we're modifying the global DataFrame
    new_row = {
        'Model': model_name,
        'R2': r2,
        'Improvement': improvement,
        'Reason': reason
    }
    results = results.append(new_row, ignore_index=True)

# Example usage of the function
add_result('LinearRegression', 0.8837, 'No', 'Establishing a baseline with a simple, interpretable model')
add_result('Ridge', 0.8831, 'No', 'Evaluating regularization to prevent overfitting in linear models')
add_result('Lasso', 0.8831, 'No', 'Testing feature selection capabilities via regularization')
add_result('DecisionTreeRegressor', 0.8476, 'No', 'Exploring non-linear relationships with a simple tree model')
add_result('RandomForestRegressor', 0.8954, 'Yes', 'Improving performance using ensemble learning with multiple trees')
add_result('RandomForestRegressor w CrossVal', 0.8656, 'No', 'a Assessing model stability and performance with cross-validation')
add_result('SVR', 0.8361, 'No', 'Investigating the performance of Support Vector Regression for non-linear data')
add_result('GradientBoostingRegressor', 0.9091, 'Yes', 'Enhancing prediction accuracy with gradient boosting technique')
add_result('xgboost with HP Tuning', 0.9262, 'Yes', 'Optimizing performance with a powerful gradient boosting ensemble method')

  results = results.append(new_row, ignore_index=True)
  results = results.append(new_row, ignore_index=True)
  results = results.append(new_row, ignore_index=True)
  results = results.append(new_row, ignore_index=True)
  results = results.append(new_row, ignore_index=True)
  results = results.append(new_row, ignore_index=True)
  results = results.append(new_row, ignore_index=True)
  results = results.append(new_row, ignore_index=True)
  results = results.append(new_row, ignore_index=True)


In [5]:
results

Unnamed: 0,Model,R2,Improvement,Reason
0,LinearRegression,0.8837,No,"Establishing a baseline with a simple, interpr..."
1,Ridge,0.8831,No,Evaluating regularization to prevent overfitti...
2,Lasso,0.8831,No,Testing feature selection capabilities via reg...
3,DecisionTreeRegressor,0.8476,No,Exploring non-linear relationships with a simp...
4,RandomForestRegressor,0.8954,Yes,Improving performance using ensemble learning ...
5,RandomForestRegressor w CrossVal,0.8656,No,a Assessing model stability and performance wi...
6,SVR,0.8361,No,Investigating the performance of Support Vecto...
7,GradientBoostingRegressor,0.9091,Yes,Enhancing prediction accuracy with gradient bo...
8,xgboost with HP Tuning,0.9262,Yes,Optimizing performance with a powerful gradien...


# Test

In [52]:
test = pd.read_csv('test.csv')

In [57]:
ID=test['Id']
ID

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

In [58]:
X_test_transformed = pipeline.transform(test)

In [59]:
test_predictions = new_xgb_reg.predict(X_test_transformed)

In [60]:
data={'ID':ID,'SalePrice':test_predictions}
sub=pd.DataFrame(data)
sub.to_csv('sub.csv',index=False)

In [61]:
pd.read_csv('sub.csv')

Unnamed: 0,ID,SalePrice
0,1461,122618.860
1,1462,171609.770
2,1463,179532.980
3,1464,186192.840
4,1465,179937.550
...,...,...
1454,2915,80483.200
1455,2916,78083.170
1456,2917,168832.110
1457,2918,126029.195


# Finale Pipeline with model

In [62]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()

        df = df.drop(columns=['Id', 'GarageConstructionYear', 'SaleMonth', 'FenceQuality', 'AdditionalFeatureValue', 'AdditionalFeature'])

        building_category_mapping = {60: 'A', 20: 'B', 70: 'C', 50: 'D', 190: 'E', 45: 'F', 90: 'G',
                                     120: 'H', 30: 'I', 85: 'J', 80: 'K', 160: 'L', 75: 'M', 180: 'N', 40: 'O'}
        df['BuildingCategory'] = df['BuildingCategory'].map(building_category_mapping)

        median_street_length = df.groupby('District')['StreetLineLength'].transform('median')
        df['StreetLineLength'] = df['StreetLineLength'].fillna(median_street_length)
        df['AlleyAccessType'] = df['AlleyAccessType'].fillna('None').astype('category')
        df['MasonrySize'] = df['MasonrySize'].fillna(0)
        df['MasonryType'] = np.where(df['MasonrySize'] == 0, 'Na', df['MasonryType'])

        basement_cols = ['BasementHeight', 'BasementCondition', 'BasementAccess', 'BasementFinish1',
                         'BasementFinishedArea1', 'BasementFinish2', 'BasementFinishedArea2', 'BasementUnfinishedArea']
        df.loc[df['TotalBasementArea'] == 0, basement_cols] = df.loc[df['TotalBasementArea'] == 0, basement_cols].fillna(
            {'BasementHeight': 'Na', 'BasementCondition': 'Na', 'BasementAccess': 'Na', 'BasementFinish1': 'Na',
             'BasementFinishedArea1': 0, 'BasementFinish2': 'Na', 'BasementFinishedArea2': 0, 'BasementUnfinishedArea': 0})

        garage_cols = ['GarageInterior', 'GarageLocation', 'GarageQuality', 'GarageCondition']
        df.loc[df['GarageSize'] == 0, garage_cols] = df.loc[df['GarageSize'] == 0, garage_cols].fillna(
            {'GarageInterior': 'Na', 'GarageLocation': 'Na', 'GarageQuality': 'Na', 'GarageCondition': 'Na'})

        df['PoolQuality'] = np.where(df['PoolSize'] == 0, 'Na', df['PoolQuality'])
        df['FireplaceQuality'] = np.where(df['FireplaceCount'] == 0, 'Na', df['FireplaceQuality'])

        df['TotalBasementBathrooms'] = df['BasementFullBathrooms'] + df['BasementHalfBathrooms']
        df = df.drop(columns=['BasementFullBathrooms', 'BasementHalfBathrooms'])

        df['TotalBathrooms'] = df['FullBathrooms'] + df['HalfBathrooms']
        df = df.drop(columns=['FullBathrooms', 'HalfBathrooms'])

        df['PropertyAge'] = df['SaleYear'] - df['ConstructionYear']
        df['RenovationTime'] = df['SaleYear'] - df['RenovationYear']
        df = df.drop(columns=['SaleYear', 'ConstructionYear', 'RenovationYear'])

        return df

categorical_features = [
    'BuildingCategory', 'ZoningClassification', 'RoadAccessType', 'AlleyAccessType', 'ParcelShape',
    'TerrainFlatness', 'UtilityAvailability', 'ParcelSettings', 'District', 'RoadProximity1',
    'RoadProximity2', 'DwellingType', 'DwellingStyle', 'RoofType', 'RoofMaterial', 'ExteriorCladding1',
    'ExteriorCladding2', 'MasonryType', 'ExteriorQuality', 'ExteriorCondition', 'FoundationType',
    'BasementHeight', 'BasementCondition', 'BasementAccess', 'BasementFinish1', 'BasementFinish2',
    'HeatingType', 'HeatingQuality', 'AirConditioning', 'ElectricalSystem', 'KitchenQuality',
    'FunctionalityRating', 'FireplaceQuality', 'GarageLocation', 'GarageInterior', 'GarageQuality',
    'GarageCondition', 'DrivewayType', 'PoolQuality', 'SaleType', 'SaleCondition'
]
ordinal_columns = [
    'TerrainSlope', 'KitchenQuality', 'ExteriorQuality', 'HeatingQuality',
    'FunctionalityRating', 'FireplaceQuality', 'ExteriorCondition',
    'BasementHeight', 'BasementCondition', 'GarageQuality', 'GarageCondition',
    'PoolQuality'
]

numerical_features = [
    'StreetLineLength', 'ParcelSize', 'MaterialQuality', 'ConditionRating', 'RenovationTime',
    'MasonrySize', 'BasementFinishedArea1', 'BasementFinishedArea2', 'BasementUnfinishedArea',
    'TotalBasementArea', 'GroundFloorArea', 'UpperFloorArea', 'LowQualityArea', 'LivingArea',
    'TotalBasementBathrooms', 'TotalBathrooms', 'BedroomAbvGr', 'KitchenAbvGr', 'TotalRooms',
    'FireplaceCount', 'GarageCapacity', 'GarageSize', 'WoodDeckArea', 'OpenPorchArea', 'EnclosedPorchArea',
    'ThreeSeasonPorchArea', 'ScreenPorchArea', 'PoolSize', 'PropertyAge'
]

ordinal_mapping = {
    'TerrainSlope': ['Sev', 'Mod', 'Gtl'],
    'KitchenQuality': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'ExteriorQuality': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'HeatingQuality': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'FunctionalityRating': ['Sev', 'Maj2', 'Maj1', 'Mod', 'Min1', 'Min2', 'Typ'],
    'FireplaceQuality': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'ExteriorCondition': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BasementHeight': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BasementCondition': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageQuality': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageCondition': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'PoolQuality': ['Na', 'Fa', 'Gd', 'Ex'],
}


categorical_imputer = SimpleImputer(strategy='most_frequent')
numerical_imputer = SimpleImputer(strategy='median')

numerical_pipeline = Pipeline([
    ('imputer', numerical_imputer),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', categorical_imputer),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

ordinal_pipeline = Pipeline([
    ('imputer', categorical_imputer),
    ('encoder', OrdinalEncoder(categories=[ordinal_mapping[col] for col in ordinal_columns]))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, [col for col in categorical_features if col not in ordinal_columns]),
        ('ord', ordinal_pipeline, ordinal_columns)
    ])

best_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective': 'reg:squarederror',
    'random_state': 42
}
new_xgb_reg = XGBRegressor(**best_params)

pipeline = Pipeline(steps=[
    ('custom', CustomTransformer()),
    ('preprocessor', preprocessor),
    ('model', new_xgb_reg)
])

pipeline.fit(X, y)

y_pred = pipeline.predict(test)


In [66]:
import pickle

with open('trained_hw_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)
