In [1]:
import pandas as pd 
import numpy as np 
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('./train.csv')

In [3]:
df.shape

(1460, 81)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# Split the DataFrame into features (X) and target (y)
X = df.drop(['SalePrice'], axis=1)  # Drop the target column to create features
y = df['SalePrice']                   # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [5]:

one_hot_encoding_cols = [
    'MSZoning', 'Alley', 'LandSlope', 'Neighborhood', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'MasVnrType', 'Exterior2nd', 'Foundation',
    'Heating', 'CentralAir', 'GarageType', 'Fence', 'MiscFeature', 'Electrical'
]
ordinal_encoding_cols = [
    'Street', 'LotShape', 'LandContour', 'LotConfig', 'Utilities', 'Condition1', 
    'Condition2', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 
    'GarageFinish', 'GarageQual', 'PoolQC'
]

df_numeric = df.drop(['Id', 'SalePrice'], axis = 1).select_dtypes(exclude='object').columns
# df_object = df.select_dtypes(include='object').columns

# df_object.isin(one_hot_encoding_cols).sum()

df_numeric


Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [6]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [7]:
one = OneHotEncoder(sparse_output=False)
odi = OrdinalEncoder()

In [8]:
from sklearn.compose import ColumnTransformer

In [9]:
# processing pipeline for numerical columns
num_cat = Pipeline(
    steps = [('imputation_mean', SimpleImputer(missing_values= np.nan, strategy='mean')),
            ]
)

In [10]:
# preprocessin for categorical columns
odinal_cat = Pipeline(
    steps = [
            ('imp_con_odi', SimpleImputer(fill_value= 'None', strategy='constant')),
            ('odinal', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value= -1, encoded_missing_value = -1))
    ]
)

In [11]:
onehot_cat = Pipeline(
    steps = [
            ('imp_con_one', SimpleImputer(fill_value= 'None', strategy='constant')), 
            ('oneHot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [12]:
# num_cat

In [13]:
# odinal_cat

In [14]:
# onehot_cat

In [20]:
preprocessor = ColumnTransformer(
                    [('numeri_catl', num_cat, df_numeric),
                    ('odinal_cat', odinal_cat, ordinal_encoding_cols),
                    ('onehot_cat', onehot_cat, one_hot_encoding_cols),], 
                    remainder= 'passthrough',
                    force_int_remainder_cols=False
            )

In [28]:
preprocessor

In [29]:
preprocessor.fit_transform(X_train)

array([[50.0, 86.0, 11500.0, ..., 'Y', 'WD', 'Normal'],
       [20.0, 67.0, 16285.0, ..., 'Y', 'WD', 'Normal'],
       [85.0, 60.0, 7200.0, ..., 'Y', 'WD', 'Normal'],
       ...,
       [20.0, 60.0, 8172.0, ..., 'N', 'WD', 'Normal'],
       [50.0, 55.0, 7642.0, ..., 'Y', 'WD', 'Normal'],
       [120.0, 53.0, 3684.0, ..., 'Y', 'WD', 'Normal']], dtype=object)

In [22]:
# pandas_df = preprocessor.fit_transform(df)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRFRegressor
from sklearn import set_config

mmx = MinMaxScaler

rfr = RandomForestRegressor()
dtf = DecisionTreeRegressor()
xgb = XGBRFRegressor()
set_config(display='diagram')

In [23]:
# Define parameter grid for each model
param_grid = [
    {
        'model': [rfr],
        'model__n_estimators': [10, 20, 30, 50, 100, 150],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
        'model__max_features': ['sqrt', 'log2']
    },
    {
        'model': [dtf],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__splitter': ['best', 'random'],
        'model__max_features': ['sqrt', 'log2']
    },
    {
        'model': [xgb],
        'model__n_estimators': [50, 100, 150],
        'model__max_depth': [3, 6, 9],
        'model__learning_rate': [0.01, 0.1, 0.2]
    }
]

# Define the pipeline with a placeholder model
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', mmx),  # Replace mmx with your scaler (e.g., StandardScaler())
    ('model', rfr)  # Placeholder, GridSearchCV will replace this
])
pipe

In [24]:
# Set up GridSearchCV with error_score='raise' for detailed debugging if any fit fails
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    verbose=2,
    error_score='raise'
)

# Fitting the model (replace X_train and y_train with your training data)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 315 candidates, totalling 1575 fits


ValueError: could not convert string to float: 'Typ'

In [None]:
y_pred = grid.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_error
def matrices_regression(y_test, y_pred):
    print('MSE', mean_squared_error(y_test, y_pred))
    
    print('MAE', mean_absolute_error(y_test, y_pred))
    
    
    # print('RMSE', np.sqrt(mean_squared_error(y_test, y_pred)))
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    
    print('R2 Score', r2_score(y_test, y_pred))
    
    n= len(X_train)
    p = len(X_train.columns)
    adj_R2 = 1- ((1-r2_score(y_test, y_pred)) * (n-1)/(n-p-1))
    print('Adjusted R2 Score', adj_R2 )



In [None]:
 # matrices_regression(y_test, y_pred)

In [None]:
df_test = pd.read_csv('./test.csv')

In [None]:
sumbission = df_test[['Id']].copy()

In [None]:
# df_test = df_test.drop(['Id'], axis=1)

In [None]:
# test_pred = grid.predict(df_test)

In [None]:
# sumbission['SalePrice'] = test_pred

In [None]:
len(grid.best_estimator_.feature_names_in_
)

In [None]:
sumbission

In [None]:
sumbission.to_csv('submission.csv', index=False)

In [None]:
best_model = grid.best_estimator_
best_params = grid.best_params_
print("Best Model:", best_model)
print("Best Parameters:", best_params)