In [39]:
import pandas as pd 
import numpy as np 
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

In [40]:
df = pd.read_csv('./train.csv')

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# Split the DataFrame into features (X) and target (y)
X = df.drop(['SalePrice', 'Id'], axis=1)  # Drop the target column to create features
y = df['SalePrice']                   # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [62]:
y_test

892     154500
1105    325000
413     115000
522     159000
1036    315500
         ...  
1010    135000
390     119000
1409    215000
847     133500
1284    169000
Name: SalePrice, Length: 482, dtype: int64

In [63]:
numerical_col = ['MSSubClass','LotArea','OverallQual','OverallCond','YearBuilt','YearRemodAdd','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','MoSold','YrSold',]
one_hot_encoding_cols = [
    'MSZoning', 'Alley', 'LandSlope', 'Neighborhood', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'MasVnrType', 'Exterior2nd', 'Foundation',
    'Heating', 'CentralAir', 'GarageType', 'Fence', 'MiscFeature', 'Electrical'
]
ordinal_encoding_cols = [
    'Street', 'LotShape', 'LandContour', 'LotConfig', 'Utilities', 'Condition1', 
    'Condition2', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 
    'GarageFinish', 'GarageQual', 'PoolQC'
]
pt_list =[]

In [64]:
def separate_columns_by_dtype(df):
    int_columns = df.select_dtypes(include=['int64']).columns.tolist()
    float_columns = df.select_dtypes(include=['float64']).columns.tolist()
    object_columns = df.select_dtypes(include=['object']).columns.tolist()

    return int_columns, float_columns, object_columns

In [65]:
int_columns, float_columns, object_columns = separate_columns_by_dtype(df)

In [66]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [67]:
one = OneHotEncoder(sparse_output=False)
odi = OrdinalEncoder()

In [68]:
from sklearn.compose import ColumnTransformer

In [69]:
# processing pipeline for numerical columns
numeric_processor = Pipeline(
    steps = [('imputation_mean', SimpleImputer(missing_values= np.nan, strategy='mean')),
            ('min_max_scaler', MinMaxScaler())
            ]
)

In [70]:
# preprocessin for categorical columns
odinal_cat = Pipeline(
    steps = [
            ('odinal', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value= -1, encoded_missing_value = -1))
    ]
)

In [71]:
onehot_cat = Pipeline(
    steps = [
            ('imputation_constant', SimpleImputer(fill_value= 'None', strategy='constant')), 
            ('oneHot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [72]:
numeric_processor

In [73]:
odinal_cat

In [74]:
onehot_cat

In [75]:
preprocessor = ColumnTransformer(
                    [('numerical', numeric_processor, numerical_col),
                    ('odinal_cat', odinal_cat, ordinal_encoding_cols),
                    ('onehot_cat', onehot_cat, one_hot_encoding_cols),], 
                    remainder="drop")

In [76]:
# preprocessor.fit_transform(X_train)

In [77]:
# pandas_df = preprocessor.fit_transform(df)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRFRegressor
from sklearn import set_config

rfr = RandomForestRegressor()
dtf = DecisionTreeRegressor()
xgb = XGBRFRegressor()
set_config(display='diagram')

In [78]:
pipe = Pipeline(steps = [
    ('preprocessor', preprocessor), ('regressor', rfr)
])

# pip

In [79]:
# pipe.fit(X_train, y_train)

In [80]:
# pipe.predict(X_test)

In [81]:
param_grid = [{
    'regressor__n_estimators': [200, 500],  # Number of trees
    # 'regressor__max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider when looking for the best split

}]

In [82]:
grid = GridSearchCV(
    pipe, param_grid= param_grid, cv = 5, 
)

In [83]:

grid.fit(X_train, y_train)

y_pred =grid.predict(X_test)


In [84]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def matrices_regression(y_test, y_pred):
    print('MSE', mean_squared_error(y_test, y_pred))
    
    print('MAE', mean_absolute_error(y_test, y_pred))
    
    
    # print('RMSE', np.sqrt(mean_squared_error(y_test, y_pred)))
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    
    print('R2 Score', r2_score(y_test, y_pred))
    
    n= len(X_train)
    p = len(X_train.columns)
    adj_R2 = 1- ((1-r2_score(y_test, y_pred)) * (n-1)/(n-p-1))
    print('Adjusted R2 Score', adj_R2 )


In [85]:
 matrices_regression(y_test, y_pred)

MSE 924810404.1688384
MAE 17553.022427385895
Root Mean Squared Error (RMSE): 30410.695555492286
R2 Score 0.874027911230417
Adjusted R2 Score 0.8629457341560327


In [96]:
df_test = pd.read_csv('./test.csv')

In [98]:
sumbission = df_test['Id'] 

In [88]:
# df_test = df_test.drop(['Id'], axis=1)

In [90]:
test_pred = grid.predict(df_test)

In [93]:
test_pred.shape

(1459,)

In [99]:
sumbission['SalePrice'] = test_pred

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sumbission['SalePrice'] = test_pred


In [100]:
sumbission

0                                                         1461
1                                                         1462
2                                                         1463
3                                                         1464
4                                                         1465
                                   ...                        
1455                                                      2916
1456                                                      2917
1457                                                      2918
1458                                                      2919
SalePrice    [126216.5, 155434.75, 185851.84, 184581.685, 2...
Name: Id, Length: 1460, dtype: object