In [None]:
import pandas as pd
import numpy as np
import seaborn as sbs
import matplotlib.pyplot as plt

In [None]:
# Read the training and test tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# Feature Engineering

Remove Null Values

In [None]:
# Count missing values per column once, then show only the columns that have any.
train_null_counts = train_data.isnull().sum()
train_null_counts[train_null_counts > 0]

Unnamed: 0,0
LotFrontage,259
Alley,1369
MasVnrType,872
MasVnrArea,8
BsmtQual,37
BsmtCond,37
BsmtExposure,38
BsmtFinType1,37
BsmtFinType2,38
Electrical,1


In [None]:
num_cont = ["LotFrontage", "MasVnrArea", "GarageYrBlt"]

# Column means are taken from the training frame only and reused for the test
# frame, so no test-set statistics enter the preprocessing.
num_cont_means = train_data[num_cont].mean()

# Assign the filled columns back instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on a
# temporary copy and stops updating the frame in pandas 3.0.
train_data[num_cont] = train_data[num_cont].fillna(num_cont_means)
test_data[num_cont] = test_data[num_cont].fillna(num_cont_means)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[col].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(mean_value, inplace=True)  # use same mean


In [None]:
# Re-check which training columns still contain missing values.
train_null_counts = train_data.isnull().sum()
train_null_counts[train_null_counts > 0]

Unnamed: 0,0
Alley,1369
MasVnrType,872
BsmtQual,37
BsmtCond,37
BsmtExposure,38
BsmtFinType1,37
BsmtFinType2,38
Electrical,1
FireplaceQu,690
GarageType,81


Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fitted encoder per encoded column, kept so the mapping can be reused later.
label_encoders = {}

for col in train_data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    # astype(str) turns missing values into the literal string 'nan', which is
    # then encoded as its own class.
    train_data[col] = le.fit_transform(train_data[col].astype(str))

    # Labels that never occurred in the training column are mapped to the
    # sentinel class 'Unknown' (vectorised membership test instead of a
    # per-row lambda).
    test_labels = test_data[col].astype(str)
    test_labels = test_labels.where(test_labels.isin(le.classes_), 'Unknown')

    # Register the sentinel as an extra class, unless the training data
    # already contains a class with that exact name.
    if 'Unknown' not in le.classes_:
        le.classes_ = np.append(le.classes_, 'Unknown')
    test_data[col] = le.transform(test_labels)

    label_encoders[col] = le

# YrSold gets its own encoder. Re-fitting the loop variable `le` here would
# overwrite the classes of the encoder already stored for the last
# categorical column.
yr_sold_encoder = LabelEncoder()
train_data["YrSold"] = yr_sold_encoder.fit_transform(train_data["YrSold"])
test_data["YrSold"] = yr_sold_encoder.transform(test_data["YrSold"])
label_encoders["YrSold"] = yr_sold_encoder

In [None]:
# After encoding, list any training columns that still hold missing values.
train_null_counts = train_data.isnull().sum()
train_null_counts[train_null_counts > 0]

Unnamed: 0,0


In [None]:
#train_data.to_excel("standard.xlsx", index=False)

Standardization (Feature Scaling)

In [None]:
# Columns passed to the StandardScaler. Each name appears exactly once:
# "LotArea" was previously listed twice, which made the scaler fit and
# assign the same column twice.
to_standardize = [
    "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtUnfSF",
    "LotFrontage", "GrLivArea", "1stFlrSF",
    "TotalBsmtSF", "GarageArea", "OpenPorchSF", "WoodDeckSF",
    "2ndFlrSF", "EnclosedPorch", "MiscVal"
]

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training columns only, then apply
# the same fitted scaler to both frames.
scaler = StandardScaler().fit(train_data[to_standardize])
train_data[to_standardize] = scaler.transform(train_data[to_standardize])
test_data[to_standardize] = scaler.transform(test_data[to_standardize])

In [None]:
# Write the current (encoded and scaled) training frame to an Excel file,
# omitting the row index.
# NOTE(review): the file name looks like a typo for "standardization" — confirm
# before renaming, in case other tooling reads this path.
train_data.to_excel("standardation.xlsx", index=False)

Fill Remaining Missing Values in the Test Set

In [None]:
cols_to_check = [
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "BsmtFullBath", "BsmtHalfBath", "GarageCars", "GarageArea"
]

# Fill test-set gaps with the corresponding training-set column means
# (the training frame itself is not modified here).
test_fill_values = train_data[cols_to_check].mean()

# Assign the result back rather than using `test_data[col].fillna(...,
# inplace=True)`, which acts on a temporary copy and no longer updates the
# frame in pandas 3.0.
test_data[cols_to_check] = test_data[cols_to_check].fillna(test_fill_values)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(mean_value, inplace=True)  # use same mean


In [None]:
# List test-set columns that still contain missing values (expected: none).
test_null_counts = test_data.isnull().sum()
test_null_counts[test_null_counts > 0]

Unnamed: 0,0


In [None]:
# Write the preprocessed test frame to an Excel file, omitting the row index.
test_data.to_excel("standard_test.xlsx", index=False)

Splitting the Data

In [None]:
# Positional split: the first column is dropped from the features and the
# last training column is used as the target.
# NOTE(review): presumably column 0 is the row Id and the last column is
# SalePrice — confirm against train.csv.
target_position = -1
X_train = train_data.iloc[:, 1:target_position]
y_train = train_data.iloc[:, target_position]
X_test = test_data.iloc[:, 1:]

In [None]:
#y_train = y_train.reshape(-1, 1)

Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Search space for the random-forest grid search.
# (An earlier, commented-out grid over criterion / max_depth / ccp_alpha
# was dropped in favour of this one.)
parameters = dict(
    n_estimators=[5, 10, 20, 30],
    max_depth=[5, 6, 7, 8, 9],
    criterion=['squared_error', 'poisson'],
    ccp_alpha=[0, 1, 4, 7, 10],
)

In [None]:
# Fix the forest's seed so that repeated runs of the search give the same
# scores and the same selected parameters (42 matches the seed used by the
# randomized search later in this notebook).
treemodel = RandomForestRegressor(random_state=42)

# Exhaustive search over `parameters`, scored by R^2 with 4-fold
# cross-validation; n_jobs=-1 runs the fits in parallel on all cores.
cv = GridSearchCV(
    treemodel,
    param_grid=parameters,
    cv=4,
    scoring='r2',
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training features/target; this fits every
# parameter combination once per cross-validation fold.
cv.fit(X_train,y_train)

In [None]:
# Baseline: a forest with library-default hyperparameters, fitted on the full
# training set. `fit` returns the estimator itself, so the two steps are chained;
# the bare name on the last line displays the fitted model as before.
treemodel = RandomForestRegressor().fit(X_train, y_train)
treemodel

In [None]:
# Display the hyperparameters of the default-configured baseline forest.
treemodel.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Report the winning parameter combination and its mean cross-validated score,
# one per line.
print(cv.best_params_, cv.best_score_, sep="\n")

{'ccp_alpha': 0, 'criterion': 'poisson', 'max_depth': 5, 'n_estimators': 30}
0.8370641024000863


AttributeError: 'NoneType' object has no attribute 'isnull'

In [None]:
# Predict test-set targets with the search object; with scikit-learn's default
# refit=True this delegates to the best estimator refitted on all training data.
y_pred = cv.predict(X_test)


In [None]:
# First Id of the test rows. The original code hard-coded the range 1461-2919.
# NOTE(review): assumes test Ids are consecutive starting at 1461 — confirm
# against the Id column of test.csv.
first_test_id = 1461
rf_submission_path = "submission_finale_randomforest.csv"

# Derive the Id range from the number of predictions so the frame can always be
# built, instead of failing on a length mismatch with a fixed range.
submission = pd.DataFrame({
    "Id": range(first_test_id, first_test_id + len(y_pred)),
    "SalePrice": y_pred
})

submission.to_csv(rf_submission_path, index=False)
# Report the path that was actually written (the old message named a
# different file).
print(f"✅ File saved as {rf_submission_path}")


✅ File saved as submission.csv


In [None]:
# Build the Excel export of the random-forest predictions.
# The Id column previously used X_test.index, i.e. the 0-based row position
# produced by read_csv, which does not match the 1461-based Ids used for the
# CSV submissions in this notebook.
# NOTE(review): assumes test Ids are consecutive starting at 1461 — confirm
# against the Id column of test.csv.
df_pred = pd.DataFrame({
    'Id': range(1461, 1461 + len(y_pred)),
    'SalePrice': y_pred
})

df_pred.to_excel('Final_submission.xlsx', index=False)


In [None]:
# Re-open the exported predictions, append an 'ID' column running from 1461 to
# 2919 (range's end is exclusive), and write the result to a new workbook.
df = pd.read_excel("Final_submission.xlsx").assign(ID=range(1461, 2920))
df.to_excel("Final_submission_with_ID.xlsx", index=False)




# Gradient Boosting (scikit-learn GradientBoostingRegressor)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# regressor = GradientBoostingRegressor();
# regressor.fit(X_train,y_train)
#y_pred_xg = regressor.predict(X_test)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
# Search space for the gradient-boosting randomized search.
# "auto" is not accepted for `max_features` by the installed scikit-learn (it
# raised InvalidParameterError and made a fifth of the fits fail). None is the
# equivalent setting: consider all features at every split.
parameters = {
    "n_estimators": [100, 200, 300, 500],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [3, 4, 5, 6],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "subsample": [0.6, 0.8, 1.0],
    "max_features": [None, "sqrt", "log2"]
}


In [None]:
# Seed the booster: the search space includes subsample < 1.0 and feature
# sub-sampling, both of which are random, so an unseeded estimator gives
# different scores on every run. 42 matches the seed of the randomized search.
regressor = GradientBoostingRegressor(random_state=42)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Settings for the randomized search: 30 sampled candidates, 4-fold CV scored
# by R^2, a fixed seed for the sampling, all cores, and per-fit progress output.
search_settings = dict(
    param_distributions=parameters,
    n_iter=30,
    cv=4,
    scoring='r2',
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# Note: this rebinds `cv`, replacing the random-forest search object.
cv = RandomizedSearchCV(regressor, **search_settings)


In [None]:
# Run the randomized search on the training data, then report the best
# parameter combination and its mean cross-validated score, one per line.
cv.fit(X_train, y_train)
print(cv.best_params_, cv.best_score_, sep="\n")

Fitting 4 folds for each of 30 candidates, totalling 120 fits


24 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skle

{'subsample': 0.6, 'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 6, 'learning_rate': 0.05}
0.8886285626753343


In [None]:
# Predict test-set targets with the gradient-boosting search (`cv` now refers
# to the RandomizedSearchCV object, not the earlier random-forest grid search).
y_pred_xg = cv.predict(X_test)

In [None]:
# Blend the two models by averaging their predictions element-wise.
final_pred = (y_pred + y_pred_xg) / 2

# NOTE(review): this path holds the RF + gradient-boosting average even though
# its name suggests a single tuned model — confirm the intended file name.
ensemble_path = "xgboost_tuning.csv"

# Derive the Id range from the number of predictions; the original hard-coded
# 1461-2919.
# NOTE(review): assumes test Ids are consecutive starting at 1461 — confirm
# against the Id column of test.csv.
first_test_id = 1461
submission = pd.DataFrame({
    "Id": range(first_test_id, first_test_id + len(final_pred)),
    "SalePrice": final_pred
})

submission.to_csv(ensemble_path, index=False)
# Report the path that was actually written (the old message named a
# different file).
print(f"✅ File saved as {ensemble_path}")


✅ File saved as submission_finale_xgboost_rf_ensemble.csv


In [None]:
# NOTE(review): this path holds the gradient-boosting predictions only, even
# though its name mentions an RF ensemble — confirm the intended file name.
gb_submission_path = "submission_finale_xgboost_rf_ensemble.csv"

# Derive the Id range from the number of predictions; the original hard-coded
# 1461-2919.
# NOTE(review): assumes test Ids are consecutive starting at 1461 — confirm
# against the Id column of test.csv.
first_test_id = 1461
submission = pd.DataFrame({
    "Id": range(first_test_id, first_test_id + len(y_pred_xg)),
    "SalePrice": y_pred_xg
})

submission.to_csv(gb_submission_path, index=False)
# Report the path that was actually written (the old message said
# "submission.csv").
print(f"✅ File saved as {gb_submission_path}")

✅ File saved as submission.csv
