In [None]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Apply seaborn's default theme to every matplotlib figure drawn afterwards.
sns.set_theme()


In [None]:
# Training and test CSVs; the 'Id' column becomes the row index of each frame.
train_csv = "../input/house-prices-advanced-regression-techniques/train.csv"
test_csv = "../input/house-prices-advanced-regression-techniques/test.csv"

full_data, test_data = (
    pd.read_csv(csv_path, index_col='Id') for csv_path in (train_csv, test_csv)
)

# Cell output: (rows, columns) of the training frame and of the test frame.
(full_data.shape, test_data.shape)

# Data Exploration


In [None]:
# Preview the first rows of the training frame (rich table output).
full_data.head() 

In [None]:
# Per-column dtype and non-null count summary of the training frame.
full_data.info()

In [None]:
# Values that can stand in for "missing" without being recognised as NaN.
unusual_null_data = ['nan', "NAN", "NA", "NULL", {}, [], "?", ".", "-", "_", "", " ", "  "]

# One printed line per column: name, number of real NaNs, number of cells
# whose value equals one of the placeholder entries above.
for column in full_data.columns:
    column_values = full_data[column]
    placeholder_hits = np.array([value in unusual_null_data for value in column_values])
    print(column, column_values.isna().sum(), placeholder_hits.sum())

In [None]:
# Count NaNs per column, then chart only the columns that have any,
# ordered from fewest to most missing values.
missing = full_data.isna().sum()
columns_with_nulls = missing.loc[missing > 0]
px.bar(columns_with_nulls.sort_values(), title="Null Values per feature")

As we can see, there are many features with a large number of null values (e.g. PoolQC, Alley, etc.).
Before making any decision, let's separate the features into numerical and categorical ones, then continue our investigation.

In [None]:
# Box plot of the target column, to see its spread and the points beyond the whiskers.
px.box(full_data, y="SalePrice")

In [None]:
# Use a 16x8 canvas for this and all later matplotlib figures.
plt.rcParams["figure.figsize"] = (16, 8)

# Histogram of the target with a KDE overlay; the trailing ';' hides the Axes repr.
sns.histplot(full_data["SalePrice"], kde=True);

In [None]:
# Upper Tukey fence for SalePrice: Q3 + 1.5 * IQR.
# The quartiles are read from the data instead of being typed in by hand, so
# the result cannot drift out of sync with the frame. It reflects `full_data`
# as it is when this cell runs, so run it before the outlier rows are removed.
sale_price_q1, sale_price_q3 = full_data['SalePrice'].quantile([0.25, 0.75])
sale_price_iqr = sale_price_q3 - sale_price_q1

# Cell output: the fence value used as a guide for the outlier cut-off.
sale_price_q3 + 1.5 * sale_price_iqr

In [None]:
# Sales at or above this price are removed from the training data.
# The value is a rounded form of the upper Tukey fence (Q3 + 1.5 * IQR = 340,037.5)
# worked out in the previous cell.
SALE_PRICE_CUTOFF = 340000

# Rebind the name instead of dropping in place: the result is the same frame
# contents, but no object that an earlier cell displayed is mutated.
is_outlier = full_data['SalePrice'] >= SALE_PRICE_CUTOFF
full_data = full_data.drop(index=full_data[is_outlier].index)

In [None]:
# Same 16x8 canvas; redraw the target distribution now that the
# high-price rows have been removed.
plt.rcParams["figure.figsize"] = (16, 8)
sns.histplot(full_data["SalePrice"], kde=True);

In [None]:
# Target: the sale price column of the filtered training frame.
y = full_data['SalePrice']

# Features: a separate frame holding every column except the target.
X_full = full_data.drop(columns='SalePrice')

In [None]:
column_dtypes = X_full.dtypes
unique_counts = X_full.nunique()

# Numeric features: columns stored as int64 or float64.
numerical_cols = [
    name for name, dtype in column_dtypes.items()
    if dtype in ('int64', 'float64')
]

# Categorical features: object columns with fewer than 10 distinct values.
# Object columns with 10 or more distinct values are left out entirely.
categorical_cols = [
    name for name, dtype in column_dtypes.items()
    if unique_counts[name] < 10 and dtype == "object"
]

# Keep the same column subset, in the same order, for training and test data.
my_columns = [*numerical_cols, *categorical_cols]
X_full = X_full[my_columns].copy()
X_test = test_data[my_columns].copy()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: treat a missing value as its own 'no_feature' level,
# then one-hot encode; levels unseen during fitting are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='no_feature')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own pipeline.
column_pipelines = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

# NOTE(review): the preprocessor is fitted on every training row before the
# later train/validation split, so the medians and scaling statistics also
# see the validation rows.
# NOTE(review): X_full / X_test are rebound from DataFrames to transformed
# matrices here, so this cell cannot be re-run without first re-running the
# column-selection cell.
X_full = preprocessor.fit_transform(X_full)
X_test = preprocessor.transform(X_test)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, random_state=42)

# Sweep the learning rate from 0.10 up to (but excluding) 0.50 in steps of 0.01
# and record the error on both splits for each value.
learning_rate = np.arange(0.1, 0.5, 0.01)
RMSE_validation = []
RMSE_train = []

# Errors are measured on log prices; the target logs do not change between
# iterations, so compute them once.
log_y_valid = np.log(y_valid)
log_y_train = np.log(y_train)

for i in learning_rate:
    model_GradBoos = GradientBoostingRegressor(n_estimators=60, random_state=32, max_depth=2, learning_rate=i)
    model_GradBoos.fit(X_train, y_train)

    preds1 = model_GradBoos.predict(X_valid)
    preds2 = model_GradBoos.predict(X_train)

    # np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated in scikit-learn 1.4 and removed in 1.6,
    # while this form gives the same value on every version.
    RMSE_validation.append(np.sqrt(mean_squared_error(log_y_valid, np.log(preds1))))
    RMSE_train.append(np.sqrt(mean_squared_error(log_y_train, np.log(preds2))))

In [None]:
# Validation and training error against the learning rate.
# The y-axis is the RMSE of log prices (not the log of the RMSE), and each
# line carries a legend label so the two curves can be told apart.
fig, ax = plt.subplots()
sns.lineplot(x=learning_rate, y=RMSE_validation, label='validation', ax=ax)
sns.lineplot(x=learning_rate, y=RMSE_train, label='train', ax=ax)
ax.set(
    xlabel='learning_rate',
    ylabel='RMSE of log(SalePrice)',
    title='learning_rate VS (RMSE_validation & RMSE_train)',
);

In [None]:
from xgboost import XGBRegressor

# Sweep the number of boosting rounds from 20 up to (but excluding) 500 in steps of 5.
n_estimators_grid = np.arange(20, 500, 5)
# Kept under the old (misleading) name as well, because the plotting cell
# that follows reads `learning_rate` for its x-axis.
learning_rate = n_estimators_grid

RMSE_validation = []
RMSE_train = []

# Every setting except n_estimators is the same for each model in the sweep,
# so the shared keyword arguments are built once.
xgb_params = dict(
    base_score=0.5, booster='gbtree', colsample_bylevel=1,
    colsample_bynode=1, colsample_bytree=0.4, gamma=0.1, gpu_id=-1,
    importance_type='gain', interaction_constraints='',
    learning_rate=0.05, max_delta_step=0, max_depth=4,
    min_child_weight=1, monotone_constraints='()',
    num_parallel_tree=1, random_state=0,
    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
    tree_method='exact', validate_parameters=1, verbosity=None,
)

# Errors are measured on log prices; the target logs are loop-invariant.
log_y_valid = np.log(y_valid)
log_y_train = np.log(y_train)

for i in n_estimators_grid:
    model_xgb = XGBRegressor(n_estimators=i, **xgb_params)
    model_xgb.fit(X_train, y_train)

    preds1 = model_xgb.predict(X_valid)
    preds2 = model_xgb.predict(X_train)

    # np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated in scikit-learn 1.4 and removed in 1.6.
    RMSE_validation.append(np.sqrt(mean_squared_error(log_y_valid, np.log(preds1))))
    RMSE_train.append(np.sqrt(mean_squared_error(log_y_train, np.log(preds2))))

In [None]:
# Validation and training error against the number of boosting rounds.
# `learning_rate` holds the n_estimators grid at this point, despite its name.
# The y-axis is the RMSE of log prices (not the log of the RMSE), and each
# line carries a legend label so the two curves can be told apart.
fig, ax = plt.subplots()
sns.lineplot(x=learning_rate, y=RMSE_validation, label='validation', ax=ax)
sns.lineplot(x=learning_rate, y=RMSE_train, label='train', ax=ax)
ax.set(
    xlabel='n_estimators',
    ylabel='RMSE of log(SalePrice)',
    title='n_estimators VS (RMSE_validation & RMSE_train)',
);

In [None]:
# XGBoost model with the hand-picked settings (learning_rate=0.2, max_depth=2,
# n_estimators=125); this is the model later used for the test predictions.
model_xgb = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0.1, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=2,
             min_child_weight=0.5, monotone_constraints='()',
             n_estimators=125, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

model_xgb.fit(X_train, y_train)
preds1 = model_xgb.predict(X_valid)
preds2 = model_xgb.predict(X_train)

# RMSE on log prices. np.sqrt(MSE) replaces mean_squared_error(..., squared=False):
# the `squared` argument is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_valid = np.sqrt(mean_squared_error(np.log(y_valid), np.log(preds1)))
rmse_train = np.sqrt(mean_squared_error(np.log(y_train), np.log(preds2)))

print("RMSE validation: " , rmse_valid)
print("RMSE train:      " , rmse_train)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
# Same 80/20 split and seed as the earlier sweep cell, repeated so this cell
# does not depend on that one having been run.
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, random_state=42)

# Gradient boosting model with the hand-picked settings.
model_GradBoos = GradientBoostingRegressor(n_estimators=75, random_state=32, max_depth=2, learning_rate=0.19)

model_GradBoos.fit(X_train, y_train)
preds1 = model_GradBoos.predict(X_valid)
preds2 = model_GradBoos.predict(X_train)

# RMSE on log prices. np.sqrt(MSE) replaces mean_squared_error(..., squared=False):
# the `squared` argument is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_valid = np.sqrt(mean_squared_error(np.log(y_valid), np.log(preds1)))
rmse_train = np.sqrt(mean_squared_error(np.log(y_train), np.log(preds2)))

print("RMSE validation: " , rmse_valid)
print("RMSE train:      " , rmse_train)

In [None]:
# Predict test-set prices with the most recently fitted `model_xgb`;
# X_test has already been passed through the fitted preprocessor.
preds_test = model_xgb.predict(X_test)

In [None]:
# Two-column submission table: the test row Ids and the predicted prices.
submission_columns = {
    'Id': test_data.index,
    'SalePrice': preds_test,
}
output = pd.DataFrame(submission_columns)

# index=False: 'Id' is already a regular column, so the positional index is not written.
output.to_csv('submission.csv', index=False)