In [None]:
!pip install pycaret #preparing your data to deploying

In [None]:
import numpy as np #  a large collection of high-level mathematical functions 
import pandas as pd #  for data manipulation and analysis
# Widen the pandas display limits so wide frames can be inspected.
# The fully-qualified option names are used because the bare patterns
# 'max_columns' / 'max_rows' can match more than one option (e.g. the
# styler.render.* options) and then raise an OptionError.
pd.set_option('display.max_columns', 300)  # show up to 300 columns
pd.set_option('display.max_rows', 90)  # show up to 90 rows

from sklearn.preprocessing import StandardScaler #Standardize features
from sklearn.neighbors import  KNeighborsRegressor # for classification
import scipy.stats #contains a large number of probability distributions
import matplotlib.pyplot as plt # creates a figure
import seaborn as sns #data visualization
sns.set_style('darkgrid')
from pycaret.regression import  setup, compare_models
#compare_models trains all the models in the model library using default hyperparameters and evaluates performance metrics using cross-validation
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score

from catboost import CatBoostRegressor #integrate a variety of different data types
from sklearn.linear_model import BayesianRidge , HuberRegressor, Ridge, OrthogonalMatchingPursuit
#BayesianRidge: linear regression formulated in probabilistic terms, with explicit priors on the parameters
#HuberRegressor: linear regression that is robust to outliers (squared loss for small residuals, absolute loss for large ones)
#Ridge: linear regression with L2 regularization
#OrthogonalMatchingPursuit: greedy sparse linear model that recovers a sparse signal from noisy measurements

from lightgbm import LGBMRegressor #gradient boosting framework that uses tree based learning algorithm.
from sklearn.ensemble import GradientBoostingRegressor #to predict a continuous value
from xgboost import XGBRegressor #Extreme Gradient Boosting

import optuna #automatic hyperparameter optimization software framework 

In [None]:
# Read the train, test and sample-submission CSV files, in that order
train0, test0, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
        '../input/house-prices-advanced-regression-techniques/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the training set
train0.head()

In [None]:
# Preview the first rows of the test set
test0.head()

In [None]:
# Preview the first rows of the sample submission file
sample_submission.head()

# Combine train and test set

In [None]:
# Keep the label and the test identifiers aside before dropping them
target = train0['SalePrice']
test_ids = test0['Id']

# Feature-only views of both sets
train1 = train0.drop(columns=['Id', 'SalePrice'])
test1 = test0.drop(columns='Id')

# Stack the train rows on top of the test rows under a fresh 0..n-1 index
data1 = pd.concat([train1, test1], axis=0, ignore_index=True)
    

In [None]:
# Display the combined train + test feature frame
data1

In [None]:
# Display the SalePrice column taken from the training set
target

# Cleaning

In [None]:
# Work on a copy so the combined frame data1 is not changed by the cleaning steps
data2 = data1.copy()

In [None]:
# Cast MSSubClass to string so it is excluded from the numeric-column steps
# (select_dtypes(np.number)) and is one-hot encoded by get_dummies later on
data2['MSSubClass'] = data2['MSSubClass'].astype(str)

# Fill Categorical Missing Values

In [None]:
# Columns whose missing entries are replaced by the constant label 'None'
# (presumably NaN means the house lacks that feature - confirm against the
# data description)
none_fill_columns = [
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
]
data2[none_fill_columns] = data2[none_fill_columns].fillna('None')

# Columns whose missing entries are replaced by the most frequent value
mode_fill_columns = [
    'MSZoning',
    'Utilities',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Electrical',
    'KitchenQual',
    'Functional',
    'SaleType',
]
for column in mode_fill_columns:
    most_frequent = data2[column].mode()[0]
    data2[column] = data2[column].fillna(most_frequent)

In [None]:
# Checkpoint: data3 starts as a copy of data2 (categorical gaps already filled)
data3 = data2.copy()

# Numeric Missing Values

In [None]:
def knn_imput(df, na_target):
    """Fill the missing values of one numeric column with KNN predictions.

    A ``KNeighborsRegressor`` with default settings is fitted on the rows
    where ``na_target`` is present, using every numeric column that contains
    no missing values as a feature. Its predictions replace the missing
    entries of ``na_target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``na_target`` and the numeric feature columns.
    na_target : str
        Name of the numeric column to impute.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the gaps in ``na_target`` filled. The input
        frame is not modified. If the column has no missing values, the copy
        is returned unchanged.
    """
    df = df.copy()

    numric_df = df.select_dtypes(np.number)
    missing_mask = numric_df[na_target].isna()

    # Nothing to impute: return early rather than calling predict() on an
    # empty feature matrix.
    if not missing_mask.any():
        return df

    # Column *labels* of the fully observed numeric features. Labels (not the
    # sub-frame itself) are what .loc needs as a column indexer.
    non_na_columns = numric_df.loc[:, numric_df.isna().sum() == 0].columns

    x_train = numric_df.loc[~missing_mask, non_na_columns]
    y_train = numric_df.loc[~missing_mask, na_target]
    x_test = numric_df.loc[missing_mask, non_na_columns]

    knn = KNeighborsRegressor()
    knn.fit(x_train, y_train)

    df.loc[missing_mask, na_target] = knn.predict(x_test)

    return df

In [None]:
# Numeric columns that contain missing values; each one is imputed in turn
# with knn_imput(), and the result feeds the next call
columns_to_impute = [
    'LotFrontage',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea',
]
for column in columns_to_impute:
    data3 = knn_imput(data3, column)

In [None]:
# Checkpoint: data4 starts as a copy of the fully imputed data3
data4 = data3.copy()

# Feature Engineering

In [None]:
# Above-ground living area divided by the count of rooms, baths and kitchens
rooms_and_baths = (
    data4['TotRmsAbvGrd']
    + data4['FullBath']
    + data4['HalfBath']
    + data4['KitchenAbvGr']
)
data4['SqFtPerRoom'] = data4['GrLivArea'] / rooms_and_baths

# Sum of the overall quality and overall condition ratings
data4['Total_Hom_Quality'] = data4['OverallQual'] + data4['OverallCond']

# Bathroom count where half baths weigh 0.5 (above ground first, then basement)
above_ground_baths = data4['FullBath'] + (0.5 * data4['HalfBath'])
data4['TotalBathrooms'] = (
    above_ground_baths + data4['BsmtFullBath'] + (0.5 * data4['BsmtHalfBath'])
)

# First-floor plus second-floor area
data4['HigerQualSF'] = data4['1stFlrSF'] + data4['2ndFlrSF']

In [None]:
# Checkpoint: data5 starts as a copy of data4 (engineered features included)
data5 = data4.copy()

# Feature Transformation

In [None]:
# Skewness measures the asymmetry of a distribution; build one row per
# numeric feature and flag those with |skew| >= 0.5
numeric_features = data5.select_dtypes(np.number).columns
skew_df = pd.DataFrame(numeric_features, columns=['Featuer'])
skew_df['Skew'] = [scipy.stats.skew(data5[feature]) for feature in numeric_features]
skew_df['Absolute Skew'] = skew_df['Skew'].abs()
skew_df['Skewed'] = skew_df['Absolute Skew'] >= 0.5



In [None]:
# Display the per-feature skewness table
skew_df

In [None]:
# Apply log(1 + x) to every feature flagged as skewed
skewed_features = skew_df.loc[skew_df['Skewed'], 'Featuer']
for column in skewed_features.values:
    data5[column] = np.log1p(data5[column])

# Cosine Transform for Cyclical Features

In [None]:
# Encode the month of sale on a 12-month cycle: one full cosine period
# (2*pi) spans 12 months. The result is stored in data5, the frame the
# following cells continue from.
data5['MoSold'] = np.cos((2 * np.pi / 12) * data5['MoSold'])

In [None]:
# Checkpoint: data6 starts as a copy of the transformed data5
data6 = data5.copy()

# Encode Categorical Features

In [None]:
# One-hot encode the categorical columns into dummy/indicator columns
data6 = pd.get_dummies(data6)

In [None]:
# Checkpoint: data7 starts as a copy of the one-hot encoded data6
data7 = data6.copy()

# Scaling

In [None]:
# Standardize every column, then rebuild the frame with the original
# row index and column labels
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data7)

data7 = pd.DataFrame(scaled_values, columns=data7.columns, index=data7.index)
data7

In [None]:
# Checkpoint: data8 starts as a copy of the scaled data7
data8 = data7.copy()

# Target Transformation

In [None]:
# Compare the distribution of the raw target with its log transform.
# sns.distplot is deprecated, so each panel is drawn with sns.histplot
# (density histogram + KDE) and the fitted normal curve is overlaid by hand.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(20, 10))

for ax, values, title in (
    (ax_raw, target, 'Without Log Transformation'),
    (ax_log, np.log(target), 'with log transfrom'),
):
    sns.histplot(values, kde=True, stat='density', ax=ax)

    # Maximum-likelihood normal fit, drawn over the range of the data
    mu, sigma = scipy.stats.norm.fit(values)
    grid = np.linspace(values.min(), values.max(), 200)
    ax.plot(grid, scipy.stats.norm.pdf(grid, mu, sigma), color='black')

    ax.set_title(title)

ax_log.set_xlabel('Log salePrice');

In [None]:
# The models are trained on log(SalePrice); predictions are mapped back
# with np.exp when the final blend is built
log_target = np.log(target)

# Split Data

In [None]:
# Rows up to the last training index label are the training set; the
# remaining rows are the test set, re-indexed from zero
last_train_label = train0.index.max()

train_final = data8.loc[:last_train_label, :].copy()
test_final = data8.loc[last_train_label + 1:, :].reset_index(drop=True).copy()
train_final

In [None]:
# Display the processed test features
test_final

In [None]:
# Display the log-transformed target
log_target

# Model Selection

In [None]:
#_ =setup(data=pd.concat([train_final, log_target], axis=1), target='SalePrice')

In [None]:
#compare_models()

# Baseline Model

In [None]:
# Baseline: CatBoost regressor with default hyperparameters, training log silenced
basline_model = CatBoostRegressor(verbose=0)

In [None]:
# Fit the baseline on the full training set against the log target
basline_model.fit(train_final, log_target)

In [None]:
# 10-fold cross-validation of the baseline; scores are negated MSE values
kf = KFold(n_splits=10)
result = cross_val_score(
    estimator=basline_model,
    X=train_final,
    y=log_target,
    scoring='neg_mean_squared_error',
    cv=kf,
)

In [None]:
# Flip the sign of the 'neg_mean_squared_error' scores to get the per-fold MSE
-result

In [None]:
# RMSE on the log target (root of the mean fold MSE), exponentiated so it
# reads as a multiplicative error factor on the price scale
np.exp(np.sqrt(np.mean(-result)))

In [None]:
# Distribution of the per-fold MSE values.
# sns.displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() only produces an extra empty figure. The size is
# set through height/aspect instead (10 in tall, 16 in wide).
sns.displot(-result, bins=10, kde=True, height=10, aspect=1.6);

# Hyperparameter Optimization

In [None]:
def br_objective(trial):
    """Optuna objective for tuning ``BayesianRidge``.

    Samples ``n_iter``, ``tol``, ``alpha_1``, ``alpha_2``, ``lambda_1`` and
    ``lambda_2`` from the trial, cross-validates a ``BayesianRidge`` built
    with them on ``train_final`` / ``log_target`` using the ``kf`` splitter,
    and scores each fold as ``exp(RMSE)`` of the log target.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold ``exp(RMSE)`` values (lower is better).
    """
    n_iter = trial.suggest_int('n_iter', 50, 600)
    tol = trial.suggest_loguniform('tol', 1e-8, 10.0)
    alpha_1 = trial.suggest_loguniform('alpha_1', 1e-8, 10.0)
    alpha_2 = trial.suggest_loguniform('alpha_2', 1e-8, 10.0)
    lambda_1 = trial.suggest_loguniform('lambda_1', 1e-8, 10.0)
    # Sampled under its own name; reusing 'alpha_2' here would tie lambda_2
    # to the alpha_2 parameter instead of tuning it independently.
    lambda_2 = trial.suggest_loguniform('lambda_2', 1e-8, 10.0)

    model = BayesianRidge(
        n_iter=n_iter,
        tol=tol,
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2
    )

    # cross_val_score fits a fresh clone of the estimator on every fold, so
    # no separate fit on the full training set is needed beforehand.
    fold_neg_mse = cross_val_score(
        model, train_final, log_target,
        scoring='neg_mean_squared_error', cv=kf,
    )
    cv_scores = np.exp(np.sqrt(-fold_neg_mse))
    return np.mean(cv_scores)
    

In [None]:
#study = optuna.create_study(direction='minimize')
#study.optimize(br_objective, n_trials=100)

In [None]:
#study.best_params

# Bagging Ensemble

In [None]:
# Hyperparameters for CatBoostRegressor
catboost_params = dict(
    iterations=6000,
    learning_rate=0.005,
    depth=4,
    l2_leaf_reg=1,
    eval_metric='RMSE',
    early_stopping_rounds=200,
    random_seed=42,
)

# Hyperparameters for BayesianRidge
bar_params = dict(
    n_iter=453,
    tol=0.03903603191164772,
    alpha_1=5.310640749773629e-05,
    alpha_2=2.2966081860670693e-06,
    lambda_1=9.99634986180216,
)

# Hyperparameters for LGBMRegressor
light_params = dict(
    num_leaves=39,
    max_depth=2,
    learning_rate=0.13705339989856127,
    n_estimators=273,
)

# Hyperparameters for Ridge
ridge_params = dict(alpha=631.1412445239156)

In [None]:
# Estimators of the ensemble, keyed by a short name
models = dict(
    catboost=CatBoostRegressor(verbose=0, **catboost_params),
    br=BayesianRidge(**bar_params),
    lightgbm=LGBMRegressor(**light_params),
    ridge=Ridge(**ridge_params),
    omp=OrthogonalMatchingPursuit(),
)

In [None]:
# Fit every ensemble member on the full training set against the log target
for name, model in models.items():
    model.fit(train_final, log_target)
    # A space separates the model name from the message
    print(f'{name} trained.')

# Evaluate

In [None]:
# Per-fold exp(RMSE) of every model under 10-fold cross-validation
kf = KFold(n_splits=10)
results = {}
for name, model in models.items():
    fold_neg_mse = cross_val_score(
        model,
        train_final,
        log_target,
        scoring='neg_mean_squared_error',
        cv=kf,
    )
    result = np.exp(np.sqrt(-fold_neg_mse))
    results[name] = result


In [None]:
# Display the per-fold scores of every model
results

In [None]:
# For each model: a separator with its name, then the mean and the standard
# deviation of its fold scores on separate lines
for name, result in results.items():
    print('__________________\n', name)
    print(np.mean(result), np.std(result), sep='\n')

In [None]:
# Weighted blend of the models' predictions, mapped back from log space
# to the price scale with np.exp before averaging
blend_weights = {
    'catboost': 0.4,
    'br': 0.2,
    'lightgbm': 0.2,
    'ridge': 0.1,
    'omp': 0.1,
}
final_predictions = sum(
    weight * np.exp(models[model_name].predict(test_final))
    for model_name, weight in blend_weights.items()
)

In [None]:
# Display the blended price predictions for the test set
final_predictions

In [None]:
# Two-column frame: the test identifiers next to the predicted SalePrice
predicted_prices = pd.Series(final_predictions, name='SalePrice')
submission = test_ids.to_frame().join(predicted_prices)

In [None]:
# Display the submission frame
submission

In [None]:
# Write the submission to CSV with a header row and without the row index
submission.to_csv('./submission.csv',index=False,header=True)