In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Work flow
It is always helpful to structure your work first. Details can be adjusted once you find some insights in the data.
1. EDA
2. Feature Engineering
3. Preprocessing for modeling
4. Build model

> # 1. EDA

# 1.1 Get General Idea About the Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
print(train.head())
print(test.head())

In [None]:
print('train shape',train.shape)
print('test shape',test.shape)
print('duplicated rows',train.duplicated().sum())
print('columns containing missing values',train.isnull().any().sum())

In [None]:
test['SalePrice'] = np.nan
data_all = pd.concat([train,test],ignore_index=True)
print('merged shape ',data_all.shape)
print(data_all.info())

> Note that it is important to distinguish the numerical variables from the categorical ones. 
* Here are some key takeaways:
1. The dtype only gives us a big picture of how the data is stored. However, it is still important to check whether some 'int' columns are actually ordinal or nominal. For example, in this case, 'Id' should be nominal (or not considered a feature at all), and 'MSSubClass' can be ordinal or nominal depending on how you analyse it.
2. The corresponding visualization and analysis techniques can be different. For example, for numerical data, it is easy to compute the Pearson correlation. On the other hand, drawing a separate histogram per category is a nice way to show how the variable of interest is distributed across the categories.
3. Here we use dtype == ['int','float'] to find numerical variables and 'object' to find categories. Numerical variables can be further split into "continuous" ones such as height, and "interval" ones such as decibel. Categorical variables can be split into "ordinal" and "nominal". Sometimes, ordinal ones can be translated into numbers to show their ranking. Nominal ones may need to be processed by one-hot encoding, which leads to an undesirably high dimension. So dealing with high-cardinality nominal variables is also an important part of feature engineering.
4. Time is an interesting feature; it is helpful to store it as datetime64. However, drawing some line plots against time first is a good way to decide how to deal with it.

In [None]:
#This is a nice code to use at the very beginning
data_object = data_all.select_dtypes('object')
print('object shape ',data_object.shape)
data_num = data_all.select_dtypes(['int64','float64'])
print('num shape ',data_num.shape)

In [None]:
data_num.describe()

In [None]:
for i in data_object.columns:
    print(data_object[i].value_counts())

In [None]:
#Observed that some int features are not necessary continuous variable
#Should have checked first 

In [None]:
int_col = train.select_dtypes('int64').columns
def show_sample(column_list,row_range_start=0):
    """Print a 10-row sample of ``train`` for the given columns, 10 columns at a time.

    Parameters
    ----------
    column_list : sequence of column labels (list or pandas Index)
        Columns of the module-level ``train`` frame to display.
    row_range_start : int, default 0
        Position of the first row to show; rows
        ``row_range_start .. row_range_start + 9`` are printed.

    Notes
    -----
    The original ``while``/``try``/bare-``except`` version did not advance its
    counter in the ``except`` branch, so an error could never be recovered
    from (it either re-raised or looped forever). Slicing past the end of a
    sequence is already safe, so a plain chunked loop is sufficient and lets
    real errors (e.g. an unknown column) surface immediately.
    """
    chunk_size = 10
    row_range_end = row_range_start + chunk_size
    for start in range(0, len(column_list), chunk_size):
        chunk = list(column_list[start:start + chunk_size])
        print(train[chunk].iloc[row_range_start:row_range_end])
show_sample(int_col)

#  1.1.1 Some little feature engineering

In my first trial, I didn't notice that this processing could be done first. In this case, the EDA is clearer once some features have already been processed.

In [None]:
discrete_int = ['Id','MSSubClass']
time_int = ['YrSold','YearBuilt','YearRemodAdd','MoSold']

In [None]:
train[discrete_int] = train[discrete_int].astype('object')
data_all[discrete_int] = data_all[discrete_int].astype('object')

In [None]:
def get_num_features(df):
    """Return the names of the int64/float64 columns of ``df``, excluding the target.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list
        Column labels with dtype int64 or float64, in frame order, with
        ``'SalePrice'`` left out whether or not it is present.
    """
    numeric_cols = df.select_dtypes(['int64','float64']).columns
    # Filter explicitly instead of list.remove() inside a bare except, which
    # would also have hidden unrelated errors.
    return [col for col in numeric_cols if col != 'SalePrice']
def get_cat_features(df):
    """Return the names of the object-dtype columns of ``df`` as a list, in frame order."""
    object_frame = df.select_dtypes('object')
    return object_frame.columns.tolist()

In [None]:
num_train = train.select_dtypes(['int64','float64'])
object_train = train.select_dtypes('object')
num_train_corr = num_train.corr()

In [None]:
num_all = data_all.select_dtypes(['int64','float64'])
object_all = data_all.select_dtypes('object')
num_all_corr = num_all.corr()

#  1.2 EDA for Numerical Data

In [None]:
fig,ax=plt.subplots(figsize=(20,20))
sns.heatmap(num_all_corr, cmap='Reds')
plt.show()
fig,ax=plt.subplots(figsize=(20,20))
sns.heatmap(num_train_corr, cmap='Reds')
plt.show()
#1.Observed that OverallQual is correlated w/ lots of others
#2.As well as TotalBsmtSF, 1stFlrSF, GrLivArea
#3.Similar effects also happens to GarageYrBlt/Cars/Area
#and it's obvious that they are highly correlated with each other
#Lastly let's focus on the features that have the highest correlation with SalesPrice



In [None]:
#rank the sign and correlationship
Correlation = pd.DataFrame(num_train.corr()['SalePrice'])
Correlation['Abs'] = np.abs(Correlation['SalePrice'])
Correlation = Correlation.sort_values(by='Abs',ascending=False)
#Most of the features are 'Positively' correlated with SalePrice
#Now use 0.5 as a threshold to pick out the important ones
important_features_CC = list(Correlation[Correlation['Abs'] > 0.5].index)
important_features_CC.remove('SalePrice')
print(important_features_CC)
#So now let's take a look at the details for the 10 mentioned ones in these sections
fig,ax=plt.subplots(figsize=(20,20))
sns.set(font_scale=1.5)
sns.heatmap(train[important_features_CC+['SalePrice']].corr(),annot=True,annot_kws={"size": 20})

In [None]:
#Since there are way too many variables with distinct definition
#We can start by learning some background knowledge of the data through these features.
#Here are some of the summary I would make at the beginning of EDA
# A notebook cell only displays its LAST bare expression, so five separate
# describe() calls would show just the final one. Summarise all five
# features in a single table instead.
summary_cols = ['GarageCars','FullBath','TotRmsAbvGrd','YearBuilt','YearRemodAdd']
data_all[summary_cols].describe()
# 
# 'OverallQual', 1-10 ratings summarizing the house
# 'GrLivArea', above ground area(feet^2)
# 'GarageCars', cars capacity(0-5)
# 'GarageArea', garage size area
# 'TotalBsmtSF', basement area(feet^2)
# '1stFlrSF', 1 flr area(feet^2)
# 'FullBath', amount of bathrooms above ground(1-4)
# 'TotRmsAbvGrd', amount of rooms above ground(2-15)
# 'YearBuilt', construction date,1872-2010
# 'YearRemodAdd', remodel date, 1950-2010
# 

# 1.3 EDA for Categorical Data

In [None]:
# Count plots and per-category SalePrice histograms show how the target is
# distributed within each group of a categorical feature.
object_train['SalePrice'] = train['SalePrice']
sns.set(font_scale=1)
plt.rcParams["figure.figsize"] = (10,6)

skip_cols = ('Id','SalePrice')
for col in object_train.columns:
    if col in skip_cols:
        continue
    levels = object_train[col].unique()
    print('Categories for',col,":",len(levels))
    sns.countplot(x=col,data=object_train)
    plt.title(col)
    plt.show()
    # Overlay one semi-transparent SalePrice histogram per category level.
    for level in levels:
        level_prices = object_train.loc[object_train[col] == level, 'SalePrice']
        plt.hist(level_prices,alpha=0.5,label=level)
    plt.legend(loc='upper right')
    plt.show()

In [None]:
sns.displot(train['SalePrice'])
plt.show()
#SalePrice itself is right-skewed

# 1.4 Observe Time-related Features

In [None]:
#Focus on Time related features
def draw_time(data,time_feature,y='SalePrice'):
    """Plot the per-period mean and the per-period count of ``y`` against ``time_feature``.

    ``data`` is grouped by ``time_feature``; two line plots are shown in
    sequence: first the group means of ``y``, then the group counts of ``y``
    (``count`` ignores missing values).
    """
    grouped_y = data.groupby(time_feature)[y]
    panels = (('Mean ', grouped_y.mean()), ('Count ', grouped_y.count()))
    for prefix, series in panels:
        sns.lineplot(x=series.index,y=series)
        plt.title(prefix+y+' Against '+time_feature)
        plt.show()

In [None]:
data_all['Sold_time'] = data_all['YrSold'].astype(str)+'/'+data_all['MoSold'].astype(str)
data_all['Sold_time'] = pd.to_datetime(data_all['Sold_time'] , format='%Y/%m')
data_all['MoSold'] = data_all['MoSold'].astype('object')

In [None]:
draw_time(data=data_all,time_feature='Sold_time')
draw_time(data=data_all,time_feature='YearBuilt')
draw_time(data=data_all,time_feature='YearRemodAdd')
#seasonality found in sold time and price
#trend found in remodel time and price

> # 2. Feature Engineering

> FEATURE ENGINEERING PROCESS
1. For Categorical('Object')
* Ordinal: change to int and consider scale
* Nominal: target the high-cardinality ones and try to reduce the dimension; apply one-hot encoding
2. For numerical('int','float')
* Change some 'int' into 'object' if it's actually ordinal or nominal (Shown in EDA)
* Continuous: check whether to adjust the skewness; standardize
* Interval: normalize
* Time: seasonality, trend
3. Deal with missing value
4. Deal with outliers
5. Conduct the needed transformation mentioned above

> Feature Selection
* List out the ones that contribute nothing in EDA
* First tryout the ones seem promising
* Deal with multicollinearity or real-world relationships (domain knowledge needed)

# 2.1 Feature Engineering for Categorical Features

In [None]:
#Change Ordinal features into numbers
# NOTE: pd.read_csv (with default settings, as used when loading the data)
# parses the literal text "NA" as a missing value, so a {'NA': 0} entry in a
# replace-dict can never match. The "feature absent" level therefore has to be
# assigned by filling the missing values after mapping.
QUALITY_SCALE = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1}
ordinal_maps = {
    'ExterQual': QUALITY_SCALE,
    'ExterCond': QUALITY_SCALE,
    'BsmtQual': QUALITY_SCALE,
    'BsmtCond': QUALITY_SCALE,
    'BsmtExposure': {'Gd':4,'Av':3,'Mn':2,'No':1},
    'HeatingQC': QUALITY_SCALE,
    'KitchenQual': QUALITY_SCALE,
    'FireplaceQu': QUALITY_SCALE,
    'GarageFinish': {'Fin':3,'RFn':2,'Unf':1},
    'GarageQual': QUALITY_SCALE,
    'GarageCond': QUALITY_SCALE,
    'PavedDrive': {'Y':3,'P':2,'N':1},
    'PoolQC': {'Ex':4,'Gd':3,'TA':2,'Fa':1},
    'Fence': {'GdPrv':4,'MnPrv':3,'GdWo':2,'MnWw':1},
}
# Columns whose original mapping carried an 'NA': 0 level, i.e. where a
# missing value was meant to be encoded as 0 ("no basement / garage / ...").
# NOTE(review): a handful of rows may be genuinely unknown rather than absent;
# they are also encoded as 0 here - confirm this is acceptable.
absent_means_zero = {'BsmtQual','BsmtCond','BsmtExposure','FireplaceQu',
                     'GarageFinish','GarageQual','GarageCond','PoolQC','Fence'}

for col, mapping in ordinal_maps.items():
    # Skip columns that are already numeric so the cell can be re-run safely
    # (mapping an already-encoded column would turn every value into NaN).
    if pd.api.types.is_numeric_dtype(data_all[col]):
        continue
    # .map yields a proper numeric column; labels outside the mapping become NaN.
    encoded = data_all[col].map(mapping)
    if col in absent_means_zero:
        encoded = encoded.fillna(0)
    data_all[col] = encoded

In [None]:
#pickout the categrorical features with high cardinality
high_card_col_cat = [i for i in get_cat_features(data_all) if len(data_all[i].unique()) >= 10]
high_card_col_cat.remove('Id')

In [None]:
#build some new features based on our findings in EDA or domain
from scipy import stats
# A bare expression in the middle of a cell is not displayed, so print the
# correlation between the two exterior ratings explicitly.
print(stats.pearsonr(data_all['ExterQual'],data_all['ExterCond']))
#a weak correlation suggests quality and condition carry separate information,
#so their product may capture an interaction effect
data_all['OverallValue'] = data_all['OverallQual'] * data_all['OverallCond']
data_all['ExterValue'] = data_all['ExterQual'] * data_all['ExterCond']
# Store the basement interaction under its own name, like the other *Value
# features; writing it back into 'BsmtQual' destroyed the original rating and
# compounded the product every time the cell was re-run.
data_all['BsmtValue'] = data_all['BsmtQual'] * data_all['BsmtCond']
data_all['GarageValue'] = data_all['GarageQual'] * data_all['GarageCond']
data_all['TotalArea'] = data_all['TotalBsmtSF'] + data_all['1stFlrSF'] + data_all['2ndFlrSF']

In [None]:
print(data_all.groupby('MSSubClass')['SalePrice'].mean())
print(data_all.groupby('Exterior1st')['SalePrice'].mean())
print(data_all.groupby('Exterior2nd')['SalePrice'].mean())
# Lookup tables keyed by the MSSubClass code as a string.
# Stories: number of stories assigned to each dwelling class.
MSSubClass_Stories = {
    '20':1, '30':1, '40':1, '45':1.5, '50':1.5, '60':2,
    '70':2, '75':2.5, '120':1, '150':1.5, '160':2,
}
# Ages: age score assigned to each dwelling class (values are 0, 0.5 or 1).
MSSubClass_Ages = {
    '20':1, '30':0, '40':0.5, '45':0.5, '50':0.5, '60':1,
    '70':0, '75':0.5, '90':0.5, '120':1, '150':0.5, '160':1,
}
MSSubClass_Other = []


def get_MSSubClass_Stories(data):
    """Return the stories value for an MSSubClass code; codes not in the table map to 2."""
    return MSSubClass_Stories.get(str(data), 2)


def get_MSSubClass_Ages(data):
    """Return the age score for an MSSubClass code; codes not in the table map to 0.5."""
    return MSSubClass_Ages.get(str(data), 0.5)


data_all['Stories'] = data_all['MSSubClass'].apply(get_MSSubClass_Stories)
data_all['Ages'] = data_all['MSSubClass'].apply(get_MSSubClass_Ages)


In [None]:
#Use data from outside, outside data can often help making decision if possible
#this part is inspired by Mustafa Cicek, check out his amazing notebook as well:)

# Region -> neighbourhood codes. Regions are checked in the order listed, so a
# neighbourhood that appears under two regions resolves to the first one.
geo = {
    # "NAmes" (North Ames) was misspelled "Names", which matched no row and
    # left every North Ames house without a region.
    "North": ["Blmngtn", "BrDale", "ClearCr", "Gilbert", "NAmes", "NoRidge", "NPkVill",
              "NridgHt", "Sawyer", "Somerst", "StoneBr", "Veenker"],
    "South": ["Blueste", "Edwards", "Mitchel", "MeadowV", "SWISU", "IDOTRR", "Timber"],
    "Downtown": ["BrkSide", "Crawfor", "OldTown", "CollgCr"],
    # NOTE(review): "Edwards" and "SWISU" are also listed under South, which
    # is checked first, so these two West entries never take effect - confirm
    # which region is intended.
    "West": ["Edwards", "NWAmes", "SWISU", "SawyerW"],
}


def find_geo(neighborhood):
    """Return the region name for a neighbourhood code.

    Parameters
    ----------
    neighborhood : str
        Value from the ``Neighborhood`` column.

    Returns
    -------
    str or float
        The first region in ``geo`` whose list contains ``neighborhood``,
        or ``np.nan`` when no region lists it.
    """
    for region, neighborhoods in geo.items():
        if neighborhood in neighborhoods:
            return region
    return np.nan
data_all['Geo'] = data_all['Neighborhood'].apply(find_geo)
print(data_all.groupby('Geo')['SalePrice'].mean())

# 2.2 Feature Engineering for Numerical Data

In [None]:
#fix skew for x
from scipy.stats import skew
# Missing values have not been imputed yet at this point, and skew() returns
# NaN for any column that contains a NaN. NaN never passes the threshold, so
# such columns were silently excluded. Measure skewness on the observed values.
skewed_feats = (
    data_all[get_num_features(data_all)]
    .apply(lambda x: skew(x.dropna()))
    .sort_values(ascending=False)
)
high_skew_col = skewed_feats[skewed_feats.abs() > 0.5].index

In [None]:
# Apply log(1 + x) to every highly skewed column in one vectorised step.
data_all[high_skew_col] = np.log1p(data_all[high_skew_col])

# 2.3 Missing Values

In [None]:
###Missing Values
missing_counts = pd.DataFrame(data_all.isnull().sum().sort_values(ascending=False))
plt.figure(figsize=(50,20))
sns.heatmap(data_all.isnull())
plt.show()
plt.figure(figsize=(20,10))
missing_columns = missing_counts[missing_counts.iloc[:,0]>0]
sns.barplot(x=missing_columns.index,y=missing_columns.iloc[:,0])
plt.xticks(rotation=90)
plt.show()

In [None]:
#delete features with more than 1000 missing values
MISSING_DROP_THRESHOLD = 1000
too_sparse = missing_counts[missing_counts.iloc[:,0] > MISSING_DROP_THRESHOLD].index
# SalePrice is missing for every test row by construction (it was set to NaN
# before the merge); it is the target, so never drop it.
drop_col = [col for col in too_sparse if col != 'SalePrice']
# errors='ignore' makes the cell safe to re-run after the columns are gone,
# without a bare except that would also hide genuine failures.
data_all = data_all.drop(columns=drop_col, errors='ignore')
# missing_columns should list only the features that still need imputing.
missing_columns = missing_columns.drop(index=['SalePrice'] + drop_col, errors='ignore')

In [None]:
print(data_all[missing_columns.index].info())
#Since some numerical features are ordinal ones
#it would be more reasonable to use median as the default values
#as for categorical ones, we opt for the mode category
missing_object = data_all[missing_columns.index].select_dtypes('object').columns
print('missing object',len(missing_object))
missing_num = data_all[missing_columns.index].select_dtypes(['int64','float64']).columns
print('missing num ',len(missing_num))

In [None]:
# Impute the remaining gaps column by column.
# Numeric columns are filled with their median, object columns with their most
# frequent value. Both statistics are computed on data_all, i.e. on the
# training and test rows together.
for i in missing_num:
    data_all[i] = data_all[i].fillna(data_all[i].median())
for j in missing_object:
    # mode() returns a Series of the most frequent value(s); [0] takes the first
    data_all[j] = data_all[j].fillna(data_all[j].mode()[0])
# Number of columns that still contain at least one null
print(data_all.isnull().any().sum())
# Expected output is 1: only SalePrice (set to NaN for the test rows) should still have nulls

# 2.4 Extreme Values

In [None]:
###Extreme Values
#here we only look at the important features discovered in EDA
print(data_all[important_features_CC].info())

In [None]:
# Plot every training column against SalePrice: a violin plot when the column
# has fewer than 20 distinct values, otherwise a scatter plot.
for col in train.columns:
    is_low_cardinality = len(train[col].unique()) < 20
    plot_fn = sns.violinplot if is_low_cardinality else sns.scatterplot
    plot_fn(x=train[col],y=train['SalePrice'])
    plt.show()

In [None]:
#drop only the two largest saleprice data
EXTREME_PRICE = 700000
# Select the rows from data_all itself rather than borrowing index labels from
# `train`, so the drop does not rely on the two frames sharing row labels.
# Test rows have NaN SalePrice and never satisfy the comparison.
extreme_ind = data_all[data_all['SalePrice'] > EXTREME_PRICE].index
# `index=` already names the axis; the old extra `axis=1` contradicted it.
data_all = data_all.drop(index=extreme_ind)

In [None]:
#Get rid of non-related features
# Numeric features whose absolute correlation with SalePrice is below 0.1.
# MoSold is kept on purpose; filter it out explicitly instead of wrapping
# list.remove() in a bare except.
low_correl_col_num = [col for col in Correlation[Correlation['Abs'] < 0.1].index
                      if col != 'MoSold']
low_correl_col_cat = ['Street','LotShape','Utilities','LotConfig','LandSlope','RoofStyle']
#Possible Interaction
other_drop_col = ['Id','Neighborhood','Sold_time','YrSold']
# A column can be both weakly correlated and explicitly listed (e.g. YrSold);
# dict.fromkeys removes such duplicates while preserving order.
drop_unrelated = list(dict.fromkeys(low_correl_col_num + low_correl_col_cat + other_drop_col))
data_all = data_all.drop(columns=drop_unrelated)

# 2.5 Scaling and Transformation

In [None]:
all_num = get_num_features(data_all)
all_cat = get_cat_features(data_all)

In [None]:
for i in all_num:
    plt.hist(data_all[i])
    plt.title(i)
    plt.show()
#ID not included
#normalize all year feature
#standardized all other numerical feature
#log y

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scale_col = ['YearRemodAdd','YearBuilt']
data_all[scale_col] = scaler.fit_transform(data_all[scale_col])

In [None]:
# Log-transform the target. Prices should be strictly positive, so a plain
# log (without a +1 shift) is sufficient; get_Answer() later applies np.exp
# to turn predictions back into prices.
data_all['SalePrice'] = np.log(data_all['SalePrice'])
# Re-plot the distribution of the transformed target
sns.displot(data_all['SalePrice'])
plt.show()

> #  3.Preprocessing for Model

In [None]:
#Processing and let ElasticNetRegression build benchmark
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
data_all_processed_x = pd.get_dummies(data_all)
data_all_processed_y = data_all['SalePrice']
y_train = data_all['SalePrice'].dropna()
X_train = data_all_processed_x[~data_all_processed_x['SalePrice'].isnull()].drop(columns='SalePrice')
X_test = data_all_processed_x[data_all_processed_x['SalePrice'].isnull()].drop(columns='SalePrice')
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                    test_size=0.2, random_state=20210503)
print('X_train',X_train.shape)
print('X_val',X_val.shape)
print('X_test',X_test.shape)

> # 4.Model Building

# 4.1 Try out ElasticNet

Here I use ElasticNet as a benchmark, since it is relatively easy to interpret and it shrinks the coefficients to counter collinearity. It is a nice starting point to check what you've done above and get a gist of how well you've done. If most of the models don't work out, it may mean that further feature engineering is needed.

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
# Keep the estimator under its own name. Rebinding `ElasticNet` to an instance
# shadows the class, so any later `ElasticNet(...)` call only works if the
# class happens to be imported again first.
enet_base = ElasticNet(random_state=0,max_iter=5000)
parameters = {'alpha':[0.001, 0.0001, 1e-05]}
# 5-fold grid search over alpha; the scorer is negated RMSE, so the best score
# is reported through abs() below.
Grid = GridSearchCV(enet_base, parameters, cv=5, scoring='neg_root_mean_squared_error')
results = Grid.fit(X_train,y_train)
print('Best Para:',results.best_params_)
print('Best Score:',abs(results.best_score_))
# RMSE of the refit best estimator on the held-out validation split
print('RMSE:',np.sqrt(mean_squared_error(y_val,results.best_estimator_.predict(X_val))))

In [None]:
#Send out Benchline without manually tuning and feature selection
prediction = results.best_estimator_.predict(X_test)

In [None]:
def get_Answer(prediction, start_id=1461):
    """Build the submission frame from log-scale predictions.

    Parameters
    ----------
    prediction : array-like
        Predicted log(SalePrice) values, one per test row, in test-set order.
    start_id : int, default 1461
        ``Id`` assigned to the first prediction; following rows get
        consecutive ids. The default keeps the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Id', 'SalePrice']`` where ``SalePrice`` is
        ``exp(prediction)``, undoing the log transform applied to the target.
    """
    sale_price = np.exp(np.asarray(prediction, dtype=float).ravel())
    ids = np.arange(start_id, start_id + len(sale_price))
    return pd.DataFrame({'Id': ids, 'SalePrice': sale_price})
Answer = get_Answer(prediction)

In [None]:
Answer.to_csv('Submit.csv',index=False)

Take a look at the contribution of the features: does it look similar to what you expected from the EDA? :)


In [None]:
def get_contribution(estimator):
    """Rank features by the absolute value of their fitted coefficient.

    Parameters
    ----------
    estimator : fitted search object
        Must expose ``best_estimator_.coef_`` with one coefficient per column
        of the module-level ``X_train``.

    Returns
    -------
    pandas.DataFrame
        One column ``'coef'`` (absolute coefficients), indexed by feature
        name and sorted in descending order.
    """
    abs_coef = np.abs(np.ravel(estimator.best_estimator_.coef_))
    # Attach the feature names BEFORE sorting. The previous version sorted
    # first and then overwrote the index with X_train.columns, which paired
    # every coefficient with the wrong feature name.
    Contribution = pd.DataFrame({'coef': abs_coef}, index=X_train.columns)
    return Contribution.sort_values(by='coef', ascending=False)
def visualize_contribution(df):
    """Show a wide bar chart of the ``coef`` column of ``df``, one bar per index label."""
    _, axis = plt.subplots(figsize=(40,20))
    axis.bar(x=df.index,height=df['coef'])
    axis.set_title('Feature Contribution')
    # Feature names are long, so rotate the tick labels to keep them readable
    plt.xticks(rotation=90,fontsize=22)
    plt.show()

In [None]:
Contribution = get_contribution(results)
visualize_contribution(Contribution[Contribution['coef']>0])
visualize_contribution(Contribution[Contribution['coef']>0.05])

* My benchmark is Ranking 38%, RMSE=0.13 (This is done with just basic data cleansing and no hyperparameter tuning)
* Observed that this RMSE is pretty close to (slightly better than) the fitting result

# Steps can be done when the score is not what you desire
* Feature Selection (Create new features)
* Different Preprocessing measure
* Try different Models
* Model Tuning
* Ensemble methods / Stacking

# 4.2 Different Models and scoring metrics

In [None]:
# List the scoring names accepted by cross_validate / GridSearchCV.
# `sklearn.metrics.SCORERS` has been removed from recent scikit-learn
# releases in favour of get_scorer_names(); fall back to the old dictionary
# only on installations that do not provide the new function.
try:
    from sklearn.metrics import get_scorer_names
    available_scorers = sorted(get_scorer_names())
except ImportError:
    from sklearn.metrics import SCORERS
    available_scorers = sorted(SCORERS.keys())
available_scorers

In [None]:
#1.Tryout Different Models
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso
# 'r2' is the plain coefficient of determination (not an adjusted R^2) and
# 'explained_variance' is a separate metric, so label each one as what it is.
scoring = {'R^2':'r2',
          'ExplainedVariance':'explained_variance',
          'RMSE':'neg_root_mean_squared_error'}
# NOTE: each class name is rebound to an instance, exactly as before. The
# stacking cell further down passes XGBRegressor, DecisionTreeRegressor,
# RandomForestRegressor and LGBMRegressor as ready-made estimator instances,
# so these bindings must be kept.
DecisionTreeRegressor = DecisionTreeRegressor(random_state=0)
LGBMRegressor = LGBMRegressor(random_state=0)
XGBRegressor = XGBRegressor()
SVR = SVR()
RandomForestRegressor = RandomForestRegressor()
Ridge = Ridge(random_state=0)
Lasso = Lasso(random_state=0)
candidates = [
    ('DecisionTree', DecisionTreeRegressor),
    ('LGBMRegressor', LGBMRegressor),
    ('XGBRegressor', XGBRegressor),
    ('SVR', SVR),
    # was printed under the label 'SVR', making its scores look like a second SVR run
    ('RandomForest', RandomForestRegressor),
    ('Ridge', Ridge),
    ('Lasso', Lasso),
]
# 5-fold cross-validation of every candidate with the same scorers
for label, model in candidates:
    print(label + ' \n',cross_validate(model, X_train,y_train, scoring=scoring, cv=5, return_train_score=True))
#lgbm seems to have a better performance

In [None]:
prediction = results.best_estimator_.predict(X_test)
Answer =  get_Answer(prediction)

In [None]:
Answer.to_csv('Submit.csv',index=False)

In [None]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso, ElasticNet
lgb_model = lgb.LGBMRegressor(colsample_bytree=0.25, learning_rate=0.01,
                              max_depth=13, min_child_samples=7, n_estimators=10000,
                              num_leaves=20, objective='regression', random_state=42)
xgb_model = xgb.XGBRegressor(colsample_bytree=0.25, gamma=0.0, learning_rate=0.01, max_depth=3,
                             n_estimators=15000, n_jobs=-1, random_state=42, 
                             reg_alpha=0.25, reg_lambda=0.4, subsample=1.0)
gbr_model = GradientBoostingRegressor(alpha=0.9,
                                      learning_rate=0.01, loss='huber',
                                      max_depth=13, max_features=0.1, min_samples_split=110,
                                      n_estimators=10000, n_iter_no_change=100, random_state=42)
svr_model = SVR(C=0.75, coef0=0.0001, degree=2, epsilon=0.0001, gamma=0.005, max_iter=10000)
lasso_model = Lasso(alpha=0.0001, max_iter=5000, random_state=42)
ridge_model = Ridge(alpha=2.5, max_iter=5000, random_state=42)
enet_model = ElasticNet(alpha=0.0002, l1_ratio=0.65, max_iter=5000, random_state=42)
models = [lgb_model,xgb_model,gbr_model,svr_model,lasso_model,ridge_model,enet_model]

In [None]:
# Fit every tuned model on the training split and report its validation RMSE.
for model in models:
    model.fit(X_train,y_train)
    val_rmse = np.sqrt(mean_squared_error(y_val,model.predict(X_val)))
    # Label by class name: the full estimator repr was long and was glued to
    # 'RMSE:' without any separator.
    print(type(model).__name__,'RMSE:',val_rmse)

# 4.3 Ensemble Methods and Submission

In [None]:
#Simple Ensemble with the top 3 algorithm
# Refit each model on the training and validation rows combined.
X_full = pd.concat([X_train,X_val])
y_full = pd.concat([y_train,y_val])
for model in (xgb_model, lgb_model, gbr_model):
    model.fit(X_full,y_full)
# Weighted blend of the three log-scale predictions: 0.4 / 0.3 / 0.3
xgb_part = xgb_model.predict(X_test) * 0.4
lgb_part = lgb_model.predict(X_test) * 0.3
gbr_part = gbr_model.predict(X_test) * 0.3
prediction = xgb_part + lgb_part + gbr_part
get_Answer(prediction).to_csv('0511_0237.csv',index=False)

This result ends up scoring Top15% with 0.12301 RMSE

Here is some sample code for stacking. Since this is a practice exercise, I opted for a simple average-style ensemble method.

In [None]:
#Stacking
# Fit a stacked regressor: five base models whose predictions feed a final
# LightGBM estimator, then report its RMSE on the validation split.
from sklearn.ensemble import StackingRegressor
# ElasticNet and SVR are instantiated here with default settings. The names
# XGBRegressor, DecisionTreeRegressor and RandomForestRegressor are NOT called:
# the model-comparison cell rebound each of them to an estimator instance, and
# those instances are what is passed in. Run that cell first.
base_models = [('Elastic',ElasticNet()),
             ('SVR',SVR()),
             ('XGB',XGBRegressor),
             ('DecisionTree',DecisionTreeRegressor),
              ('RandomForest',RandomForestRegressor)]

# LGBMRegressor is likewise the instance created in the model-comparison cell
Stacking = StackingRegressor(
     estimators=base_models,
     final_estimator=LGBMRegressor)

Stacking.fit(X_train,y_train)
# Validation RMSE on the log-price scale (the target was log-transformed)
print('RMSE:',np.sqrt(mean_squared_error(y_val,Stacking.predict(X_val))))