In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [None]:
# Load the Kaggle train/test splits and stack them into one frame so that the
# cleaning and feature-engineering steps below are applied to both at once.
# Rows are split back apart later on whether SalePrice is missing.
train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
# DataFrame.append was removed in pandas 2.0; concat with ignore_index=True is
# the equivalent of append(...).reset_index(drop=True).
df = pd.concat([train, test], ignore_index=True)
df.head()

**EDA**

In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of a DataFrame.

    Prints, in order: shape, dtypes, the first and last ``head`` rows, the
    per-column null counts and a table of selected quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    head : int, default 5
        Number of rows shown for both the head and the tail.

    Returns
    -------
    None
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Quantiles are only defined for numeric data; without numeric_only=True
    # pandas >= 2.0 raises as soon as the frame holds an object column.
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

In [None]:
check_df(df)

In [None]:
df.shape

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical / numerical groups.

    A column is treated as:

    * categorical        - object dtype, or non-object with fewer than
                           ``cat_th`` distinct values;
    * cardinal           - object dtype with more than ``car_th`` distinct
                           values (removed from the categorical list);
    * numerical          - non-object dtype that is not "numeric but
                           categorical".

    A short summary of the group sizes is printed.

    Returns
    -------
    tuple[list, list, list]
        ``(cat_cols, num_cols, cat_but_car)``.
    """
    all_columns = list(dataframe.columns)
    is_object = {name: dataframe[name].dtypes == "O" for name in all_columns}
    distinct = {name: dataframe[name].nunique() for name in all_columns}

    # Low-cardinality numeric columns behave like categories.
    num_but_cat = [name for name in all_columns
                   if distinct[name] < cat_th and not is_object[name]]
    # High-cardinality object columns carry too many labels to encode.
    cat_but_car = [name for name in all_columns
                   if distinct[name] > car_th and is_object[name]]

    object_cols = [name for name in all_columns if is_object[name]]
    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]

    num_cols = [name for name in all_columns
                if not is_object[name] and name not in num_but_cat]

    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_cols}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car


In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

**Categorical Features Analysis**

In [None]:

def cat_summary(dataframe, col_name, plot=False):
    """Print class counts and percentage shares for one column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    col_name : str
        Column whose value distribution is printed.
    plot : bool, default False
        When True, also draw a seaborn count plot of the column.
    """
    counts = dataframe[col_name].value_counts()
    shares = 100 * counts / len(dataframe)
    summary = pd.DataFrame({col_name: counts, "Ratio": shares})
    print(summary)
    print("##########################################")
    if not plot:
        return
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show()

In [None]:
for col in cat_cols:
    cat_summary(df, col)

In [None]:
for col in cat_but_car:
    cat_summary(df, col)

**Numerical Features Analysis**

In [None]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics (with a dense percentile grid) for a column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    numerical_col : str
        Column to describe.
    plot : bool, default False
        When True, also show a 20-bin histogram of the column.
    """
    percentiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    stats = dataframe[numerical_col].describe(percentiles).T
    print(stats)

    if not plot:
        return
    dataframe[numerical_col].hist(bins=20)
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show()


In [None]:
df[num_cols].describe().T

for col in num_cols:
    num_summary(df, col, plot=True)

**Missing Values Analysis**

In [None]:
def missing_values_table(dataframe, na_name=False):
    """Print the count and percentage of missing values per affected column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    na_name : bool, default False
        When True, return the list of columns that contain missing values.

    Returns
    -------
    list or None
        Column names with at least one null (in frame order) if ``na_name``
        is True, otherwise None.
    """
    na_columns = [name for name in dataframe.columns if dataframe[name].isnull().any()]
    null_mask = dataframe[na_columns].isnull()

    n_miss = null_mask.sum().sort_values(ascending=False)
    ratio = (null_mask.sum() / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")
    return na_columns if na_name else None


def missing_vs_target(dataframe, target, na_columns):
    """Print the target mean and count split by "value missing / not missing".

    For every column in ``na_columns`` a 0/1 ``<col>_NA_FLAG`` indicator is
    added to a copy of the frame (the input is not modified); the target's
    mean and non-null count are then printed for each flag value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    target : str
        Name of the target column.
    na_columns : iterable of str
        Columns for which a missing-value flag is built.
    """
    flagged = dataframe.copy()
    for name in na_columns:
        flagged[name + '_NA_FLAG'] = np.where(flagged[name].isnull(), 1, 0)

    # Any column whose name contains "_NA_" is reported, not only the new ones.
    flag_names = flagged.loc[:, flagged.columns.str.contains("_NA_")].columns
    for flag in flag_names:
        target_by_flag = flagged.groupby(flag)[target]
        report = pd.DataFrame({"TARGET_MEAN": target_by_flag.mean(),
                               "Count": target_by_flag.count()})
        print(report, end="\n\n\n")


In [None]:
missing_vs_target(df, "SalePrice", missing_values_table(df, na_name=True))
missing_values_table(df)

In [None]:
# Columns where a missing value is filled with the label "None".
none_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
             'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']
# Numeric columns where a missing value is filled with 0.
zero_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
             'GarageArea', 'GarageCars', 'MasVnrArea']
# Columns where a missing value is filled with the most frequent label.
freq_cols = ['Exterior1st', 'Exterior2nd', 'KitchenQual', 'Electrical']

# Assign the filled columns back instead of calling replace(..., inplace=True)
# on df[col]: that mutates a temporary Series and is a no-op under pandas
# Copy-on-Write (and a FutureWarning on pandas 2.x).
df[zero_cols] = df[zero_cols].fillna(0)
df[none_cols] = df[none_cols].fillna("None")
for col in freq_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Features whose missing values are replaced by the label "None".
for absent_feature in ("Alley", "PoolQC", "MiscFeature", "Fence", "FireplaceQu"):
    df[absent_feature] = df[absent_feature].fillna("None")

# LotFrontage: fill gaps with the median frontage of the same neighbourhood.
frontage_by_nhood = df.groupby("Neighborhood")["LotFrontage"]
df["LotFrontage"] = frontage_by_nhood.transform(lambda x: x.fillna(x.median()))


df["GarageCars"] = df["GarageCars"].fillna(0)

In [None]:
df.drop(['GarageArea'], axis=1, inplace=True)
df.drop(['GarageYrBlt'], axis=1, inplace=True)


In [None]:
df.drop(['Utilities'], axis=1, inplace=True)

In [None]:
# MSZoning: fill gaps with the most common zoning of the same MSSubClass.
# transform() returns a result aligned to df's own index; groupby.apply()
# returns a group-keyed index on pandas >= 2.0, which breaks the assignment.
# A subclass whose zoning is entirely missing falls back to the global mode
# instead of failing on an empty mode().
mszoning_mode = df['MSZoning'].mode()[0]
df['MSZoning'] = df.groupby('MSSubClass')['MSZoning'].transform(
    lambda x: x.fillna(x.mode()[0] if x.notna().any() else mszoning_mode))

# Missing Functional is filled with "Typ".
df["Functional"] = df["Functional"].fillna("Typ")

df['SaleType'] = df['SaleType'].fillna(df['SaleType'].mode()[0])

# Cast the sale year to string so it is handled as an object (categorical) column.
df['YrSold'] = df['YrSold'].astype(str)

**Target Analysis**

In [None]:
df["SalePrice"].describe([0.05, 0.10, 0.25, 0.50, 0.75, 0.80, 0.90, 0.95, 0.99])

def target_correlation_matrix(dataframe, corr_th=0.5, target="SalePrice"):
    """Plot a clustered heat-map of the features most correlated with the target.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    corr_th : float, default 0.5
        Minimum absolute Pearson correlation with ``target`` for a column to
        be kept.
    target : str, default "SalePrice"
        Column the correlations are measured against.

    Returns
    -------
    list or None
        Names of the selected columns (``target`` included), or None when the
        selection or the plot fails, in which case a hint to lower
        ``corr_th`` is printed.
    """
    # Correlation is only defined for numeric data; pandas >= 2.0 raises on
    # object columns instead of silently dropping them.
    corr = dataframe.select_dtypes(include=["number", "bool"]).corr()

    try:
        is_strong = np.abs(corr[target]) > corr_th
        corr_features = corr.columns[is_strong].tolist()
        sns.clustermap(dataframe[corr_features].corr(), annot=True, fmt=".2f")
        plt.show()
        return corr_features
    except Exception:
        # Best-effort: typically too few columns pass the threshold to cluster.
        # `Exception` (not a bare except) keeps KeyboardInterrupt working.
        print("Yüksek threshold değeri, corr_th değerinizi düşürün!")

target_correlation_matrix(df, corr_th=0.5, target="SalePrice")


In [None]:
def rare_analyser(dataframe, target, cat_cols):
    """Print, per categorical column, class counts, shares and target means.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    target : str
        Column averaged within each class.
    cat_cols : iterable of str
        Columns to report on.
    """
    n_rows = len(dataframe)
    for name in cat_cols:
        counts = dataframe[name].value_counts()
        print(name, ":", len(counts))
        report = pd.DataFrame({"COUNT": counts,
                               "RATIO": counts / n_rows,
                               "TARGET_MEAN": dataframe.groupby(name)[target].mean()})
        print(report, end="\n\n\n")


**Data Preprocessing & Feature Engineering**

In [None]:
df.groupby("Neighborhood").agg({"SalePrice": "mean"}).sort_values(by="SalePrice", ascending=False)  

In [None]:
# Ordinal encoding of Neighborhood into 10 buckets.
# NOTE(review): the bucket order presumably follows the mean-SalePrice ranking
# printed by the previous cell - confirm.
# A neighbourhood absent from the map becomes NaN and makes astype('int') raise.
nhood_map = {'MeadowV': 1, 'IDOTRR': 1, 'BrDale': 1,'BrkSide': 2, 'Edwards': 2, 'OldTown': 2,'Sawyer': 3, 'Blueste': 3,'SWISU': 4, 'NPkVill': 4, 'NAmes': 4, 'Mitchel': 4,'SawyerW': 5, 'NWAmes': 5,'Gilbert': 6, 'Blmngtn': 6, 'CollgCr': 6,'Crawfor': 7, 'ClearCr': 7,'Somerst': 8, 'Veenker': 8, 'Timber': 8,'StoneBr': 9, 'NridgHt': 9,'NoRidge': 10}

df['Neighborhood'] = df['Neighborhood'].map(nhood_map).astype('int')

In [None]:
# Turn the numeric codes of MSSubClass and MoSold into string labels so both
# columns become object dtype (and are picked up as categorical later on).
df= df.replace({"MSSubClass": {20: "SC20", 30: "SC30", 40: "SC40", 45: "SC45", \
50: "SC50", 60: "SC60", 70: "SC70", 75: "SC75", \
80: "SC80", 85: "SC85", 90: "SC90", 120: "SC120", \
150: "SC150", 160: "SC160", 180: "SC180", 190: "SC190"},
"MoSold": {1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun", \
7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"}})

In [None]:
# Ordinal encoding of Functional: 0 ("Sal") up to 7 ("Typ").
func = {"Sal": 0, "Sev": 1, "Maj2": 2, "Maj1": 3, "Mod": 4, "Min2": 5, "Min1": 6, "Typ": 7}
df["Functional"] = df["Functional"].map(func).astype("int")
# Mean SalePrice per encoded level (displayed as the cell output).
df.groupby("Functional").agg({"SalePrice": "mean"})

In [None]:
# MSZoning: collapse the zoning labels into three integer groups
# (1: "C (all)", 2: "RM"/"RH", 3: "RL"/"FV").
# The values are written in place, so the column keeps its object dtype;
# any label not listed here is left unchanged.
df.loc[(df["MSZoning"] == "C (all)"), "MSZoning"] = 1
df.loc[(df["MSZoning"] == "RM"), "MSZoning"] = 2
df.loc[(df["MSZoning"] == "RH"), "MSZoning"] = 2
df.loc[(df["MSZoning"] == "RL"), "MSZoning"] = 3
df.loc[(df["MSZoning"] == "FV"), "MSZoning"] = 3

In [None]:
# LotShape: ordinal encoding.
# The groupby result on the next line is not the last expression of the cell,
# so it is computed but never displayed.
# NOTE(review): the IR3 < IR2 order presumably follows mean SalePrice - confirm.
df.groupby("LotShape").agg({"SalePrice": "mean"}).sort_values(by="SalePrice", ascending=False)
shape_map = {"Reg": 1, "IR1": 2, "IR3": 3, "IR2": 4}
df['LotShape'] = df['LotShape'].map(shape_map).astype('int')

In [None]:
# LandContour: ordinal encoding (Bnk=1, Lvl=2, Low=3, HLS=4).
# The groupby result on the next line is computed but never displayed
# (it is not the last expression of the cell).
df.groupby("LandContour").agg({"SalePrice": "mean"}).sort_values(by="SalePrice", ascending=False)
contour_map = {"Bnk": 1, "Lvl": 2, "Low": 3, "HLS": 4}
df['LandContour'] = df['LandContour'].map(contour_map).astype('int')

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)
rare_analyser(df, "SalePrice", cat_cols)

In [None]:
# LotConfig: collapse into two integer groups
# (1: Inside / FR2 / Corner, 2: FR3 / CulDSac).
# Values are written in place, so the column keeps its object dtype.
df.loc[(df["LotConfig"] == "Inside"), "LotConfig"] = 1
df.loc[(df["LotConfig"] == "FR2"), "LotConfig"] = 1
df.loc[(df["LotConfig"] == "Corner"), "LotConfig"] = 1
df.loc[(df["LotConfig"] == "FR3"), "LotConfig"] = 2
df.loc[(df["LotConfig"] == "CulDSac"), "LotConfig"] = 2

In [None]:
# Condition1: collapse the nine labels into three ordinal groups.
# A label absent from the map becomes NaN and makes astype('int') raise.
cond1_map = {"Artery": 1, "RRAe": 1, "Feedr": 1,"Norm": 2, "RRAn": 2, "RRNe": 2,"PosN": 3, "RRNn": 3, "PosA": 3}
df['Condition1'] = df['Condition1'].map(cond1_map).astype('int')


In [None]:
# BldgType: collapse into two integer groups
# (1: 2fmCon / Duplex / Twnhs, 2: 1Fam / TwnhsE).
df.loc[(df["BldgType"] == "2fmCon"), "BldgType"] = 1
df.loc[(df["BldgType"] == "Duplex"), "BldgType"] = 1
df.loc[(df["BldgType"] == "Twnhs"), "BldgType"] = 1
df.loc[(df["BldgType"] == "1Fam"), "BldgType"] = 2
df.loc[(df["BldgType"] == "TwnhsE"), "BldgType"] = 2

# RoofStyle: ordinal encoding 1..6.
# The groupby below is computed but never displayed (not the last expression).
df.groupby("RoofStyle").agg({"SalePrice": "mean"}).sort_values(by="SalePrice", ascending=False)
df.loc[(df["RoofStyle"] == "Gambrel"), "RoofStyle"] = 1
# Fixed typo: the label is "Gable" (was "Gablee"), so it was never encoded and
# stayed in the column as a string next to the integer codes.
df.loc[(df["RoofStyle"] == "Gable"), "RoofStyle"] = 2
df.loc[(df["RoofStyle"] == "Mansard"), "RoofStyle"] = 3
df.loc[(df["RoofStyle"] == "Flat"), "RoofStyle"] = 4
df.loc[(df["RoofStyle"] == "Hip"), "RoofStyle"] = 5
df.loc[(df["RoofStyle"] == "Shed"), "RoofStyle"] = 6

# RoofMatl: collapse the eight materials into five ordinal groups.
df.groupby("RoofMatl").agg({"SalePrice": "mean"}).sort_values(by="SalePrice", ascending=False)
df.loc[(df["RoofMatl"] == "Roll"), "RoofMatl"] = 1
df.loc[(df["RoofMatl"] == "ClyTile"), "RoofMatl"] = 2
df.loc[(df["RoofMatl"] == "CompShg"), "RoofMatl"] = 3
df.loc[(df["RoofMatl"] == "Metal"), "RoofMatl"] = 3
df.loc[(df["RoofMatl"] == "Tar&Grv"), "RoofMatl"] = 3
df.loc[(df["RoofMatl"] == "WdShake"), "RoofMatl"] = 4
df.loc[(df["RoofMatl"] == "Membran"), "RoofMatl"] = 4
df.loc[(df["RoofMatl"] == "WdShngl"), "RoofMatl"] = 5

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)
rare_analyser(df, "SalePrice", cat_cols)

In [None]:
# Mean SalePrice per ExterQual level (computed, not displayed).
df.groupby("ExterQual").agg({"SalePrice": "mean"}).sort_values(by="SalePrice", ascending=False)

# ExterQual / ExterCond share the same Po..Ex -> 1..5 scale.
ext_map = dict(Po=1, Fa=2, TA=3, Gd=4, Ex=5)
for exterior_col in ('ExterQual', 'ExterCond'):
    df[exterior_col] = df[exterior_col].map(ext_map).astype('int')

# BsmtQual / BsmtCond use the same scale plus 'None' -> 0.
bsm_map = {'None': 0, **ext_map}
for basement_col in ('BsmtQual', 'BsmtCond'):
    df[basement_col] = df[basement_col].map(bsm_map).astype('int')

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)
rare_analyser(df, "SalePrice", cat_cols)

In [None]:
# Ordinal encodings for the remaining quality / condition style features.
# Each map is followed by astype('int'), which raises if a label is missing
# from its map (map() yields NaN for it).

# BsmtFinType1
bsm_map = {'None': 0, 'Rec': 1, 'BLQ': 1, 'LwQ': 2, 'ALQ': 3, 'Unf': 3, 'GLQ': 4}
df['BsmtFinType1'] = df['BsmtFinType1'].map(bsm_map).astype('int')

# BsmtFinType2
bsm_map = {'None': 0, 'BLQ': 1, 'Rec': 2, 'LwQ': 2, 'Unf': 3, 'GLQ': 3, 'ALQ': 4}
df['BsmtFinType2'] = df['BsmtFinType2'].map(bsm_map).astype('int')

# BsmtExposure
bsm_map = {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
df['BsmtExposure'] =df['BsmtExposure'].map(bsm_map).astype('int')

# Heating
heat_map = {'Floor': 1, 'Grav': 1, 'Wall': 2, 'OthW': 3, 'GasW': 4, 'GasA': 5}
df['Heating'] = df['Heating'].map(heat_map).astype('int')

# HeatingQC
heat_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
df['HeatingQC'] = df['HeatingQC'].map(heat_map).astype('int')

# KitchenQual
kitch_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
# Use the kitchen map defined above. The original passed heat_map, which only
# worked because both dictionaries happen to hold the same values.
df['KitchenQual'] = df['KitchenQual'].map(kitch_map).astype('int')

# FireplaceQu
fire_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
df['FireplaceQu'] = df['FireplaceQu'].map(fire_map).astype('int')

# GarageCond ('None' shares level 1 with 'Po')
garage_map = {'None': 1, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
df['GarageCond'] = df['GarageCond'].map(garage_map).astype('int')

# GarageQual
# NOTE(review): 'Ex' (4) is ranked below 'Gd' (5) here - presumably based on
# observed mean SalePrice; confirm it is intentional.
garage_map = {'None': 1, 'Po': 1, 'Fa': 2, 'TA': 3, 'Ex': 4, 'Gd': 5}
df['GarageQual'] = df['GarageQual'].map(garage_map).astype('int')

# PavedDrive
paved_map = {'N': 1, 'P': 2, 'Y': 3}
df['PavedDrive'] = df['PavedDrive'].map(paved_map).astype('int')

# CentralAir (binary)
cent = {"N": 0, "Y": 1}
df["CentralAir"] = df["CentralAir"].map(cent).astype("int")
# Computed but not displayed (not the last expression of the cell).
df.groupby("CentralAir").agg({"SalePrice": "mean"})

# LandSlope: gentle (1) versus moderate / severe (2)
df.loc[df["LandSlope"] == "Gtl", "LandSlope"] = 1
df.loc[df["LandSlope"] == "Sev", "LandSlope"] = 2
df.loc[df["LandSlope"] == "Mod", "LandSlope"] = 2
df["LandSlope"] = df["LandSlope"].astype("int")

# OverallQual: merge levels 1-3 into 1 and shift the rest down (4->2 ... 10->8).
# The statements must run in ascending order: every target value is lower
# than its source, so an already remapped row is never matched a second time.
df.loc[df["OverallQual"] == 1, "OverallQual"] = 1
df.loc[df["OverallQual"] == 2, "OverallQual"] = 1
df.loc[df["OverallQual"] == 3, "OverallQual"] = 1
df.loc[df["OverallQual"] == 4, "OverallQual"] = 2
df.loc[df["OverallQual"] == 5, "OverallQual"] = 3
df.loc[df["OverallQual"] == 6, "OverallQual"] = 4
df.loc[df["OverallQual"] == 7, "OverallQual"] = 5
df.loc[df["OverallQual"] == 8, "OverallQual"] = 6
df.loc[df["OverallQual"] == 9, "OverallQual"] = 7
df.loc[df["OverallQual"] == 10, "OverallQual"] = 8

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)
rare_analyser(df, "SalePrice", cat_cols)

In [None]:
# Interaction and indicator features built from the encoded columns.
df["NEW"] = df["GarageCars"] * df["OverallQual"]
df["NEW3"] = df["TotalBsmtSF"] * df["1stFlrSF"]
df["NEW4"] = df["TotRmsAbvGrd"] * df["GrLivArea"]
df["NEW5"] = df["FullBath"] * df["GrLivArea"]
df["NEW6"] = df["YearBuilt"] * df["YearRemodAdd"]
df["NEW7"] = df["OverallQual"] * df["YearBuilt"]
# RoofMatl was recoded in place and is still object dtype; cast it so the
# product is numeric rather than an object column.
df["NEW8"] = df["OverallQual"] * df["RoofMatl"].astype(int)
# PoolQC is still a string column here, so multiplying it by an integer would
# repeat the string ("Ex" * 5 -> "ExExExExEx"). Encode it on the Po..Ex scale
# used for the other quality features first; "None"/unknown labels count as 0.
pool_quality = df["PoolQC"].map({'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}).fillna(0)
df["NEW9"] = pool_quality * df["OverallCond"]
df["NEW10"] = df["OverallCond"] * df["MasVnrArea"]
df["NEW11"]  = df["LotArea"] * df["GrLivArea"]
# NOTE(review): NEW12 duplicates NEW5 and NEW14 duplicates NEW3. Both are kept
# so the resulting column set is unchanged - consider dropping them.
df["NEW12"] = df["FullBath"] * df["GrLivArea"]
df["NEW13"] = df["FullBath"] * df["TotRmsAbvGrd"]
df["NEW14"] = df["1stFlrSF"] *df["TotalBsmtSF"]
df["New_Home_Quality"] =  df["OverallCond"] / df["OverallQual"]
# 0/1 indicators, vectorised instead of a row-wise apply(lambda ...).
df['POOL'] = (df['PoolArea'] > 0).astype(int)
df['HAS2NDFLOOR'] = (df['2ndFlrSF'] > 0).astype(int)
df["LUXURY"] = df["1stFlrSF"] + df["2ndFlrSF"]
df["New_TotalBsmtSFRate"] = df["TotalBsmtSF"] / df["LotArea"]
df['TotalPorchArea'] = df['WoodDeckSF'] + df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
df['IsNew'] = (df['YearBuilt'] > 2000).astype(int)
df['IsOld'] = (df['YearBuilt'] < 1946).astype(int)


In [None]:
def rare_encoder(dataframe, rare_perc, cat_cols):
    """Merge infrequent classes of categorical columns into a 'Rare' label.

    A column is processed only if it is listed in ``cat_cols`` and has more
    than one class whose relative frequency is below ``rare_perc``; a single
    rare class would merely be renamed, so it is left alone.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    rare_perc : float
        Relative-frequency threshold (e.g. 0.01 for 1 %).
    cat_cols : iterable of str
        Categorical columns that may be rare-encoded.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with the rare classes replaced by ``'Rare'``.
    """
    temp_df = dataframe.copy()
    n_rows = len(temp_df)

    # Restrict to cat_cols: scanning every column would also hit continuous
    # features, whose individual values are almost all "rare".
    rare_columns = [col for col in cat_cols
                    if (temp_df[col].value_counts() / n_rows < rare_perc).sum() > 1]

    for col in rare_columns:
        freq = temp_df[col].value_counts() / n_rows
        rare_labels = freq[freq < rare_perc].index
        # Write into the copy that is returned (the original wrote into the
        # input and returned the untouched copy).
        temp_df[col] = np.where(temp_df[col].isin(rare_labels), 'Rare', temp_df[col])

    return temp_df

In [None]:
rare_analyser(df, "SalePrice", cat_cols)

df = rare_encoder(df, 0.01, cat_cols)

rare_analyser(df, "SalePrice", cat_cols)

In [None]:
# Categorical columns that carry (almost) no information: a single unique
# value, or two classes where at least one covers <= 2 % of the rows.
useless_cols = [col for col in cat_cols if df[col].nunique() == 1 or
                (df[col].nunique() == 2 and (df[col].value_counts() / len(df) <= 0.02).any(axis=None))]


useless_cols

In [None]:
cat_cols = [col for col in cat_cols if col not in useless_cols]
df.shape

In [None]:
for col in useless_cols:
    df.drop(col, axis=1, inplace=True)
df.shape

In [None]:
rare_analyser(df, "SalePrice", cat_cols)

**Label Encoding & One-Hot Encoding**

In [None]:
def label_encoder(dataframe, binary_col):
    """Replace the labels of ``binary_col`` with integer codes, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; the encoded column overwrites the original one.
    binary_col : str
        Column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, for chaining.
    """
    # LabelEncoder is not imported at the top of the notebook, so the original
    # raised NameError on first use; import it locally to stay self-contained.
    from sklearn.preprocessing import LabelEncoder

    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe



def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return a copy of ``dataframe`` with ``categorical_cols`` one-hot encoded.

    Thin wrapper around ``pandas.get_dummies``; with ``drop_first=True`` the
    first level of every encoded column is dropped.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)

In [None]:
df = one_hot_encoder(df, cat_cols, drop_first=True)
df.shape

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
rare_analyser(df, "SalePrice", cat_cols)

In [None]:
# Columns among the refreshed cat_cols in which at least one class covers
# <= 1 % of the rows; they are dropped in the next cell.
useless_cols_new = [col for col in cat_cols if (df[col].value_counts() / len(df) <= 0.01).any(axis=None)]
useless_cols_new

In [None]:
for col in useless_cols_new:
    df.drop(col, axis=1, inplace=True)
df.shape

In [None]:
# Report what is still missing after encoding.
missing_values_table(df)

# Not the last expression of the cell, so this shape is never displayed.
test.shape

# `train` is the raw frame read from train.csv; it was never cleaned.
missing_values_table(train)


# Every column with gaps except SalePrice (skipped via the substring test on the column name).
na_cols = [col for col in df.columns if df[col].isnull().sum() > 0 and "SalePrice" not in col]

# Median imputation, column by column.
# NOTE(review): assumes every remaining NA column is numeric - confirm.
df[na_cols] = df[na_cols].apply(lambda x: x.fillna(x.median()), axis=0)

**Check Outliers**

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.10, q3=0.90):
    """Return ``(low_limit, up_limit)`` outlier fences for one column.

    The fences are ``Q_low - 1.5 * range`` and ``Q_high + 1.5 * range``,
    where ``Q_low`` / ``Q_high`` are the ``q1`` / ``q3`` quantiles of the
    column and ``range`` is their difference.
    """
    lower_q, upper_q = (dataframe[col_name].quantile(level) for level in (q1, q3))
    spread = upper_q - lower_q
    margin = 1.5 * spread
    return lower_q - margin, upper_q + margin

def replace_with_thresholds(dataframe, variable):
    """Cap the values of ``variable`` at its outlier fences, in place.

    Values below the lower fence are raised to it and values above the upper
    fence are lowered to it; missing values are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify.
    variable : str
        Column to cap.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # clip() upcasts an integer column when a float fence has to be written;
    # assigning a float through .loc into an int column is deprecated in
    # recent pandas (incompatible-dtype setitem).
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)


def check_outlier(dataframe, col_name):
    """Return True if ``col_name`` has at least one value outside its fences.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    col_name : str
        Column to test.

    Returns
    -------
    bool
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    # Test the mask itself. The original reduced the *contents* of the
    # matching rows with any(), which depends on the other columns and is
    # False when an outlier row only holds falsy values.
    return bool(((values > up_limit) | (values < low_limit)).any())

In [None]:
# Refresh the column groups after encoding, report which numeric columns have
# outliers, then cap every numeric column at its fences.
# NOTE(review): num_cols presumably still contains "Id" and the target
# "SalePrice", so the target is capped as well - confirm this is intended.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

for col in num_cols:
    print(col, check_outlier(df, col))

for col in num_cols:
    replace_with_thresholds(df, col)

In [None]:
for col in num_cols:
    print(col, check_outlier(df, col))

**MODELING**

In [None]:
# Split the combined frame back apart: rows with a SalePrice go to train_df,
# rows without one go to test_df.
train_df = df[df['SalePrice'].notnull()]
test_df = df[df['SalePrice'].isnull()].drop("SalePrice", axis=1)


# The model is fitted on log1p(SalePrice), so the RMSE values below are on the log scale.
y = np.log1p(train_df['SalePrice'])
# "Id" is dropped from the feature matrix.
X = train_df.drop(["Id", "SalePrice"], axis=1)

**Base Models**

In [None]:
# Baseline regressors, each as a (label, estimator) pair with default
# hyper-parameters.
# NOTE(review): no random_state is set on any of them, so scores of the
# stochastic models may vary between runs.
models = [('LR', LinearRegression()),
          ("Ridge", Ridge()),
          ("Lasso", Lasso()),
          ("ElasticNet", ElasticNet()),
          ('KNN', KNeighborsRegressor()),
          ('CART', DecisionTreeRegressor()),
          ('RF', RandomForestRegressor()),
          ('SVR', SVR()),
          ('GBM', GradientBoostingRegressor()),
          ("XGBoost", XGBRegressor(objective='reg:squarederror')),
          ("LightGBM", LGBMRegressor()),
          ("CatBoost", CatBoostRegressor(verbose=False))]

In [None]:
# 5-fold CV for every baseline: the scorer returns negative MSE per fold, so
# negate it, take the square root per fold and average the fold RMSEs.
for name, regressor in models:
    rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=5, scoring="neg_mean_squared_error")))
    print(f"RMSE: {round(rmse, 4)} ({name}) ")

**Hyperparameter Optimization**

In [None]:
lgbm_model = LGBMRegressor(random_state=46)

# Error before hyper-parameter tuning (10-fold CV RMSE). The value is stored
# in `rmse` but not displayed by this cell.
rmse = np.mean(np.sqrt(-cross_val_score(lgbm_model,
                                        X, y, cv=10, scoring="neg_mean_squared_error")))


In [None]:
# LightGBM search grid: 5 x 5 x 5 = 125 parameter combinations.
lgbm_params = {"learning_rate": [0.01, 0.1, 0.03, 0.2, 0.5],
               "n_estimators": [100, 200, 250, 500, 1500],
               "colsample_bytree": [0.3,0.4, 0.5, 0.7, 1]}

In [None]:
# Exhaustive 3-fold grid search over lgbm_params on all available cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked with the
# estimator's default score rather than the RMSE reported elsewhere -
# consider scoring="neg_mean_squared_error" for consistency.
lgbm_gs_best = GridSearchCV(lgbm_model,
                            lgbm_params,
                            cv=3,
                            n_jobs=-1,
                            verbose=True).fit(X, y)

In [None]:
# Refit LightGBM on all training rows with the best grid-search parameters,
# then re-estimate its 10-fold CV RMSE.
final_model_lgbm = lgbm_model.set_params(**lgbm_gs_best.best_params_).fit(X, y)
rmse = np.mean(np.sqrt(-cross_val_score(final_model_lgbm, X, y, cv=10, scoring="neg_mean_squared_error")))

In [None]:
rmse

In [None]:
# CatBoost: baseline model, search grid and untuned 5-fold CV RMSE.

catboost_model = CatBoostRegressor(random_state = 46)

# 4 x 4 x 2 = 32 parameter combinations.
catboost_params = {"iterations": [200, 250, 300, 500],
                   "learning_rate": [0.01, 0.1, 0.2, 0.5],
                   "depth": [3, 6]}

# Stored in `rmse` but not displayed by this cell.
rmse = np.mean(np.sqrt(-cross_val_score(catboost_model,
                                        X, y, cv=5, scoring="neg_mean_squared_error")))

In [None]:
# 3-fold grid search over catboost_params (no `scoring` passed, so the
# estimator's default score is used to rank candidates).
cat_gs_best = GridSearchCV(catboost_model,
catboost_params,cv=3,n_jobs=-1,verbose=False).fit(X, y)

# Refit on all training rows with the best parameters found.
final_model_cat = catboost_model.set_params(**cat_gs_best.best_params_).fit(X, y)

# 10-fold CV RMSE of the tuned model.
rmse = np.mean(np.sqrt(-cross_val_score(final_model_cat, X, y, cv=10, scoring="neg_mean_squared_error")))

In [None]:
rmse

In [None]:
# GBM: baseline model, untuned 5-fold CV RMSE and search grid.

gbm_model = GradientBoostingRegressor(random_state=46)

# Stored in `rmse` but not displayed by this cell.
rmse = np.mean(np.sqrt(-cross_val_score(gbm_model,
                                        X, y, cv=5, scoring="neg_mean_squared_error")))

# 3 x 3 x 3 x 3 = 81 parameter combinations.
gbm_params = {"learning_rate": [0.01,0.05,0.1],"max_depth": [3,5,8],"n_estimators": [500,1000,1500],"subsample": [1, 0.5, 0.7]}

In [None]:
# 5-fold grid search over gbm_params (no `scoring` passed, so the estimator's
# default score is used to rank candidates).
gbm_gs_best = GridSearchCV(gbm_model,gbm_params,cv=5,n_jobs=-1,verbose=True).fit(X, y)

# Refit on all training rows with the best parameters found.
final_model_gbm = gbm_model.set_params(**gbm_gs_best.best_params_).fit(X, y)

# 10-fold CV RMSE of the tuned model.
rmse = np.mean(np.sqrt(-cross_val_score(final_model_gbm, X, y, cv=10, scoring="neg_mean_squared_error")))

In [None]:
rmse