# imports

In [None]:
import numpy as np
import pandas as pd 
import os,random
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder

from scipy import stats
from scipy.stats import norm, skew
from subprocess import check_output

from sklearn import datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler,MaxAbsScaler,RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

from sklearn import tree

# variables

In [None]:
# Locations of the input CSV files read by this notebook.
TRAIN_PATH = "../input/house-prices-advanced-regression-techniques/train.csv"
TEST_PATH = "../input/house-prices-advanced-regression-techniques/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/house-prices-advanced-regression-techniques/sample_submission.csv"
# File the final predictions are written to.
SUBMISSION_PATH = "submission.csv"

# Column names: the row identifier (dropped from the features) and the
# regression target.
ID = "Id"
TARGET = "SalePrice"

# Default seed used by seed_everything().
SEED = 2022
def seed_everything(seed=SEED):
    """Seed the random-number sources used by this notebook.

    Stores the seed in the ``PYTHONHASHSEED`` environment variable and seeds
    both Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed; defaults to the module-level ``SEED``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the RNGs once, up front, with the default SEED.
seed_everything()

# Settings forwarded to RandomizedSearchCV in the model-building step.
RS_CV = 3  # cv: number of cross-validation folds
RS_N_ITER = 50  # n_iter: number of parameter settings sampled
RS_N_JOBS = -1  # n_jobs: -1 asks for all available processors
RS_VERBOSE = 1  # verbose: progress-message level

# load & preprocess

In [None]:
# Read the raw training and test tables from their CSV files.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

def checkNull_fillData(df):
    """Fill missing values in ``df`` in place.

    Columns with a numeric dtype (any width, not only ``float64``/``int64``)
    have their missing entries replaced by that column's median. Every other
    column has its missing entries replaced by the string ``"Missing"``.
    Columns without missing values are left untouched.

    Args:
        df: ``pandas.DataFrame`` to modify in place.

    Returns:
        None. The frame passed in is updated.
    """
    for col in df.columns:
        column = df[col]
        if not column.isnull().any():
            continue
        # Decide by dtype family rather than by exact dtype name, so that
        # e.g. float32/int32 columns are not filled with a string.
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.median()
        else:
            fill_value = "Missing"
        df[col] = column.fillna(fill_value)
                
# Impute missing values in both frames; each frame is filled from its own
# column medians.
for frame in (train, test):
    checkNull_fillData(frame)


# Object-dtype (categorical) columns of the training frame, in column order.
col_names = [name for name in train.columns if train[name].dtypes == "object"]
        
from sklearn.preprocessing import LabelEncoder

# Aggregation applied to TARGET within each category when the categorical
# columns are target-encoded.
GROUP_FUNCTION = "mean"
# Suffix used to name per-column encoded helper columns.
ENCODED = "_encoded"

for col in col_names:
    # Integer codes are learned from the training values only.
    encoder = LabelEncoder().fit(train[col])
    train[col] = encoder.transform(train[col])

    # Labels that occur only in test are appended (in sorted order) after the
    # classes learned from train, so transform() can still encode them.
    unseen = np.setdiff1d(np.unique(test[col]), encoder.classes_)
    if unseen.size:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    test[col] = encoder.transform(test[col])
    
# Target-encode every categorical column: each category code is replaced by
# the GROUP_FUNCTION aggregate of TARGET over the training rows in that
# category.
for col in col_names:
    category_stat = train.groupby(col)[TARGET].agg(GROUP_FUNCTION)
    # Categories that never occur in train have no statistic of their own.
    # Fall back to the overall target aggregate instead of leaving the raw
    # label code in a column that otherwise holds target-scale values.
    overall_stat = train[TARGET].agg(GROUP_FUNCTION)

    train[col] = train[col].map(category_stat)
    test[col] = test[col].map(category_stat).fillna(overall_stat)

# Every non-object column except the target is standardised.
num_col = [
    name
    for name in train.columns
    if name != TARGET and train[name].dtypes != "object"
]

# Learn the scaling statistics from train only, then apply the same
# transform to both frames.
scaler = StandardScaler()
scaler.fit(train[num_col])
train[num_col] = scaler.transform(train[num_col])
test[num_col] = scaler.transform(test[num_col])

# Build Model

In [None]:
def rmse(y_pred, y_val):
    """Return the root-mean-squared error between two sets of values.

    The error is computed directly with NumPy so it does not rely on the
    ``squared=False`` keyword of ``mean_squared_error``, which newer
    scikit-learn releases no longer accept. RMSE is symmetric in its two
    arguments, so the argument order does not affect the result.

    Args:
        y_pred: Array-like of predicted values.
        y_val: Array-like of reference values with the same number of rows.

    Returns:
        The RMSE as a ``float``. For 2-D inputs the RMSE is computed per
        output column and the column results are averaged.

    Raises:
        ValueError: If the inputs are empty or differ in number of rows.
    """
    actual = np.atleast_1d(np.asarray(y_val, dtype=float))
    predicted = np.atleast_1d(np.asarray(y_pred, dtype=float))
    n_rows = actual.shape[0]
    if n_rows == 0 or predicted.shape[0] != n_rows:
        raise ValueError(
            "y_pred and y_val must be non-empty and have the same number of rows"
        )
    # View both inputs as (rows, outputs) so 1-D and column-vector inputs
    # are treated alike.
    actual = actual.reshape(n_rows, -1)
    predicted = predicted.reshape(n_rows, -1)
    per_output = np.sqrt(np.mean((actual - predicted) ** 2, axis=0))
    return float(np.mean(per_output))

In [None]:
####################################################################################
#split input data and target data 
X = train.drop([ID,TARGET],axis=1)
y = train[TARGET]
####################################################################################
# search best parameter and model
# random_state is set explicitly on both the tree and the search: with
# n_jobs=-1 the candidates are fitted in worker processes, so seeding the
# global RNGs in this process is not enough to make the run repeatable.
model = tree.DecisionTreeRegressor(random_state=SEED)
parameters = {
    # 'criterion' is deliberately left at the estimator's default.
    'splitter':["best", "random"],
    'max_depth' : sp_randInt(1, 30),
    # uniform on [0, 1): interpreted as a fraction of the training samples
    'min_samples_split' : sp_randFloat(),
    'min_impurity_decrease':sp_randFloat(),
}

# The scorer is negated (greater_is_better=False) so that the search, which
# maximises its score, ends up minimising RMSE.
rs = RandomizedSearchCV(estimator=model, 
                           param_distributions = parameters,
                           scoring=make_scorer(rmse, greater_is_better=False),
                           cv = RS_CV, 
                           n_iter = RS_N_ITER, 
                           verbose = RS_VERBOSE,
                           n_jobs=RS_N_JOBS,
                           random_state=SEED)
rs.fit(X, y)

# best model 
print(rs.best_params_)
bestModel = rs.best_estimator_
####################################################################################

# After Building Model

In [None]:
####################################################################################
#predict
# The model was fitted without the identifier column, so drop it here too.
X_test = test.drop(columns=[ID])
pred_test = bestModel.predict(X_test)
####################################################################################
#submit
# Start from the sample submission and fill in the predictions by row position.
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH).assign(**{TARGET: pred_test})
sub.to_csv(SUBMISSION_PATH, index=False)
sub.head()
####################################################################################