# Load Datasets from Kaggle API

# Import packages and load dataset

In [None]:
import os
import warnings
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
from pandas.api.types import CategoricalDtype

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

import xgboost as xgb
# Set XGBoost's global verbosity level to 0 for the whole session.
xgb.set_config(verbosity=0)

# Mute warnings: install a blanket 'ignore' filter, so no warning of any
# category is shown from here on.
warnings.filterwarnings('ignore')

# Load Data

In [None]:
# Directory that holds the competition CSV files.
data_dir = Path("/kaggle/input/house-prices-advanced-regression-techniques/")
# Both files are read the same way, with the "Id" column as the index.
train_full, test_full = (
    pd.read_csv(data_dir / csv_name, index_col="Id")
    for csv_name in ("train.csv", "test.csv")
)
# Stack the two splits so later steps operate on a single frame.
df = pd.concat([train_full, test_full])

# Work on a copy of the combined frame and split the target off it.
X = df.copy()
y = X.pop('SalePrice')

# Data Preprocessing

1.   Clean Data
2.   Encode the Statistical Data Type
3.   Handle Missing Values
4.   Mutual Information (Feature Utility Scores)
5.   Create Features
6.   OneHot Encoder (Pipeline)
7.   Standard Scaler (Pipeline)

## Clean Data

In [None]:
# Recode the "Brk Cmn" level of Exterior2nd as "BrkComm".
X["Exterior2nd"] = X["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})
# Some values of GarageYrBlt are corrupt: keep only years up to 2010 and
# use the year the house was built wherever the comparison does not hold.
garage_year_ok = X["GarageYrBlt"] <= 2010
X["GarageYrBlt"] = X["GarageYrBlt"].where(garage_year_ok, X["YearBuilt"])
# Names beginning with numbers are awkward to work with
leading_digit_names = {
    "1stFlrSF": "FirstFlrSF",
    "2ndFlrSF": "SecondFlrSF",
    "3SsnPorch": "Threeseasonporch",
}
X.rename(columns=leading_digit_names, inplace=True)

## Encode the Statistical Data Type

### The unordered (nominal) categorical features

These features are read as 'int' type, but are actually nominal categorical features.

In [None]:
# Columns treated as nominal (unordered) categoricals.
features_nom = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour",
                "LotConfig", "Neighborhood", "Condition1", "Condition2",
                "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st",
                "Exterior2nd", "MasVnrType", "Foundation", "Heating",
                "CentralAir", "GarageType", "MiscFeature", "SaleType", "SaleCondition"]

for name in features_nom:
    X[name] = X[name].astype("category")
    # Add a None category for missing values, so that a later
    # fillna("None") is a valid assignment for this categorical column.
    if "None" not in X[name].cat.categories:
        # add_categories returns a new Series; its `inplace` option was
        # deprecated in pandas 1.3 and removed in 2.0, so assign it back.
        X[name] = X[name].cat.add_categories("None")

### The ordered categorical features

In [None]:
# Level lists are written from lowest to highest; the order given here is
# the order the ordered categorical dtype will use.
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
# The overall ratings run from 1 to 10. range(10) would yield 0..9, leaving
# a rating of 10 outside the dtype so that it turns into NaN on the cast.
ten_levels = list(range(1, 11))
features_ordered = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}
# Add a None level for missing values; it is prepended, so it sorts below
# every real level.
features_ordered = {key: ["None"] + value for key, value in features_ordered.items()}

# Cast each column to an ordered categorical with exactly these levels.
# Values that are not among the levels become missing.
for name, levels in features_ordered.items():
    X[name] = X[name].astype(CategoricalDtype(levels, ordered=True))

In [None]:
# Pass every value of the two overall-rating columns through str().
for name in ('OverallQual', 'OverallCond'):
    X[name] = X[name].apply(str)

## Handle Missing Values

In [None]:
# Numeric columns get 0 for missing entries, categorical columns get the
# 'None' level. Numbers are processed first, then categories.
for dtype_kind, filler in (('number', 0), ('category', 'None')):
    for name in X.select_dtypes(dtype_kind):
        X[name] = X[name].fillna(filler)

## Feature Selection
    1. Remove features with low variance

In [None]:
# Training rows only; every object/category column is replaced by the
# integer codes that factorize() assigns to its values.
X_threshold = X.loc[train_full.index]
encode_cols = X_threshold.select_dtypes(["object", "category"]).columns
for colname in encode_cols:
    codes, _ = X_threshold[colname].factorize()
    X_threshold[colname] = codes

In [None]:
# The cut-off is 0.99 * (1 - 0.99). fit() returns the selector itself, so
# construction and fitting can be chained.
varThres = VarianceThreshold(threshold=0.99*(1-0.99)).fit(X_threshold)
# Keep, in X, only the columns the fitted selector marks as supported.
kept_columns = X_threshold.columns[varThres.get_support()]
X = X[kept_columns]

## Feature Utility Scores

### Make Mi Scores

In [None]:
# Score features on the training rows, where the target is available.
train_ids = train_full.index
X_scores, y_scores = X.loc[train_ids, :], y.loc[train_ids]
# Replace object/category columns with their factorize() integer codes.
for colname in X_scores.select_dtypes(["object", "category"]).columns:
    codes, _ = X_scores[colname].factorize()
    X_scores[colname] = codes
# All discrete features should now have integer dtypes
discrete_features = [pd.api.types.is_integer_dtype(dtype) for dtype in X_scores.dtypes]
# One MI score per column, labelled with the column name.
mi_scores = pd.Series(
    mutual_info_regression(
        X_scores,
        y_scores,
        discrete_features=discrete_features,
        random_state=42,
    ),
    name="MI Scores",
    index=X_scores.columns,
)

# Plot MI Scores as a horizontal bar chart, sorted in ascending order.
mi_scores = mi_scores.sort_values(ascending=True)
ticks = list(mi_scores.index)
width = np.arange(len(ticks))  # one bar position per feature

plt.barh(width, mi_scores)
plt.yticks(width, ticks)
plt.title("Mutual Information Scores")

## Apply MI Scores to Remove Uninformative Columns (mi_scores = 0.0)

In [None]:
# Keep only the columns whose MI score is strictly positive.
informative = mi_scores > 0.0
X = X.loc[:, informative]

## Create Features

In [None]:
# Ratios that relate pairs of numerical features to each other.
def mathematical_transforms(df):
    """Return a frame holding two ratio features computed from *df*.

    LivLotRatio is GrLivArea / LotArea; Spaciousness is
    (FirstFlrSF + SecondFlrSF) / TotRmsAbvGrd.
    """
    living_to_lot = df["GrLivArea"] / df["LotArea"]
    area_per_room = (df["FirstFlrSF"] + df["SecondFlrSF"]) / df["TotRmsAbvGrd"]
    return pd.DataFrame({
        "LivLotRatio": living_to_lot,
        "Spaciousness": area_per_room,
    })


# Create one living-area column per building type.
def interactions(df):
    """Return the BldgType dummy columns, each multiplied row-wise by GrLivArea.

    The columns carry the prefix "Bldg".
    """
    bldg_dummies = pd.get_dummies(df["BldgType"], prefix="Bldg")
    return bldg_dummies.mul(df["GrLivArea"], axis=0)


# Presence/absence features often come in sets; count how many are present.
def counts(df):
    """Return a one-column frame, PorchTypes, counting per row how many of
    the five porch/deck area columns are greater than zero.
    """
    porch_columns = [
        "WoodDeckSF",
        "OpenPorchSF",
        "EnclosedPorch",
        "Threeseasonporch",
        "ScreenPorch",
    ]
    has_porch = df[porch_columns] > 0.0
    return has_porch.sum(axis=1).to_frame("PorchTypes")


# Break a compound label into simpler pieces.
def break_down(df):
    """Return a one-column frame, MSClass, holding the part of MSSubClass
    that precedes the first underscore.

    NOTE(review): the .str accessor needs string values in MSSubClass;
    confirm the column's dtype before calling this.
    """
    pieces = df["MSSubClass"].str.split("_", n=1, expand=True)
    first_piece = pieces[0]
    return pd.DataFrame({"MSClass": first_piece})


# Aggregate information across the rows that share a category.
def group_transforms(df):
    """Return a one-column frame, MedNhbdArea, giving each row the median
    GrLivArea of its Neighborhood group.
    """
    area_by_neighborhood = df.groupby("Neighborhood")["GrLivArea"]
    medians = area_by_neighborhood.transform("median")
    return medians.rename("MedNhbdArea").to_frame()

# Apply the feature builders one after another. Each builder receives the
# frame as extended by the previous ones, and its output is joined on.
for make_features in (mathematical_transforms, interactions, counts, group_transforms):
    X = X.join(make_features(X))

In [None]:
# Column labels split by dtype: int64/float64 columns on one side,
# category columns on the other.
numeric_frame = X.select_dtypes(include=['int64', 'float64'])
categorical_frame = X.select_dtypes(include=['category'])
numerical_cols, categorical_cols = numeric_frame.columns, categorical_frame.columns

In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.compose import make_column_selector as selector
# Build Transformer pipeline
# Numeric branch: fill gaps with the column mean.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode, ignoring levels that were not seen during fitting.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# Route columns by dtype: everything that is not 'category' goes to the
# numeric branch, 'category' columns go to the categorical branch.
non_category_columns = selector(dtype_exclude='category')
category_columns = selector(dtype_include='category')
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, non_category_columns),
    ('cat', categorical_transformer, category_columns),
])

### Note: 


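1.   We chose OHE (One-Hot Encoder) rather than Label Encoder because label encoding assigns arbitrary integers to categories, which imposes an order and unequal spacing that nominal variables do not have.
2.   A disadvantage of OHE is that high-cardinality features produce many columns; PCA can be used to reduce the dimensionality of the data.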

In [None]:
# Hold out 25% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X.loc[train_full.index, :],
    y.loc[train_full.index],
    test_size=0.25,
    random_state=42,
)
# Fit the preprocessor on the training split only and merely *transform*
# the validation split. Fitting on X_val would learn the one-hot layout
# from the validation rows, which can differ from the layout the model is
# trained on, making X_val_prep inconsistent with the training matrix.
preprocessor.fit(X_train)
X_val_prep = preprocessor.transform(X_val)

# Rows without a training label: the competition's test set.
X_test = X.loc[test_full.index, :]

In [None]:
# y_test = pd.read_csv('/kaggle/input/ames-dataset/result-with-best.csv').SalePrice

In [None]:
# xgb_reg = xgb.XGBRegressor(random_state=42)
# ames_pipe = Pipeline(steps=[('preprocessor_transformers', preprocessor),
#                             ('model', xgb_reg)])

# Fine-Tuning Model : Using RandomizedSearchCV

In [None]:
# xgb_param_grid = {'model__subsample': np.arange(0.1, 1, 0.1),
#                   'model__max_depth': np.arange(2,10,1),
#                   'model__colsample_bytree': np.arange(0.1, 1.05, 0.05),
#                   'model__eta': [0.001,0.01,0.1],
#                   'model__min_child_weight' : np.arange(1,10,1),
#                   'model__n_estimators' : [2000,3000],
#                   'model__reg_alpha' : np.arange(0.1, 1, 0.1),
#                  }
# xgb_fit_params = {'model__eval_set':[(X_val, y_val)],
#                   'model__early_stopping_rounds': 10
#                  }
# randomized_neg_mse = RandomizedSearchCV(estimator= ames_pipe,
#                                         param_distributions=xgb_param_grid,
#                                         scoring='neg_mean_squared_error',
#                                         verbose=0,
#                                         cv=10)
# randomized_neg_mse.fit(X_train, y_train, 
#                        **xgb_fit_params
#                       )

In [None]:
# best_param = randomized_neg_mse.best_params_
# best_param

In [None]:
# best_estimator = randomized_neg_mse.best_estimator_

In [None]:
# X_test1 = X.loc[test_full.index,:]
# y_preds = best_estimator.predict(X_test)

In [None]:
# Final model with fixed hyper-parameters.
# eval_metric is configured on the estimator: passing it to fit() is
# deprecated in the XGBoost scikit-learn API and rejected by recent releases.
# NOTE(review): both random_state and seed are supplied with different
# values; confirm which one is meant to control the run.
xgb_reg = xgb.XGBRegressor(subsample=0.7000000000000001,
                           n_estimators=2000,
                           min_child_weight=1,
                           max_depth=6,
                           eta=0.01,
                           colsample_bytree=0.7,
                           reg_alpha=0.5,
                           reg_lambda=1.0,
                           num_parallel_tree=1,
                           eval_metric='rmse',
                           random_state=42, verbosity=0, seed=123)
# Preprocessing and model in one estimator, so the same transforms are
# applied at fit and predict time.
ames_pipe = Pipeline(steps=[('preprocessor_transformers', preprocessor),
                            ('model', xgb_reg)])

# The 'model__' prefix routes the argument to the 'model' step's fit().
# The evaluation set must already be preprocessed, because the pipeline
# only transforms the X passed to fit(), not the eval_set.
xgb_fit_params = {'model__eval_set': [(X_val_prep, y_val)]}
ames_pipe.fit(X_train, y_train,
              **xgb_fit_params)

In [None]:
# Validation score: root mean squared error between the logs of the true
# and the predicted values.
y_val_preds = ames_pipe.predict(X_val)
log_actual, log_predicted = np.log(y_val), np.log(y_val_preds)
score_val = np.sqrt(mean_squared_error(log_actual, log_predicted))
score_val

In [None]:
# Use new data to check the score.
# Only submit when the result improves.
# y_preds = ames_pipe.predict(X_test)
# from sklearn.metrics import mean_squared_error
# solution = pd.read_csv('/kaggle/input/ames-dataset/result-with-best.csv')
# score = np.sqrt(mean_squared_error(np.log(solution.SalePrice), np.log(y_preds)))
# score

# Create Submission File

In [None]:
# The sample submission supplies the Id column for the output file.
sample_sub = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv")
y_preds = ames_pipe.predict(X_test)
final_data = {'Id': sample_sub.Id, 'SalePrice': y_preds}

# DataFrame.to_csv returns None when given a path, so keep the frame itself
# in final_submission and write the file as a separate step.
final_submission = pd.DataFrame(data=final_data)
final_submission.to_csv('submission_file.csv', index=False)