In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans

In [None]:
def load_data(data_dir='../input/house-prices-dataset'):
    """Read the train/test CSV files and return them fully preprocessed.

    Both splits are concatenated so that every preprocessing step sees the
    same columns and category levels, then split apart again by their
    original ``Id`` index.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding ``train.csv`` and ``test.csv``. Defaults to the
        path this notebook was written against.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        The preprocessed splits, each indexed by ``Id``.
    """
    df_train = pd.read_csv(f'{data_dir}/train.csv', index_col='Id')
    df_test = pd.read_csv(f'{data_dir}/test.csv', index_col='Id')
    # Merge the splits so we can process them together
    df = pd.concat([df_train, df_test])
    # Preprocessing: the order matters -- columns are renamed first, typed as
    # categoricals, missing values are filled, and only then are the
    # categoricals replaced by their integer codes.
    df = clean(df)
    df = encode(df)
    df = impute(df)
    df = label_encode(df)
    # Reform splits using the Id labels of the frames read above
    df_train = df.loc[df_train.index, :]
    df_test = df.loc[df_test.index, :]
    return df_train, df_test

In [None]:
def score_dataset(X,y,model=XGBRegressor()):
    """Cross-validated RMSE of ``model`` on the natural log of ``y``.

    Runs 5-fold ``cross_val_score`` with negative mean squared error as the
    scoring rule, averages the fold scores, flips the sign and takes the
    square root.
    """
    fold_neg_mse = cross_val_score(
        model,
        X,
        np.log(y),
        cv=5,
        scoring='neg_mean_squared_error',
    )
    mean_mse = -1 * fold_neg_mse.mean()
    return np.sqrt(mean_mse)

In [None]:
def clean(df):
    """Return a copy of ``df`` with digit-leading column names renamed.

    Names beginning with numbers are awkward to work with (they cannot be
    used with attribute access such as ``df.FirstFlrSF``), so they are
    replaced by spelled-out equivalents. The input frame is left untouched;
    callers must use the returned frame.
    """
    renamed_columns = {
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
    }
    # rename() without inplace returns a new frame instead of mutating the argument
    return df.rename(columns=renamed_columns)

Сделаем encoding всех categorical features, то есть явно зададим тип каждой feature,
поскольку, например, "MSSubClass" Pandas читает как int, хотя на самом деле это категории.

In [None]:
# The nominative (unordered) categorical features.
# "CentralAir" is deliberately not listed here: it is given an explicit
# ordering in `ordered_levels` below, and a column should be typed only once.
features_nom = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig",
                "Condition1", "Condition2", "BldgType", "HouseStyle",
                "RoofStyle", "RoofMatl", "MasVnrType",
                "Foundation", "Heating", "GarageType", "MiscFeature",
                "SaleType", "SaleCondition", 'Neighborhood', 'Exterior1st', 'Exterior2nd']

# The ordinal (ordered) categorical features

# Pandas calls the categories "levels"
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
# Overall quality/condition are rated on a 1..10 scale. Using range(10)
# (0..9) would leave the top rating 10 outside the declared levels, so it
# would turn into a missing value when the column is cast to this dtype.
ten_levels = list(range(1, 11))

# Levels are listed from worst to best so the category codes grow with quality
ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# Add a None level for missing values; it is placed first, i.e. below every real level
ordered_levels = {key: ["None"] + value for key, value in
                  ordered_levels.items()}


def encode(df):
    """Cast the categorical columns of ``df`` to pandas categorical dtypes.

    Columns named in ``features_nom`` become unordered categoricals with an
    extra "None" category available for missing values; columns named in
    ``ordered_levels`` become ordered categoricals using the listed levels.
    The frame is modified column by column and also returned.
    """
    # Nominal categories
    for name in features_nom:
        column = df[name].astype("category")
        # Add a None category for missing values. The result must be assigned
        # back: add_categories returns a new Series, and its old `inplace`
        # argument is not available in current pandas.
        if "None" not in column.cat.categories:
            column = column.cat.add_categories("None")
        df[name] = column
    # Ordinal categories
    for name, levels in ordered_levels.items():
        # Values not listed in `levels` become missing under this cast
        df[name] = df[name].astype(CategoricalDtype(levels,
                                                    ordered=True))
    return df

In [None]:
def impute(df):
    """Fill missing values: 0 for numeric columns, "None" for categoricals.

    Columns of any other dtype are left as they are. The frame is updated
    column by column and returned.
    """
    numeric_columns = list(df.select_dtypes("number").columns)
    categorical_columns = list(df.select_dtypes("category").columns)
    for column in numeric_columns:
        df[column] = df[column].fillna(0)
    for column in categorical_columns:
        # "None" has to be one of the column's categories already
        df[column] = df[column].fillna("None")
    return df

Заполним значения NaN с помощью SimpleImputer().

In [None]:
def simple_imputer(df):
    """Impute missing values with a default ``SimpleImputer``.

    Returns a new DataFrame carrying the same column names and the same
    index as ``df``. Keeping the index matters in this notebook because the
    frames are indexed by ``Id`` and later combined by index.
    """
    my_imputer = SimpleImputer()
    imputed_values = my_imputer.fit_transform(df)
    # NOTE(review): this assumes the imputer returns one output column per
    # input column -- confirm for inputs that contain entirely-missing columns.
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

A label encoding is okay for any kind of categorical feature when you're using a tree-ensemble like XGBoost, even for unordered categories.

In [None]:
def label_encode(df):
    """Return a copy of ``df`` with categorical columns replaced by their codes.

    Only columns of ``category`` dtype are touched; the input frame itself is
    not modified.
    """
    encoded = df.copy()
    categorical_columns = list(encoded.select_dtypes(["category"]).columns)
    for column in categorical_columns:
        encoded[column] = encoded[column].cat.codes
    return encoded

In [None]:
# Load the preprocessed splits, then work on a copy of the training frame so
# that removing the target column does not alter df_train itself.
df_train, df_test = load_data()
X_train = df_train.copy()
# pop() removes 'SalePrice' from X_train and returns it as the target
y_train = X_train.pop('SalePrice')

In [None]:
# Baseline: score the preprocessed features before any feature engineering
base_score = score_dataset(X_train, y_train)
print(f'Score: {base_score: .5f} RMSLE')

Нужно сделать feature engineering.
- Удалить features, которые не несут важной информации (mutual information)
- Добавить нужные features
- K-means, PCA

Посчитаем mutual information 

In [None]:
from sklearn.feature_selection import mutual_info_regression


def make_mi_scores(X,y):
    """Mutual information between each column of ``X`` and the target ``y``.

    Every integer-typed column is flagged as discrete. The check uses
    ``is_integer_dtype`` rather than ``dtype == int`` so that integer widths
    other than the platform default (for example the small integer codes
    that ``.cat.codes`` yields) are also recognised as discrete.

    Returns
    -------
    pandas.Series
        Named 'MI Scores', indexed by column name, sorted in ascending order.
    """
    discrete_features = np.array(
        [pd.api.types.is_integer_dtype(dtype) for dtype in X.dtypes],
        dtype=bool,
    )
    # A fixed random_state keeps the scores -- and anything selected from
    # them -- identical from one run to the next.
    mi_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features, random_state=0
    )
    mi_scores = pd.Series(mi_scores, name='MI Scores', index=X.columns)
    return mi_scores.sort_values(ascending=True)

def plot_mi_scores(scores):
    """Draw ``scores`` as a horizontal bar chart, smallest value at the bottom.

    Bars are labelled with the index entries of ``scores``. Drawing goes
    through the pyplot state machine, i.e. onto the current figure.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

# Score every training feature against the target. make_mi_scores returns the
# scores in ascending order, so the last ten entries are the ten highest.
mi_scores = make_mi_scores(X_train, y_train)
plot_mi_scores(mi_scores[-10:])
# Bare last expression: shows the full score table as the cell output
mi_scores

# Drop uninformative features

In [None]:
def drop_uninformative(X, mi_scores):
    """Keep only the columns of ``X`` whose mutual-information score is > 0.

    ``mi_scores`` is a Series indexed by the column names of ``X``.
    "Threeseasonporch" is always kept, whatever its score, because
    ``counts`` in this notebook reads that column.

    Returns a new DataFrame; ``X`` is not modified.
    """
    # Take an explicit copy so the column assignment below writes to an
    # independent frame rather than to a slice of X.
    X_drop = X.loc[:, mi_scores > 0.0].copy()
    X_drop["Threeseasonporch"] = X["Threeseasonporch"]
    return X_drop

# Re-score after removing the features whose mutual information with the target is zero
drop_X_train = drop_uninformative(X_train, mi_scores)
score = score_dataset(drop_X_train,y_train)
print(f'Score: {score: .5f} RMSLE')

# Create new features

In [None]:
def mathematical_transform(df):
    """Build ratio/product features from existing columns of ``df``.

    Returns a new DataFrame, aligned with ``df``, holding:

    * ``LivLotRatio``  -- above-ground living area divided by lot area
    * ``Spaciousness`` -- first plus second floor area divided by the number
      of above-ground rooms
    * ``QualCond``     -- overall quality multiplied by overall condition

    A "TotalOutsideSF" feature (sum of the deck and porch areas) was tried
    in an earlier revision and is intentionally not produced.
    """
    living_to_lot = df["GrLivArea"] / df["LotArea"]
    floor_area = df["FirstFlrSF"] + df["SecondFlrSF"]
    area_per_room = floor_area / df["TotRmsAbvGrd"]
    quality_times_condition = df["OverallQual"] * df["OverallCond"]
    return pd.DataFrame({
        'LivLotRatio': living_to_lot,
        'Spaciousness': area_per_room,
        'QualCond': quality_times_condition,
    })

def interactions(df):
    """Interaction between building type and above-ground living area.

    One column per distinct ``BldgType`` value (prefixed ``Bldg``); each row
    holds ``GrLivArea`` in the column of its own building type and zero in
    the others.
    """
    building_dummies = pd.get_dummies(df["BldgType"], prefix='Bldg')
    return building_dummies.multiply(df["GrLivArea"], axis=0)

def interactions_1(df):
    """Interaction between basement quality and total basement area.

    One column per distinct ``BsmtQual`` value (prefixed ``Bsmt``); each row
    holds ``TotalBsmtSF`` in the column of its own quality level and zero in
    the others.
    """
    basement_dummies = pd.get_dummies(df["BsmtQual"], prefix='Bsmt')
    return basement_dummies.multiply(df["TotalBsmtSF"], axis=0)

def counts(df):
    """Count, per row, how many of the deck/porch area columns are positive.

    Returns a one-column DataFrame named ``PorchTypes`` aligned with ``df``.
    """
    outdoor_area_columns = [
        "WoodDeckSF",
        "OpenPorchSF",
        "EnclosedPorch",
        "Threeseasonporch",
        "ScreenPorch",
    ]
    is_present = df[outdoor_area_columns] > 0.0
    return pd.DataFrame({'PorchTypes': is_present.sum(axis=1)})

def group_transforms(df):
    """Median above-ground living area of each row's neighborhood.

    Returns a one-column DataFrame named ``MedNhbdArea`` aligned with ``df``.
    """
    neighborhood_median = (
        df.groupby('Neighborhood')['GrLivArea'].transform('median')
    )
    return neighborhood_median.to_frame(name='MedNhbdArea')

def sqrt_area(X):
    """Return a copy of ``X`` with the area columns replaced by their square roots.

    All other columns are carried over unchanged and ``X`` itself is not
    modified.
    """
    area_columns = (
        'LotArea', 'BsmtUnfSF', 'TotalBsmtSF', 'FirstFlrSF', 'SecondFlrSF',
        'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', 'Threeseasonporch', 'ScreenPorch', 'PoolArea',
    )
    rooted = X.copy()
    for column in area_columns:
        rooted[column] = rooted[column].apply(np.sqrt)
    return rooted

def log(X):
    """Return a copy of ``X`` with the natural log applied to numeric columns.

    Only columns whose dtype is exactly ``float64`` or ``int64`` are
    transformed; every other column is carried over unchanged.
    """
    logged = X.copy()
    target_dtypes = ('float64', 'int64')
    loggable_columns = [
        column for column in X.columns
        if any(X[column].dtype == name for name in target_dtypes)
    ]
    for column in loggable_columns:
        logged[column] = logged[column].apply(np.log)
    return logged

In [None]:
def create_math_transform(X):
    """Append the hand-crafted feature groups to ``X`` and return the result.

    The builders run in a fixed order and each one sees the columns added by
    the builders before it. ``sqrt_area``, ``log`` and ``interactions_1``
    are defined in this notebook but are deliberately not applied here.
    """
    feature_builders = (
        mathematical_transform,
        interactions,
        counts,
        group_transforms,
    )
    for build in feature_builders:
        X = X.join(build(X))
    return X

In [None]:
# Re-score after adding the hand-crafted features on top of the reduced feature set
X_train_CF = create_math_transform(drop_X_train)
score = score_dataset(X_train_CF,y_train)
print(f'Score: {score: .5f} RMSLE')

# K-means Clustering

In [None]:
# Columns passed to the k-means helpers as the clustering inputs
cluster_features = [
    "LotArea",
    "TotalBsmtSF",
    "FirstFlrSF",
    "SecondFlrSF",
    "GrLivArea",
]

def cluster_labels(df, features, n_clusters=20):
    """Assign each row of ``df`` to a k-means cluster.

    The ``features`` columns are standardised (mean subtracted, divided by
    the standard deviation) before fitting.

    Returns a one-column DataFrame named ``Cluster`` that shares the index of
    ``df``. Sharing the index is what makes ``df.join(...)`` attach each
    label to the row it was computed from; a fresh 0..n-1 index would be
    matched against the ``Id`` labels instead.
    """
    X_scaled = df.loc[:, features]
    X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0)
    labels = kmeans.fit_predict(X_scaled)
    return pd.DataFrame({'Cluster': labels}, index=df.index)

def cluster_distance(df, features, n_clusters=20):
    """Transform each row of ``df`` into k-means cluster-distance space.

    The ``features`` columns are standardised (mean subtracted, divided by
    the standard deviation) before fitting.

    Returns a DataFrame with one ``Centroid_<i>`` column per cluster, built
    from ``KMeans.fit_transform`` and sharing the index of ``df`` so that it
    can be joined back onto ``df`` row for row.
    """
    X_scaled = df.loc[:, features]
    X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0)
    distances = kmeans.fit_transform(X_scaled)
    centroid_names = [f"Centroid_{i}" for i in range(distances.shape[1])]
    return pd.DataFrame(distances, columns=centroid_names, index=df.index)

# PCA (Principal Component analysis)

In [None]:
def apply_pca(X, standardize=True):
    """Fit a PCA on ``X`` and return the model, the components and the loadings.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric input columns.
    standardize : bool, optional
        When true (the default), each column has its mean subtracted and is
        divided by its standard deviation before fitting. When false, ``X``
        is fitted as given.

    Returns
    -------
    (pca, X_pca, loadings)
        The fitted ``PCA`` object; a DataFrame of component values with
        columns ``PC1``, ``PC2``, ... that shares the index of ``X``; and a
        DataFrame of loadings indexed by the column names of ``X``.
    """
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)
    # Everything below runs for both settings of `standardize`; only the
    # scaling above is optional.
    pca = PCA()
    X_pca = pca.fit_transform(X)
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    # Keep the row labels so the components can be joined back by index
    X_pca = pd.DataFrame(X_pca, columns=component_names, index=X.index)
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=component_names,
        index=X.columns,
    )
    return pca, X_pca, loadings

In [None]:
def pca_inspired(df):
    """Two combined features built from living area, basement area and remodel year.

    * ``Feature1`` -- ``GrLivArea`` plus ``TotalBsmtSF``
    * ``Feature2`` -- ``YearRemodAdd`` times ``TotalBsmtSF``

    Returns a new DataFrame aligned with ``df``.
    """
    basement_area = df["TotalBsmtSF"]
    return pd.DataFrame({
        'Feature1': df["GrLivArea"] + basement_area,
        'Feature2': df["YearRemodAdd"] * basement_area,
    })

def pca_components(df, features):
    """Return only the principal-component frame for the ``features`` columns.

    Delegates to ``apply_pca`` with its default settings and discards the
    fitted model and the loadings.
    """
    selected = df.loc[:, features]
    pca_outputs = apply_pca(selected)
    component_frame = pca_outputs[1]
    return component_frame

# Columns intended as input for pca_components. The only call that uses this
# list (inside create_features) is currently commented out.
pca_features = [
    "GarageArea",
    "YearRemodAdd",
    "TotalBsmtSF",
    "GrLivArea",
]

In [None]:
def corrplot(df, method="pearson", annot=True, **kwargs):
    """Draw a clustered heat map of the pairwise column correlations of ``df``.

    ``method`` selects the correlation method passed to ``DataFrame.corr``.
    The clustering linkage is fixed to "complete" and the colour scale spans
    -1 to 1. Any extra keyword arguments are forwarded to
    ``seaborn.clustermap``.
    """
    correlations = df.corr(method)
    sns.clustermap(
        correlations,
        vmin=-1.0, vmax=1.0,
        cmap="icefire",
        method="complete",
        annot=annot,
        **kwargs,
    )


# Correlation map of the training features; annot=None overrides the function's default of True
corrplot(X_train, annot=None)

In [None]:
def indicate_outliers(df):
    """Flag rows whose neighborhood is 'Edwards' and sale condition is 'Partial'.

    Returns a one-column boolean DataFrame named ``Outlier`` aligned with
    ``df``.

    NOTE(review): both comparisons are against string labels. ``load_data``
    in this notebook runs ``label_encode`` before this function is reached,
    which would leave integer codes in these columns and make the flag False
    for every row -- confirm what the columns hold at the call site.
    """
    in_edwards = df["Neighborhood"] == 'Edwards'
    is_partial_sale = df["SaleCondition"] == 'Partial'
    return pd.DataFrame({'Outlier': in_edwards & is_partial_sale})

# Create final features

In [None]:
def create_features(df, df_test=None):
    """Run the full feature-engineering pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame including the ``SalePrice`` column.
    df_test : pandas.DataFrame, optional
        Test frame (also carrying a ``SalePrice`` column, which is
        discarded). When given, train and test rows are stacked so that all
        features are built over both, and are separated again at the end.

    Returns
    -------
    pandas.DataFrame, or a tuple of two
        The training features alone when ``df_test`` is None, otherwise
        ``(X, X_test)``.

    Mutual-information scores are computed from the training rows only,
    before any test rows are appended. The ``cluster_distance`` and
    ``pca_components`` feature groups are defined in this notebook but are
    deliberately not applied here.
    """
    X = df.copy()
    y = X.pop("SalePrice")
    mi_scores = make_mi_scores(X, y)

    with_test = df_test is not None
    if with_test:
        test_features = df_test.drop(columns="SalePrice")
        X = pd.concat([X, test_features])

    X = drop_uninformative(X, mi_scores)
    X = create_math_transform(X)
    X = X.join(cluster_labels(X, cluster_features, n_clusters=20))
    X = X.join(pca_inspired(X))
    X = X.join(indicate_outliers(X))

    if not with_test:
        return X

    # Reform splits using the test frame's index labels
    X_test = X.loc[df_test.index, :]
    X = X.drop(df_test.index)
    return X, X_test


# Reload the preprocessed data and build the final training feature set
df_train, df_test = load_data()
X_train = create_features(df_train)
y_train = df_train.loc[:, "SalePrice"]

# Bare last expression: the cross-validated score is shown as the cell output
score_dataset(X_train, y_train)

# Tune hyperparameters

In [None]:
# Rebuild the training features so this cell does not depend on which
# earlier cell last assigned X_train / y_train.
X_train = create_features(df_train)
y_train = df_train.loc[:, "SalePrice"]

# Hyperparameters for the tuned model; the trailing comments give the
# suggested search range for each one.
xgb_params = dict(
    max_depth=6,           # maximum depth of each tree - try 2 to 10
    learning_rate=0.01,    # effect of each tree - try 0.0001 to 0.1
    n_estimators=1000,     # number of trees (that is, boosting rounds) - try 1000 to 8000
    min_child_weight=1,    # minimum number of houses in a leaf - try 1 to 10
    colsample_bytree=0.7,  # fraction of features (columns) per tree - try 0.2 to 1.0
    subsample=0.7,         # fraction of instances (rows) per tree - try 0.2 to 1.0
    reg_alpha=0.5,         # L1 regularization (like LASSO) - try 0.0 to 10.0
    reg_lambda=1.0,        # L2 regularization (like Ridge) - try 0.0 to 10.0
    num_parallel_tree=1,   # set > 1 for boosted random forests
)

xgb = XGBRegressor(**xgb_params)
# Bare last expression: the cross-validated score of the tuned model is the cell output
score_dataset(X_train, y_train, xgb)

# Create submissions

In [None]:
# Build train and test features together so both splits get the same columns
X_train, X_test = create_features(df_train, df_test)
y_train = df_train.loc[:, "SalePrice"]

# The model is fitted on the log of the price, so its predictions are
# exponentiated to get back to the price scale.
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, np.log(y_train))
predictions = np.exp(xgb.predict(X_test))

# Write the submission file: one row per test Id with its predicted price
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)