# [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course)

## Import necessary packages

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

## Read in data

In [1]:
# Load the competition's training split from the Kaggle input directory.
train = pd.read_csv("../input/home-data-for-ml-course/train.csv")
# Bare expression: the notebook renders the frame as the cell's output.
train

In [1]:
# Load the competition's test split from the Kaggle input directory.
test = pd.read_csv("../input/home-data-for-ml-course/test.csv")
# Bare expression: the notebook renders the frame as the cell's output.
test

## Exploratory Data Analysis

In [1]:
# Histogram of the raw target: the SalePrice is right skewed.
plt.hist(train["SalePrice"])
plt.title("Sale Price")
plt.ylabel("Count")
plt.xlabel("$")
plt.show()

In [1]:
# Replace the target with log(1 + SalePrice) to reduce its right skew.
# NOTE: running this cell twice applies the transform twice.
train["SalePrice"] = np.log1p(train["SalePrice"]) # Normalising target variable

In [1]:
# Histogram of the target after the log1p transform; the axis label names the
# transform that was actually applied (log1p, i.e. log(1 + x)).
plt.hist(train["SalePrice"])
plt.xlabel("log1p($)")
plt.ylabel("Count")
plt.title("Sale Price Post-Transform")
plt.show()

In [1]:
# Show every column name as a plain Python list.
train.columns.tolist()

In [1]:
# Summary statistics of the training frame, rendered as the cell's output.
train.describe()

In [1]:
# Print dtype and non-null count for every column of the training frame.
train.info()

In [1]:
# Print the frequency table of every column to eyeball its distinct values.
for column in train.columns:
    print(f"\n---- {column} ---")
    print(train[column].value_counts())

### Categorical & Numerical features

#### Categorical

In [1]:
# Select the object-dtype (text) columns as the categorical features.
# The dtype is named by the string "object": the `np.object` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
cat_feat = train.select_dtypes(include=["object"])
cat_feat.info()

In [1]:
# Number of distinct values in each categorical column.
cat_feat.nunique()

#### Numerical

In [1]:
# Select every numeric column ("number" is the string spelling of np.number).
num_feat = train.select_dtypes(include="number")
num_feat.info()

In [1]:
# One histogram per numeric column, drawn on a single large figure.
num_feat.hist(figsize = (30, 30))

In [1]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap on a large square canvas.
correlation_matrix = num_feat.corr()

heatmap_options = dict(
    vmax=1,
    square=True,
    annot=True,
    fmt='.2f',
    cmap='BuPu',
    cbar_kws={"shrink": .5},
    annot_kws={"size": 15},
    robust=True,
)

plt.figure(figsize=(40, 40))
ax = sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('Correlation Matrix of features', fontsize=20)

In [1]:
# Correlation of every numeric column with the target, strongest first.
correlation_matrix[['SalePrice']].sort_values(by='SalePrice', ascending=False)

### Outliers

In [1]:
# Scatter every numeric column against SalePrice on a 13 x 3 grid of subplots
# so that unusual points stand out.
temp_df_num = train.select_dtypes(include=[np.number])
fig = plt.figure(figsize=(20, 25))
for position, column in enumerate(temp_df_num.columns, start=1):
    plt.subplot(13, 3, position)
    sns.scatterplot(x=column,
                    y=temp_df_num['SalePrice'],
                    data=temp_df_num)
    plt.title(column)
fig.tight_layout(pad=1.0)

`GrLivArea` has outliers.

### Missing values

In [1]:
# Show the per-column missing-value counts without row truncation.
# option_context raises the row limit only for this display and then restores
# whatever value was in effect before, even if display() raises. The previous
# set_option/reset_option pair always fell back to the pandas default and
# skipped the reset on error.
with pd.option_context('display.max_rows', 500):
    display(train.isnull().sum())

In [1]:
# Names of the columns that contain at least one missing value, in frame order.
cols_with_missing = train.columns[train.isnull().any()].tolist()
cols_with_missing

In [1]:
# missingno matrix plot restricted to the columns that have missing values.
msno.matrix(train[cols_with_missing])

In [1]:
# missingno bar chart restricted to the columns that have missing values.
msno.bar(train[cols_with_missing])

In [1]:
# For each column with gaps, print its distinct values, the number of missing
# and present entries, and the missing share as a percentage.
for col_name in cols_with_missing:
    column = train[col_name]
    col_isna = column.isna().sum()
    col_count = column.count()
    print(f'{col_name}:\n{column.unique()}\n')
    print(f'isna: {col_isna}\n')
    print(f'count: {col_count}\n')
    print(f'%: {col_isna/(col_isna + col_count)*100}\n\n----------------------------------------\n')

## Cleaning the data

In [1]:
# Keep the test 'Id' column aside; it is needed for the submission file
test_id = test["Id"]

In [1]:
# features to remove due to collinearity
redun = ['GarageYrBlt','TotRmsAbvGrd','1stFlrSF','GarageCars']

train.drop(redun, axis = 1, inplace = True)
test.drop(redun, axis = 1, inplace = True)

# Filter in order instead of using a set difference: set iteration order for
# strings changes between interpreter sessions, which would shuffle the feature
# columns fed to the model and make runs non-reproducible despite a fixed seed.
# Iterating a DataFrame yields its column labels, so this works whether the
# variable still holds the frame from the EDA cells or already holds a list.
num_feat = [col for col in num_feat if col not in redun]
cat_feat = [col for col in cat_feat if col not in redun]

In [1]:
# Columns dropped as uninformative for the model (sale date parts and row Id).
useless = ['YrSold','MoSold', 'Id']

train.drop(useless, axis = 1, inplace = True)
test.drop(useless, axis = 1, inplace = True)

# Filter in order instead of using a set difference: set iteration order for
# strings changes between interpreter sessions, which would shuffle the feature
# columns fed to the model and make runs non-reproducible despite a fixed seed.
num_feat = [col for col in num_feat if col not in useless]
cat_feat = [col for col in cat_feat if col not in useless]

In [1]:
# too sparse
sparse = ['PoolQC', 'MiscFeature', 'Alley']

train.drop(sparse, axis = 1, inplace = True)
test.drop(sparse, axis = 1, inplace = True)

# Filter in order instead of using a set difference: set iteration order for
# strings changes between interpreter sessions, which would shuffle the feature
# columns fed to the model and make runs non-reproducible despite a fixed seed.
num_feat = [col for col in num_feat if col not in sparse]
cat_feat = [col for col in cat_feat if col not in sparse]

In [1]:
# Removing outliers: keep only the rows whose GrLivArea is below 4500.
# NOTE: a second filter (OverallQual > 9 with SalePrice < 220000) was tried and
# left disabled; its threshold is in raw dollars, whereas SalePrice has been
# log1p-transformed by this point.
train = train.loc[train["GrLivArea"] < 4500]

In [1]:
# Removing SalePrice from features
# SalePrice is the target, so it must not be among the model inputs.
# list.remove mutates num_feat in place and raises ValueError if the name is
# absent (for example when this cell is run a second time).
num_feat.remove("SalePrice")

In [1]:
# Input columns for the model: numerical features first, then categorical ones
X_cols = [*num_feat, *cat_feat]

In [1]:
# Numerical preprocessing: every missing value is filled with the constant 0
numerical_transformer = SimpleImputer(fill_value=0, strategy='constant')

In [1]:
# Categorical preprocessing: fill gaps with the most frequent value of the
# column, then one-hot encode (categories unseen at fit time are ignored).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [1]:
# Route each group of columns through its own transformer: numerical columns
# to the imputer, categorical columns to the impute + one-hot pipeline.
column_routes = [
    ('num', numerical_transformer, num_feat),
    ('cat', categorical_transformer, cat_feat),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [1]:
# Target variable (SalePrice as stored in the training frame)
y_train = train["SalePrice"]
y_train

In [1]:
# Training inputs: the selected feature columns of the training frame.
X_train = train.loc[:, X_cols]
X_train

In [1]:
# Test inputs: the same feature columns taken from the test frame.
X_test = test.loc[:, X_cols]
X_test

## Modelling & Predicting

In [1]:
# Hyper-parameters of the gradient-boosted model:
#
#   n_estimators  - number of boosted trees. More trees give a finer fit but a
#                   greater chance of overfitting.
#   max_depth     - maximum depth of each tree.
#   learning_rate - shrinkage applied to each new tree's contribution. Small
#                   values learn slowly and need many trees; large values fit
#                   faster but overfit more easily.
#   subsample     - fraction of the training rows randomly sampled for each
#                   tree (0.7 = 70%).
#   random_state  - random seed, fixed so that runs can be compared.
#
# Early stopping is deliberately not configured here. `eval_set` and `verbose`
# are arguments of `fit()`, not of the constructor, and early stopping needs a
# held-out validation set that has gone through the same preprocessing as the
# training data. Passing `early_stopping_rounds` without such a set makes
# `fit()` fail on current xgboost releases, and evaluating on the raw training
# frame would not detect overfitting anyway.
#
# A lot of parameter tuning is trial and error; these are the best settings
# found so far. An exhaustive search is possible, but it takes considerable
# time and risks overfitting to the training data.
model = XGBRegressor(n_estimators=3460,
                     max_depth=3,
                     learning_rate=0.01,
                     subsample=0.7,
                     random_state=1)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocess the training data and fit the model
my_pipeline.fit(X_train,
                y_train)

# Preprocess the test data and get predictions (still on the log1p scale)
preds = my_pipeline.predict(X_test)

## Evaluating Results

In [1]:
# Cross-validate the whole pipeline on the training data; the mean score is a
# single number to compare against after each change.
# NOTE(review): no `scoring` is passed, so the estimator's default scorer is
# used - presumably R^2 for a regressor; confirm before comparing to MAE.
scores = cross_val_score(my_pipeline, X_train, y_train)
mean_score = scores.mean()
print("Model Cross-val Score: ", mean_score)

### Most Important Features

In [1]:
n_top = 30

# The model is fitted on the preprocessor's output, so feature_importances_
# has one entry per transformed column: the numerical columns first, then one
# column per category produced by the one-hot encoder. Rebuild those names
# instead of zipping against X_train.columns, which is shorter and silently
# attaches the wrong labels to the one-hot columns.
onehot = my_pipeline['preprocessor'].named_transformers_['cat']['onehot']
feature_names = list(num_feat) + list(onehot.get_feature_names_out(cat_feat))
importances = my_pipeline['model'].feature_importances_

# pandas raises ValueError here if names and importances ever differ in length,
# so a mismatch is loud instead of silently truncated.
df = pd.DataFrame({"feature": feature_names, "value": importances})

# Rank the features by importance; zero-importance features are coloured red.
df["abs_value"] = df["value"].abs()
df["colors"] = np.where(df["value"] > 0, "green", "red")
df = df.sort_values("abs_value", ascending=False)
top = df.head(n_top)

fig, ax = plt.subplots(1, 1, figsize=(12, 7))
sns.barplot(x="feature",
            y="value",
            data=top,
            palette=top["colors"].tolist())
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=20)
ax.set_title("Top features", fontsize=25)
# These are tree-based importances, not linear-model coefficients.
ax.set_ylabel("Importance", fontsize=22)
ax.set_xlabel("Feature Name", fontsize=22)

In [1]:
# Build the submission frame: the saved test ids next to the predictions,
# mapped back from the log1p scale with expm1, then write it without the index.
output = pd.DataFrame({'Id': test_id,
                       'SalePrice': np.expm1(preds)})
output.to_csv('submission.csv', index=False)