In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this data science project we will predict the sale price of houses. To make the project successful, we will carry out the following steps:
1) Investigate the data types (int, object, etc.).
2) Count the features that contain null values.
3) Fill in the null values in these features.
4) Identify significant features by exploring their correlation with the sale price, as well as trends in the data.

In [None]:
# Both competition CSVs live in the same Kaggle input directory.
INPUT_DIR = "/kaggle/input/house-prices-advanced-regression-techniques"

train_filepath = f"{INPUT_DIR}/train.csv"
test_filepath = f"{INPUT_DIR}/test.csv"

# Read the raw training and test tables into DataFrames.
train_data = pd.read_csv(train_filepath)
test_data = pd.read_csv(test_filepath)

In [None]:
# Preview the first 20 rows of the training data.
train_data.head(20)

In [None]:
# Fill missing values (NaN) - train_data
# Each column is mapped to the placeholder that replaces its NaN entries.
# NOTE(review): the text placeholders assume NaN means the feature is absent
# (no alley, no basement, ...) - confirm against the dataset description.
train_fill_values = {
    'LotFrontage': 0,
    'Alley': "No alley",
    'MasVnrType': "Unknown",
    'MasVnrArea': 0,
    'BsmtQual': "No basement",
    'BsmtCond': "No basement",
    'BsmtExposure': "No basement",
    'BsmtFinType1': "No basement",
    'BsmtFinType2': "No basement",
    'Electrical': "Unknown",
    'FireplaceQu': "No fireplace",
    'GarageType': "No garage",
    'GarageYrBlt': 0,
    'GarageFinish': "No garage",
    'GarageQual': "No garage",
    'GarageCond': "No garage",
    'PoolQC': "No pool",
    'Fence': "No fence",
    'MiscFeature': "None",
}

for column_name, placeholder in train_fill_values.items():
    train_data[column_name] = train_data[column_name].fillna(placeholder)

In [None]:
#New features train dataset
# Binary presence flags: 1 when the corresponding area/count is positive, else 0.
train_data['Pool'] = (train_data['PoolArea'] > 0).astype('int64')
# NOTE: despite the column name, this is measured from the remodel year (YearRemodAdd).
train_data['Years_since_construction'] = train_data['YrSold'] - train_data['YearRemodAdd']
train_data['Fireplace'] = (train_data['Fireplaces'] > 0).astype('int64')
train_data['Garage'] = (train_data['GarageArea'] > 0).astype('int64')
# 'Fence' is overwritten in place with a 0/1 flag. Only convert while it still holds
# the text categories; otherwise re-running this cell would compare the 0/1 values
# with "No fence" and turn every row into 1.
if not pd.api.types.is_numeric_dtype(train_data['Fence']):
    train_data['Fence'] = (train_data['Fence'] != "No fence").astype('int64')

In [None]:
# Fill missing values (NaN) - test_data
# Each column is mapped to the value that replaces its NaN entries.
# The first group mirrors the placeholders used for the training data; the second
# group covers columns filled only in the test set.
# NOTE(review): the constants in the second group (e.g. 444, 1057, "RL") look like
# typical values taken from the data - confirm where they come from.
test_fill_values = {
    'LotFrontage': 0,
    'Alley': "No alley",
    'MasVnrType': "Unknown",
    'MasVnrArea': 0,
    'BsmtQual': "No basement",
    'BsmtCond': "No basement",
    'BsmtExposure': "No basement",
    'BsmtFinType1': "No basement",
    'BsmtFinType2': "No basement",
    'Electrical': "Unknown",
    'FireplaceQu': "No fireplace",
    'GarageType': "No garage",
    'GarageYrBlt': 0,
    'GarageFinish': "No garage",
    'GarageQual': "No garage",
    'GarageCond': "No garage",
    'PoolQC': "No pool",
    'Fence': "No fence",
    'MiscFeature': "None",

    'MSZoning': "RL",
    'Utilities': "AllPub",
    'Exterior1st': "VinylSd",
    'Exterior2nd': "VinylSd",
    'BsmtFinSF1': 444,
    'BsmtFinSF2': 47,
    'BsmtUnfSF': 567,
    'TotalBsmtSF': 1057,
    'BsmtFullBath': 1,
    'BsmtHalfBath': 0,
    'KitchenQual': "TA",
    'Functional': "Typ",
    'GarageCars': 2,
    'GarageArea': 473,
    'SaleType': "WD",
}

for column_name, replacement in test_fill_values.items():
    test_data[column_name] = test_data[column_name].fillna(replacement)

In [None]:
#New features test dataset
# Binary presence flags: 1 when the corresponding area/count is positive, else 0.
test_data['Pool'] = (test_data['PoolArea'] > 0).astype('int64')
# Both operands must come from test_data: subtracting a train_data column aligns on
# the row index and mixes unrelated training rows into the test feature.
# NOTE: despite the column name, this is measured from the remodel year (YearRemodAdd).
test_data['Years_since_construction'] = test_data['YrSold'] - test_data['YearRemodAdd']
test_data['Fireplace'] = (test_data['Fireplaces'] > 0).astype('int64')
test_data['Garage'] = (test_data['GarageArea'] > 0).astype('int64')
# 'Fence' is overwritten in place with a 0/1 flag. Only convert while it still holds
# the text categories; otherwise re-running this cell would compare the 0/1 values
# with "No fence" and turn every row into 1.
if not pd.api.types.is_numeric_dtype(test_data['Fence']):
    test_data['Fence'] = (test_data['Fence'] != "No fence").astype('int64')

In [None]:
# Work on a copy so the label encoding below does not touch train_data.
corr_data = train_data.copy()

# Columns to label-encode before feature selection.
# 'MasVnrArea' and 'GarageYrBlt' are deliberately NOT listed: they are numeric, and
# casting them to str before label-encoding would replace their values with
# lexicographic ranks. The final model is trained on the raw numeric columns, so
# leaving them untouched keeps feature selection consistent with it.
# 'Fence' stays in the list; once it has been turned into a 0/1 flag, encoding the
# strings "0"/"1" yields the same 0/1 values.
object_columns = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
    'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
    'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition',
]

In [None]:
# A single encoder is refit for every listed column; each column is cast to str
# first and then replaced in corr_data by its integer codes.
encoder = LabelEncoder()

for column_name in object_columns:
    column_as_text = corr_data[column_name].astype(str)
    corr_data[column_name] = encoder.fit_transform(column_as_text)

In [None]:
#correlation_overview = corr_data.corr()
#correlation_sales = correlation_overview['SalePrice']
#corr_columns_1 = correlation_sales[correlation_sales > 0.05].index
#corr_columns_1 = corr_columns_1.to_list()
#correlation_sales[correlation_sales > 0.00]

In [None]:
#corr_columns_2 = correlation_sales[correlation_sales < -0.05].index
#corr_columns_2 = corr_columns_2.to_list()
#correlation_sales[correlation_sales < -0.10]

In [None]:
#corr_columns = corr_columns_1 + corr_columns_2
#corr_columns.remove('SalePrice')

In [None]:
# Greedy forward feature selection.
# Candidates are tried one at a time, in column order, on top of the features kept
# so far. A candidate is kept only when it lowers the mean 5-fold cross-validated
# MAE below the score of the last accepted feature set.
x_features = [
    column for column in corr_data.columns
    if column not in ('SalePrice', 'YrSold', 'Id')
]
y_data = corr_data['SalePrice']
feature_selection = []
# 'YrSold' seeds the feature set, and 60000 is the starting MAE the first candidate
# has to beat.
# NOTE(review): 'YrSold' takes part in every evaluation here but is not added to
# feature_selection, so later cells train without it - confirm this is intended.
features = ['YrSold']
scores = [60000]

for feature in x_features:
    # Evaluate the kept features plus this candidate without mutating the lists yet.
    x_data = corr_data[features + [feature]]
    model = XGBRegressor(random_state=0, booster='gbtree', eta=0.1, max_depth=8)
    score = cross_val_score(model, x_data, y_data, cv=5, scoring='neg_mean_absolute_error')
    final_score = (-1 * score).mean()
    # Compare against the last accepted score and append only on improvement.
    # Rejected candidates are never appended, so no remove() in an else branch is
    # needed; list.remove(value) would also drop the first equal value, which may be
    # an earlier accepted score.
    if final_score < scores[-1]:
        features.append(feature)
        scores.append(final_score)
        feature_selection.append(feature)

# One bar per accepted step, labelled with the feature that was added at that step.
df_0 = pd.DataFrame({'Features': features, 'Score': scores})
ax = df_0.plot.bar(x='Features', y='Score', figsize=(30,10), title="Feature selection process")
ax.set_ylabel("Mean CV MAE");

In [None]:
# Show the features kept by the greedy selection loop.
print(feature_selection)

In [None]:
# Preparation for model experimentation
# Start from a copy of the training data, take SalePrice as the target and
# one-hot encode the selected feature columns.
model_experimentation_dataset = train_data.copy()
model_experimentation_dataset_y = model_experimentation_dataset['SalePrice']
model_experimentation_dataset_x = pd.get_dummies(
    model_experimentation_dataset[feature_selection]
)

In [None]:
# Experimentation eta
# For every candidate eta, fit depth-1 trees and record the mean 5-fold
# cross-validated MAE.
eta_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


def _mean_cv_mae_for_eta(eta_value):
    """Return the mean 5-fold CV MAE of a depth-1 XGBRegressor using ``eta_value``."""
    regressor = XGBRegressor(random_state=0, booster='gbtree', eta=eta_value, max_depth=1)
    fold_scores = cross_val_score(
        regressor,
        model_experimentation_dataset_x,
        model_experimentation_dataset_y,
        cv=5,
        scoring='neg_mean_absolute_error',
    )
    # The scorer returns negated MAE, so flip the sign before averaging.
    return (-1 * fold_scores).mean()


value = list(eta_list)
score = [_mean_cv_mae_for_eta(eta_value) for eta_value in value]

df = pd.DataFrame({'Value': value, 'Score': score})

In [None]:
# Show the mean cross-validated MAE recorded for each eta value.
print(df)

In [None]:
# Select the eta from the row with the lowest mean CV MAE.
# df.min() would take the minimum of each column independently and therefore always
# return the smallest eta tried, regardless of its score.
df_min = df.loc[df['Score'].idxmin()]
best_eta = df_min['Value']

In [None]:
# For every tree depth from 1 to 20, fit with the chosen eta and record the mean
# 5-fold cross-validated MAE.
max_depth_list = range(1, 21, 1)


def _mean_cv_mae_for_depth(depth):
    """Return the mean 5-fold CV MAE of an XGBRegressor with ``max_depth=depth``."""
    regressor = XGBRegressor(random_state=0, booster='gbtree', eta=best_eta, max_depth=depth)
    fold_scores = cross_val_score(
        regressor,
        model_experimentation_dataset_x,
        model_experimentation_dataset_y,
        cv=5,
        scoring='neg_mean_absolute_error',
    )
    # The scorer returns negated MAE, so flip the sign before averaging.
    return (-1 * fold_scores).mean()


value_1 = list(max_depth_list)
score_1 = [_mean_cv_mae_for_depth(depth) for depth in value_1]

df_1 = pd.DataFrame({'Value': value_1, 'Score': score_1})

In [None]:
# Show the mean cross-validated MAE recorded for each max_depth value.
print(df_1)

In [None]:
# Select the max_depth from the row with the lowest mean CV MAE.
# df_1.min() would take the minimum of each column independently and therefore
# always return the smallest depth tried, regardless of its score.
df_1_min = df_1.loc[df_1['Score'].idxmin()]
# The row is a float Series (int and float columns mixed), so cast back to int.
best_max_depth = int(df_1_min['Value'])

In [None]:
# Target for the final model.
y_train = train_data['SalePrice']

# One-hot encode the selected feature columns of each dataset separately.
x_train = pd.get_dummies(train_data[feature_selection])
x_test = pd.get_dummies(test_data[feature_selection])

In [None]:
# Dummy columns present in the training matrix but absent from the test matrix.
missing_cols = set(x_train.columns) - set(x_test.columns)

# Give x_test exactly the training columns, in the training order: columns missing
# from the test set are created and filled with 0, and test-only columns are dropped.
# A single reindex avoids inserting columns one by one into the frame.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

In [None]:
# Final model: same booster and seed as in the experiments, using the selected eta
# and max_depth. (An earlier fixed setting was eta=0.1, max_depth=8.)
final_params = {
    'random_state': 0,
    'booster': 'gbtree',
    'eta': best_eta,
    'max_depth': best_max_depth,
}
model = XGBRegressor(**final_params)
model.fit(x_train, y_train)

# Predict on the aligned test matrix and store the result on test_data.
prediction = model.predict(x_test)
test_data['SalePrice'] = prediction

In [None]:
# Keep only the id and the predicted price, ordered by id, and write them to
# submission.csv without the DataFrame index.
submission_features = ['Id', 'SalePrice']
submission_data = test_data.loc[:, submission_features].sort_values(by='Id')
submission_data.to_csv('submission.csv', index=False)