In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(current_dir, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<p style="font-size:20px; color:#bd431e; border-bottom:1px solid #bd431e;padding-bottom:10px">Table Of Contents</p>

<p>
    <a style="font-size:17px;color:black;text-decoration:none;" href="#INTRODUCTION">1. INTRODUCTION</a>
</p>
<p>
<a style="font-size:17px;color:black;text-decoration:none;" href="#IMPORTING_AND_CLEANING">2. FEATURE ENGINEERING</a>
</p>

<p style="font-size:20px; color:#bd431e; border-bottom:1px solid #bd431e;padding-bottom:10px" id="INTRODUCTION">1. INTRODUCTION</p>

<p style="font-size:16px;text-align:justify;">
    Analysis of the <span><a href="https://www.kaggle.com/c/house-prices-advanced-regression-techniques">advanced regression house prices</a></span> dataset and building Machine Learning (ML) models for predicting the <span style="font-style:italic;font-weight:bold;">Sale Price</span> of a house. This is a supervised regression ML problem. Let's start by importing the necessary Python libraries.
</p>

In [None]:
# Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# Sklearn and other libraries for building ML models
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Feature selection
from sklearn.feature_selection import RFE

# Various notebook related imports
import warnings
warnings.filterwarnings('ignore')

<p style="font-size:20px; color:#bd431e; border-bottom:1px solid #bd431e;padding-bottom:10px" id="IMPORTING_AND_CLEANING">2. FEATURE ENGINEERING</p>

<p style="font-size:16px;text-align:justify;">
    In this section, we import and clean the data to prepare it for use with the ML models that we will develop later on. The dataset is already split into <span style="font-style:italic; font-weight:bold;">train</span> and <span style="font-style:italic; font-weight:bold;">test</span> datasets. The test dataset does not contain the <span style="font-style:italic; font-weight:bold;">Sale Price</span> column that is the target that we have to predict.
</p>

In [None]:
# Read the train split and the test split (in that order) into Pandas DataFrames.
train_data, test_data = map(
    pd.read_csv,
    (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
    ),
)

In [None]:
# DataFrame.shape is (rows, columns); unpack it once per split before reporting.
train_rows, train_cols = train_data.shape
test_rows, test_cols = test_data.shape
print(f"The train dataset has {train_cols} columns and {train_rows} records (rows).")
print(f"The test dataset has {test_cols} columns and {test_rows} records (rows).")

<p style="font-size:16px;text-align:justify;">
    The train and the test datasets are almost of equal size. Further, the train dataset contains both numerical and categorical data. We need to convert the categorical data to numerical before proceeding with the ML models.  
</p>

In [None]:
# Preview the first rows of the training data (head() shows 5 rows by default).
train_data.head()

In [None]:
# Display the name of every column in the training data.
train_data.columns

In [None]:
# The first column is skipped and the last column is the prediction target;
# everything in between is used as a model input.
all_columns = train_data.columns
features, target = all_columns[1:-1], all_columns[-1]

<p style="font-size:18px; color:#de5c35;">2.1 Checking for nan values</p>

<p style="font-size:16px;text-align:justify;">
    Let's check for <span
style="font-style:italic; font-weight:bold;">nan values</span> in the <span style="font-style:italic; font-weight:bold;">train</span> dataset.
</p>

In [None]:
def plotly_bar_plot(x, y, x_label=None, y_label=None, title=None, threshold=None):
    """
    Create plotly bar plots with a consistent style.

    Inputs
    ------
    x: The x-axis values
    y: The y-axis values
    x_label: The label of the x axis. Defaults to "Feature" when None.
    y_label: The label of the y axis. Defaults to "Number of null values" when None.
    title: The title of the plot. Defaults to "Columns with nan values" when None.
    threshold: Currently unused; accepted so that existing callers keep working.

    Returns
    -------
    None
    """
    # Fall back to the labels this helper has always used, so calls that do not
    # pass any label render exactly as before.
    if x_label is None:
        x_label = "Feature"
    if y_label is None:
        y_label = "Number of null values"
    if title is None:
        title = "Columns with nan values"

    # Both axes share the same frame styling; only the axis title differs.
    axis_style = dict(showline=True, showgrid=False, linewidth=1, linecolor='#d6d6d6', mirror=True)

    fig = go.Figure([
        go.Bar(x=x, y=y, marker_color="red", marker_line_color="black"),
    ])
    fig.update_xaxes(title=x_label, **axis_style)
    fig.update_yaxes(title=y_label, **axis_style)
    fig.update_layout(template="plotly_white", width=1200, height=500, title=title)
    fig.show()

In [None]:
# Sort once and take the bar labels from the sorted Series itself: passing the
# unsorted `features` as x would pair every bar with the wrong feature name.
# The plotted values are the fraction of missing entries per feature.
nan_fraction = train_data[features].isnull().mean().sort_values(ascending=False)
plotly_bar_plot(x=nan_fraction.index, y=nan_fraction)

In [None]:
def drop_nan_cols(df, threshold):
    """
    Remove, in place, the columns of a Pandas DataFrame whose percentage of nan values reaches the threshold.

    Input
    -----
    df: (Pandas DataFrame) The DataFrame whose columns are checked for nan values. It is modified in place.
    threshold: (int) The percentage of nan values at or above which (>=) a column is dropped.
                     e.g 50 means that columns with 50% or more nan values are dropped

    A message is printed for every dropped column. Nothing is returned.
    """
    n_rows = df.shape[0]
    # Count the nan values of every column once, before any column is removed.
    nan_counts = df.isnull().sum()
    for col_name, nan_count in nan_counts.items():
        null_perc = (nan_count / n_rows) * 100
        if null_perc >= threshold:
            print(f"Dropping column with name {col_name} - Null value percentage: {null_perc} %")
            df.drop(col_name, axis='columns', inplace=True)

In [None]:
# Remove (in place) every training column in which 50% or more of the values are nan
drop_nan_cols(df=train_data, threshold=50)
# The column set has changed, so rebuild the feature list: every column except the first and the last
features = train_data.columns[1:-1]

In [None]:
# Sort once and take the bar labels from the sorted Series itself: passing the
# unsorted `features` as x would pair every bar with the wrong feature name.
# The plotted values are the fraction of missing entries per remaining feature.
nan_fraction = train_data[features].isnull().mean().sort_values(ascending=False)
plotly_bar_plot(x=nan_fraction.index, y=nan_fraction)

<p style="font-size:16px;text-align:justify;">
    Removing from the list of the train_data columns the <span style="font-weight:bold;">Id</span> and the <span style="font-weight:bold;">SalePrice</span> we get the features list of the train dataset.
</p>

<p style="font-size:18px; color:#de5c35;">2.2 Converting categorical values to numerical</p>

<p style="font-size:16px;text-align:justify;">
    In this section we convert all categorical data to numerical in both the train and test datasets. 
</p>

In [None]:
def cat_to_num(df1, df2=None, nan_replacement_value=None):
    """
    Convert the categorical columns of one or two DataFrames to integer codes, in place.

    Every nan is first replaced by the nan replacement value. A column is then
    treated as categorical when at least one of its values, in any of the given
    DataFrames, cannot be parsed as a number. Each distinct value of such a
    column is replaced by an integer code.

    Codes are taken from the sorted set of values seen across all given
    DataFrames, so the same category always receives the same integer in df1
    and df2, and the result is the same on every run. Converted columns keep
    their position in the DataFrame.

    Inputs
    ------
    df1: (Pandas DataFrame) The DataFrame to convert. It is modified in place.
    df2: (Pandas DataFrame, optional) A second DataFrame converted with the same
         category-to-integer mapping as df1 (e.g. the test split). It is modified in place.
    nan_replacement_value: The value written in place of nan. Defaults to 0 when None.

    Returns
    -------
    None
    """
    fill_value = 0 if nan_replacement_value is None else nan_replacement_value
    frames = [df1] if df2 is None else [df1, df2]

    for frame in frames:
        frame.fillna(fill_value, inplace=True)

    # Visit every column of every frame once, in first-seen order. The frames
    # do not need to have identical columns.
    all_columns = list(dict.fromkeys(col for frame in frames for col in frame.columns))

    for col_name in all_columns:
        holders = [frame for frame in frames if col_name in frame.columns]

        # The column is numeric only if it is fully numeric in every frame that has it.
        is_numeric = all(
            pd.to_numeric(frame[col_name], errors='coerce').notnull().all()
            for frame in holders
        )
        if is_numeric:
            continue

        observed = set()
        for frame in holders:
            observed.update(frame[col_name])

        # The fill value (e.g. the int 0) can sit next to strings, so sort on the
        # text form, with the type name as a tie-breaker, to get a stable order.
        possible_classes = sorted(observed, key=lambda value: (str(value), type(value).__name__))
        class_codes = {value: code for code, value in enumerate(possible_classes)}

        for frame in holders:
            frame[col_name] = [class_codes[value] for value in frame[col_name]]


# Encode both splits together so that a category maps to the same integer in
# the train data and in the test data.
cat_to_num(train_data, test_data)

In [None]:
# Preview the first rows of the training data after the categorical-to-numerical conversion.
train_data.head()

<p style="font-size:20px; color:#bd431e; border-bottom:1px solid #bd431e;padding-bottom:10px">3. FEATURE SELECTION</p>

<p style="font-size:18px; color:#de5c35;">3.1 Recursive Feature Elimination (RFE)</p>

In [None]:
# Recursive feature elimination: repeatedly fit a 50-tree XGBoost regressor and
# discard one feature per step until 30 features remain.
rfe = RFE(XGBRegressor(n_estimators=50), n_features_to_select=30, step=1)
selector = rfe.fit(train_data[features], train_data['SalePrice'])
# support_ is a boolean mask aligned with `features`; keep the names flagged True.
reduced_features = [name for name, keep in zip(features, selector.support_) if keep]
print(reduced_features)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor

# SalePrice is a continuous target, so the feature importances come from the
# regressor variant; a classifier would treat every distinct price as a class.
model = ExtraTreesRegressor(n_estimators=20)
model.fit(train_data[features], train_data['SalePrice'])
# One importance value per entry of `features`, in the same order.
print(model.feature_importances_)

<p style="font-size:20px; color:#bd431e; border-bottom:1px solid #bd431e;padding-bottom:10px">4. MODELING</p>

In [None]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error


# SalePrice is continuous, so every baseline has to be a regressor: a
# classifier would treat each distinct price as its own class.
# The linear model is fitted on log1p(SalePrice) and its predictions are mapped
# back with expm1. The 'neg_mean_squared_log_error' scorer rejects negative
# predictions, which a plain linear fit on the raw prices can produce.
LR = TransformedTargetRegressor(regressor=LinearRegression(), func=np.log1p, inverse_func=np.expm1)
RF = RandomForestRegressor(max_depth=2)

# Each score array holds one value per fold. The scorer is the negated mean
# squared log error, so values closer to 0 are better.
print("Linear Regression - Full Features")
scores_LR_full = cross_val_score(LR, train_data[features],train_data[target], scoring='neg_mean_squared_log_error', cv=5)
print(scores_LR_full)

# print("Linear Regression - Reduced Features")
# scores_LR_reduced = cross_val_score(LR, train_data[reduced_features],train_data[target], scoring='neg_mean_squared_log_error', cv=5)
# print(scores_LR_reduced)

print("Random Forest - Full Features")
scores_RF_full = cross_val_score(RF, train_data[features],train_data[target], scoring='neg_mean_squared_log_error', cv=5)
print(scores_RF_full)

# print("Random Forest - Reduced Features")
# scores_RF_reduced = cross_val_score(RF, train_data[reduced_features],train_data[target], scoring='neg_mean_squared_log_error', cv=5)
# print(scores_RF_reduced)

print("XGBoost - Full Features")
scores_XGBoost_full = cross_val_score(XGBRegressor(n_estimators=100), train_data[features],train_data[target], scoring='neg_mean_squared_log_error', cv=5)
print(scores_XGBoost_full)

# print("XGBoost - Reduced Features")
# scores_XGBoost_reduced = cross_val_score(XGBRegressor(n_estimators=100), train_data[reduced_features],train_data[target], scoring='neg_mean_squared_log_error', cv=5)
# print(scores_XGBoost_reduced)

print("CatBoost - Full Features")
scores_CatBoost_full = cross_val_score(CatBoostRegressor(iterations=2, learning_rate=1, depth=2), train_data[features],train_data[target], scoring='neg_mean_squared_log_error', cv=5)
print(scores_CatBoost_full)

In [None]:
# Fit the random forest on the full training data, then predict the test data
RF_model = RF.fit(X=train_data[features], y=train_data[target])
RF_pred = RF_model.predict(X=test_data[features])

In [None]:
from xgboost import XGBRegressor
# 10-fold cross-validation of a 50-tree XGBoost regressor on all features;
# the fold scores are shown as the cell output.
scores = cross_val_score(
    estimator=XGBRegressor(n_estimators=50),
    X=train_data[features],
    y=train_data[target],
    scoring='neg_mean_squared_log_error',
    cv=10,
)
scores

In [None]:
# Train an XGBoost regressor with default settings on the RFE-selected features,
# then predict the sale price of every test record.
XGBReg = XGBRegressor()
XGBReg.fit(train_data[reduced_features], train_data[target])
XGB_pred = XGBReg.predict(test_data[reduced_features])


In [None]:
# Build the submission table: one row per test record with its predicted price.
# The header is "Id" (capital I) to match the Id column of the test data and
# the competition's submission format, instead of the lowercase "id".
df_submit = pd.DataFrame({"Id": test_data["Id"], "SalePrice": XGB_pred})

In [None]:
# Write the submission to the working directory, without the DataFrame index column.
df_submit.to_csv(path_or_buf="sub_XGB.csv", index=False)