In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

In [None]:
train_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

# Stack train on top of test. ignore_index=True gives the combined frame a
# unique 0..n-1 index straight away instead of repeating each file's own
# row labels, so this cell does not depend on a later in-place index reset.
concat_df = pd.concat([train_df, test_df], ignore_index=True)

#### Set the max_columns to 81 so that all the columns are visible

In [None]:
# Use the fully-qualified option keys: the short forms 'max_columns' and
# 'max_rows' are treated as patterns and can match more than one option
# (e.g. the styler.render.* options), which makes set_option raise.
pd.set_option('display.max_columns', 81)
pd.set_option('display.max_rows', 50)

# **Exploratory Data Analysis**

In [None]:
# Replace the index with a fresh 0..n-1 range and discard the old labels.
concat_df = concat_df.reset_index(drop=True)

In [None]:
concat_df

In [None]:
f'concat_df dimension: {concat_df.shape}'

#### Count the na values

In [None]:
# Per-column count of missing values, keeping only columns that have any.
na_counts = concat_df.isna().sum()
missing_df = na_counts[na_counts > 0].to_frame('Missing Count')
missing_df

#### Visualize the na values

In [None]:
# Render the boolean missing-value mask (one cell per row/column pair) as an image.
fig = px.imshow(concat_df.isna())
fig.show()

#### Check duplicated data

In [None]:
f'Total duplicated data: {concat_df.duplicated().sum()}'

#### Print all categorical and numeric columns

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numeric.
cat_cols = list(concat_df.select_dtypes(include=object).columns)
num_cols = list(concat_df.select_dtypes(exclude=object).columns)

print(f'Categorical Features --- ({len(cat_cols)})\n{cat_cols}\n')
print(f'Numeric Features --- ({len(num_cols)})\n{num_cols}')

#### General information about the data

In [None]:
concat_df.info()

In [None]:
concat_df.describe()

# **Visualization**

### Columns Correlation

In [None]:
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas versions where DataFrame.corr() raises on
# object columns instead of silently skipping them.
corr = concat_df.select_dtypes(include='number').corr()

fig = px.imshow(corr)
fig.show()

In [None]:
# Histogram of the target column.
# NOTE(review): concat_df also holds the test.csv rows, which presumably have no
# SalePrice (NaN) -- confirm they are simply left out of the plot.
fig = px.histogram(concat_df.SalePrice, title='House Price Distribution')
fig.show()

In [None]:
# Horizontal box plot of the target column (SalePrice on the x axis).
fig = px.box(x=concat_df.SalePrice, title='House Price Distribution')
fig.show()

# **Feature Engineering**

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [None]:
class FE:
    """Small feature-engineering pipeline for a mixed numeric/categorical frame.

    The steps are meant to be called in this order: ``drop_columns``,
    ``fill_na_cat``, ``impute``, ``convert_values``. Every step stores its
    result in ``self.df`` and returns it. Each step builds a new frame, so the
    DataFrame handed to the constructor is never modified.
    """

    def __init__(self, df):
        """Store ``df`` as the starting point of the pipeline."""
        self.df = df

    def drop_columns(self, threshold=0.5):
        """Drop every column whose share of missing values exceeds ``threshold``.

        Args:
            threshold: Fraction of rows (0-1). A column is dropped when its
                number of missing values is strictly greater than
                ``len(df) * threshold``. The default drops columns that are
                more than half empty.

        Returns:
            The frame without the dropped columns (also stored in ``self.df``).
        """
        missing_counts = self.df.isna().sum()
        too_sparse = missing_counts[missing_counts > len(self.df) * threshold]
        self.df = self.df.drop(columns=too_sparse.index.tolist())
        return self.df

    def fill_na_cat(self):
        """Fill missing values of object columns with the column's mode.

        Columns that contain no non-missing value at all have no mode and are
        left unchanged instead of raising.

        Returns:
            The filled frame (also stored in ``self.df``).
        """
        fill_values = {}
        for column in self.df.select_dtypes(object).columns:
            modes = self.df[column].mode()
            if not modes.empty:
                # Several values can tie for most frequent; take the first one.
                fill_values[column] = modes.iloc[0]

        if fill_values:
            # One frame-level fillna instead of chained in-place calls on
            # column selections, which may operate on a temporary copy.
            self.df = self.df.fillna(value=fill_values)
        return self.df

    def impute(self, n_neighbors=2, target='SalePrice'):
        """Impute missing numeric values with a KNN imputer.

        The ``target`` column is kept out of the imputation (it is neither
        used as a feature nor filled) and re-attached afterwards. The column
        order of the result is: imputed numeric columns, object columns, then
        the target if it is present among the numeric columns.

        Args:
            n_neighbors: Number of neighbours used by ``KNNImputer``.
            target: Name of the column to exclude from imputation. When the
                frame has no such numeric column, all numeric columns are
                imputed.

        Returns:
            The imputed frame (also stored in ``self.df``).
        """
        numeric_df = self.df.select_dtypes(exclude=object)
        categorical_df = self.df.select_dtypes(object)

        # Explicit membership test instead of a bare try/except: a real
        # imputation failure must surface rather than trigger a silent retry.
        has_target = target in numeric_df.columns
        features_df = numeric_df.drop(columns=target) if has_target else numeric_df

        if features_df.empty:
            # No rows or no numeric feature columns: nothing to impute.
            imputed_df = features_df
        else:
            imputer = KNNImputer(n_neighbors=n_neighbors)
            # Keep the original index so the pieces line up row by row when
            # they are concatenated, whatever index the frame carries.
            imputed_df = pd.DataFrame(
                imputer.fit_transform(features_df),
                columns=features_df.columns,
                index=features_df.index,
            )

        parts = [imputed_df, categorical_df]
        if has_target:
            parts.append(self.df[target])

        self.df = pd.concat(parts, axis=1)
        return self.df

    def convert_values(self):
        """Label-encode every object column into integer codes.

        A separate encoder is fitted per column on the values present in this
        frame, so codes are only comparable between rows that were encoded in
        the same call.

        Returns:
            The encoded frame (also stored in ``self.df``).
        """
        encoded_df = self.df.copy()
        for column in encoded_df.select_dtypes(object).columns:
            encoded_df[column] = LabelEncoder().fit_transform(encoded_df[column])

        self.df = encoded_df
        return self.df
        
    
# Run the cleaning steps, in order, on a copy of the combined frame so that
# concat_df itself stays as loaded.
fe = FE(concat_df.copy())
fe.drop_columns()
fe.fill_na_cat()
fe.impute()
# convert_values() returns the final frame held by the pipeline.
cleaned_df = fe.convert_values()

In [None]:
cleaned_df.head()

In [None]:
# Same missing-value image as before, now for the cleaned frame.
# SalePrice is excluded from imputation, so it can still contain NaN here.
fig = px.imshow(cleaned_df.isna(), title='Missing Values')
fig.show()

# **Data Preprocessing**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep only the rows that have a target value.
has_price = cleaned_df.SalePrice.notna()
labelled_df = cleaned_df[has_price]

# Features are every column except the first and the last one; the last
# column is the target.
X = labelled_df[cleaned_df.columns[1:-1]]
y = labelled_df[cleaned_df.columns[-1]]

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=1)

# **Models Creation**

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error

In [None]:
class Models:
    """Fit a regressor on the hold-out split and keep a shared scoreboard.

    The scoreboard is a set of class-level lists that are aligned by position:
    entry ``i`` of every list belongs to ``model_names[i]``. A model is added
    to the scoreboard only once its scores exist, and scoring the same model
    name again replaces its previous entry, so the lists always have equal
    length even when a cell fails or is re-run.
    """

    r2_scores = list()
    rmsle_scores = list()
    rmse_scores = list()
    mae_scores = list()
    model_names = list()

    def __init__(self, model, model_name):
        """Wrap ``model`` (an unfitted estimator) under the label ``model_name``."""
        self.model = model
        self.model_name = model_name

    def predict(self):
        """Fit on the train split, score on the test split, record and print.

        Reads the module-level ``train_x``, ``train_y``, ``test_x`` and
        ``test_y``. RMSLE is recorded as NaN when the metric cannot be
        computed for this model's predictions.
        """
        model = self.model
        model.fit(train_x, train_y)
        prediction = model.predict(test_x)

        r2 = r2_score(test_y, prediction)
        rmse = np.sqrt(mean_squared_error(test_y, prediction))
        mae = mean_absolute_error(test_y, prediction)
        try:
            rmsle = np.sqrt(mean_squared_log_error(test_y, prediction))
        except ValueError:
            # The log error rejects out-of-range (negative) values, which an
            # unconstrained regressor can predict. Keep the other metrics and
            # mark this one as unavailable instead of aborting.
            rmsle = np.nan

        self._record(r2, rmse, mae, rmsle)
        self.print_result(r2, rmse, mae, rmsle)

    def _record(self, r2, rmse, mae, rmsle):
        """Store one set of scores on the scoreboard, replacing a same-named entry."""
        boards = (Models.r2_scores, Models.rmse_scores, Models.mae_scores, Models.rmsle_scores)
        values = (r2, rmse, mae, rmsle)

        if self.model_name in Models.model_names:
            position = Models.model_names.index(self.model_name)
            for board, value in zip(boards, values):
                board[position] = value
        else:
            Models.model_names.append(self.model_name)
            for board, value in zip(boards, values):
                board.append(value)

    def print_result(self, r2, rmse, mae, rmsle):
        """Print the four metrics, one per line."""
        print(f'R2: {r2}')
        print(f'RMSE: {rmse}')
        print(f'MAE: {mae}')
        print(f'RMSLE: {rmsle}')

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings: fit on the train split, then print
# and record its hold-out metrics.
model = Models(LinearRegression(), 'Linear Regression')
model.predict()

### Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with its default hyper-parameters, scored the same way.
model = Models(Ridge(), 'Ridge Regression')
model.predict()

### Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with its default hyper-parameters, scored the same way.
model = Models(Lasso(), 'Lasso Regression')
model.predict()

### Support Vector Regressor

In [None]:
from sklearn.svm import SVR

# Support vector regressor with its default hyper-parameters.
# NOTE(review): the features are passed in unscaled; SVR is usually sensitive
# to feature scale, so consider standardising before reading much into its score.
model = Models(SVR(), 'Support Vector Regressor')
model.predict()

### XGB Regressor

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees (XGBoost) with default hyper-parameters, scored the same way.
model = Models(XGBRegressor(), 'XGB Regressor')
model.predict()

# **Models Performance**

In [None]:
# Collect the recorded scores into one table, best (lowest) RMSE first.
performance_df = (
    pd.DataFrame({
        'Model': Models.model_names,
        'R2': Models.r2_scores,
        'RMSE': Models.rmse_scores,
        'MAE': Models.mae_scores,
        'RMSLE': Models.rmsle_scores,
    })
    .sort_values(by='RMSE')
    .reset_index(drop=True)
)
performance_df

In [None]:
# One bar chart per metric on a 2x2 grid: (metric column, subplot row, subplot column).
metric_grid = (
    ('RMSE', 1, 1),
    ('R2', 1, 2),
    ('MAE', 2, 1),
    ('RMSLE', 2, 2),
)

fig = make_subplots(rows=2, cols=2, vertical_spacing=0.3, subplot_titles=('RMSE', 'R2', 'MAE', 'RMSLE'))

for metric, grid_row, grid_col in metric_grid:
    fig.add_trace(
        go.Bar(y=performance_df[metric], x=performance_df.Model),
        row=grid_row, col=grid_col
    )

fig.update_layout(height=700, title_text="Models Performances")

# **Feature Importance**
Since the XGB Regressor has the best performance, we check the feature importances of the XGB Regressor model.

In [None]:
# Refit a default XGB regressor on the train split to read its importances.
xgb = XGBRegressor()
xgb.fit(train_x, train_y)

features = list(train_x.columns)
importances = xgb.feature_importances_.tolist()

# fi_df is kept sorted ascending; the displayed copy is sorted descending.
importance_table = pd.DataFrame({'Feature': features, 'Importance': importances})
fi_df = importance_table.sort_values(by='Importance')
fi_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

In [None]:
# Bar chart of the importances, using fi_df in its ascending sort order.
fig = px.bar(fi_df, x='Importance', y='Feature', title='Feature Importances', height=800)
fig.show()

# **Predicting Test Data**

In [None]:
from sklearn.preprocessing import StandardScaler

#### Clean test data

In [None]:
# Run the cleaning steps on a copy of the raw test frame.
fe = FE(test_df.copy())
fe.drop_columns()
fe.fill_na_cat()
fe.impute()
test_df2 = fe.convert_values()

# Put back the ids from the raw frame as integers.
test_df2['Id'] = test_df['Id'].astype('int64')

#### Clean train data

In [None]:
# Run the cleaning steps on a copy of the raw train frame.
fe = FE(train_df.copy())
fe.drop_columns()
fe.fill_na_cat()
fe.impute()
train_df2 = fe.convert_values()

# Cast the ids of the cleaned frame back to integers.
train_df2['Id'] = train_df2['Id'].astype('int64')

#### Train and test data

In [None]:
# Build the final matrices from cleaned_df, the frame that was imputed and
# label-encoded on train + test together. train_df2 and test_df2 are encoded
# by separate pipeline runs, so the same category can receive different integer
# codes in each, and the two runs can drop different columns (presumably why
# FireplaceQu had to be removed by hand before).
is_labelled = cleaned_df.SalePrice.notna()
feature_columns = cleaned_df.columns.drop(['Id', 'SalePrice'])

train_x = cleaned_df.loc[is_labelled, feature_columns]
train_y = cleaned_df.loc[is_labelled, 'SalePrice']

# Rows without a SalePrice are the test.csv rows, still in file order, so the
# predictions line up one-to-one with the test ids.
test_x = cleaned_df.loc[~is_labelled, feature_columns]
assert len(test_x) == len(test_df), 'unexpected number of unlabelled rows'

#### Creating model

In [None]:
# Fit a default XGB regressor on the training matrix and predict SalePrice
# for the test matrix.
xgb = XGBRegressor()
xgb.fit(train_x, train_y)
prediction = xgb.predict(test_x)

#### Result df

In [None]:
# Pair every test id with its predicted price.
result_df = pd.DataFrame({
    'Id': test_df2['Id'].to_numpy(),
    'SalePrice': prediction,
})
result_df

#### Submission

In [None]:
# Write the predictions to submission.csv in the working directory, without
# the DataFrame index column.
result_df.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")