In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# magic word for producing visualizations in notebook
%matplotlib inline
# Pick the seaborn look in a version-tolerant way: matplotlib 3.6 renamed the
# bundled 'seaborn' style to 'seaborn-v0_8' and newer releases no longer accept the old name.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# let matplotlib adjust the subplot spacing automatically on every figure
plt.rc("figure", autolayout=True)
# bold, larger axis labels and titles for all the plots in the notebook
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# walk the read-only input directory and print the full path of every file found
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### HOUSE PRICES PREDICTION
![Main pic](https://www.yourmoney.com/wp-content/uploads/sites/3/2022/02/house-prices-scaled.jpg)

### The Machine Learning Project Checklist
1. Frame the problem
2. Get the data
3. Explore the data
4. Prepare the data
5. Model the data
6. Fine-tune the models
7. Present the solution
8. Launch the ML system



## 1. Frame the problem
#### Goal
It is your job to predict the sales price for each house. For each Id in the test set, you must predict the value of the SalePrice variable. 

#### Metric
Submissions are evaluated on RMSLE (Root Mean Squared Log Error). Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.
 
 
## 2. Get the data
   Kaggle has already done this step for us and split the data into train and test dataframes.
    
    
## 3. Explore the data (Train data)
   we must create a test set, put it aside, and never look at it.

In [None]:
# load the competition training set into a DataFrame
train_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
)

# preview the first 5 rows of the train dataset
train_df.head(n=5)

In [None]:
# Print a concise summary of the train DataFrame.
train_df.info()

In [None]:
# total number of missing cells across the whole train df
train_df.isnull().to_numpy().sum()

In [None]:
# NaN count per column, keeping only the columns that have at least one NaN
train_df.isna().sum().loc[lambda nan_counts: nan_counts > 0]

In [None]:
# histogram of the per-column NaN counts, labelled through the returned axes
ax = train_df.isna().sum().plot.hist()
ax.set_xlabel("Number of NaNs")
ax.set_title('NaNs distribution', fontsize=18);

Find out what proportion of their values is missing for the outlier columns, i.e. the columns with an unusually large number of NaNs.

In [None]:
# we will remove the columns that have more than `threshold` NaNs
threshold = 800
# threshold / number of rows is the minimum share of missing values such a column has
print(f"Columns with more than {threshold} NaNs have over {round((threshold/train_df.shape[0]) * 100)}% of their values missing.")

In [None]:
# columns whose NaN count exceeds the threshold; these are the ones to remove
outliers = train_df.isna().sum().loc[lambda nan_counts: nan_counts > threshold]
outliers

In [None]:
# count the duplicated rows in the train df (a result of 0 means there are none)
train_df.duplicated().sum()

In [None]:
# inspect the descriptive statistics (describe() summarises the numeric columns by default)
train_df.describe()

We have lots of NaNs, and the values span very different scales.

In [None]:
# plot the distribution of the numeric features: one 50-bin histogram per column
# on a single 20x16 figure; the trailing ';' hides the returned axes in the cell output
train_df.hist(bins=50, figsize=(20,16));

Most of the data are skewed, and if a feature has a skewed distribution, applying a logarithm can help normalize it.  
We should try taking the log of the skewed numeric features.

In [None]:
# plot the correlation between the numeric features

# make a bigger plot
# plt.figure(figsize=(19,15))

# Mask the upper part of the heatmap
# mask = np.triu(train_df.corr())

# Select the numeric columns explicitly: since pandas 2.0 DataFrame.corr() no
# longer drops non-numeric columns silently and raises on the string features.
numeric_corr = train_df.select_dtypes(include='number').corr()

# plot the heatmap using Seaborn
sns.heatmap(numeric_corr, annot=False, cmap='icefire');

In [None]:
# find out the top correlated features
# (numeric columns only: pandas >= 2.0 raises if string columns reach corr())
(
    train_df.select_dtypes(include='number')
    .corr()
    .unstack()                      # one row per (feature, feature) pair
    .sort_values(ascending=False)
    .drop_duplicates()              # each pair appears twice in the symmetric matrix
    [:24]
)

We should try removing multicollinear features (i.e. drop strongly correlated columns).

In [None]:
# find out the correlation between the features and the Target
# (numeric columns only: pandas >= 2.0 raises if string columns reach corrwith())
train_df.select_dtypes(include='number').corrwith(train_df['SalePrice']).sort_values(ascending=False)

In [None]:
# plot the correlation with the target
# (numeric columns only: pandas >= 2.0 raises if string columns reach corrwith())
target_corr = train_df.select_dtypes(include='number').corrwith(train_df['SalePrice'])
# [1:] skips the first entry after sorting, which is SalePrice's correlation with itself
target_corr.sort_values(ascending=False)[1:].plot(kind='bar')
# plot a vertical line on where we want to remove the non correlated columns
plt.vlines(26.5, -0.1, 0.8, colors='red')
plt.title("Correlations with the Target 'SalePrice'");

We should try removing the columns that are not correlated with the target.

## 4. Prepare the data

You should always create a test set and set it aside before
inspecting the data closely.

In [None]:
# separate the target column (y) from the remaining feature columns (X)
y = train_df['SalePrice']
X = train_df.drop('SalePrice', axis=1)

In [None]:
# save the columns we want to remove to use this variable later in the pipeline

# Correlation of every numeric feature with the target, computed once.
# Numeric columns are selected explicitly because pandas >= 2.0 raises when
# string columns reach corrwith().
target_corr = train_df.select_dtypes(include='number').corrwith(train_df['SalePrice'])
# features that are negatively correlated with SalePrice
less_than_0_corr = target_corr[target_corr < 0].index.to_list()
# drop those together with the mostly-missing columns found earlier
cols_to_remove = list(outliers.index) + less_than_0_corr

### the numeric values

In [None]:
# keep only the numeric feature columns
num_df = X.select_dtypes(include=['number'])

# preview the first 5 rows of the numeric features
num_df.head(n=5)

Now let's build a pipeline for preprocessing the numerical features/attributes:

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# numeric preprocessing: impute, then scale
num_pipeline = Pipeline(steps=[
    # the median is the safer replacement for missing values when the data is
    # skewed; add_indicator=True also appends missing-value indicator columns
    ('imputer', SimpleImputer(strategy="median", add_indicator=True)),
    # MinMaxScaler is a reasonable choice when the distribution is not Gaussian
    ('scaler', MinMaxScaler()),
])

### Categorical values

In [None]:
# keep only the categorical (object dtype) feature columns
cat_df = X.select_dtypes(include=['object'])

# preview the first 5 rows of the categorical features
cat_df.head(n=5)

Now let's build a pipeline for preprocessing the categorical features/attributes:

In [None]:
from sklearn.preprocessing import OrdinalEncoder

cat_pipeline = Pipeline([
        # we will fill the NaNs with the mode
        ('imputer', SimpleImputer(strategy="most_frequent", add_indicator=True)),
        # Encode every category as an integer code. The codes are assigned by
        # the encoder, not by any quality ranking of the categories.
        # Categories that were not seen during fit (e.g. ones that only occur
        # in test.csv) are mapped to -1 instead of making transform() raise.
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ])

### Now let's build a pipeline for preprocessing all the attributes:

In [None]:
from sklearn.compose import ColumnTransformer

# keep only the wanted columns
# Filter with list comprehensions so the original column order is preserved:
# building the lists from a set difference gives an order that can change
# between kernel sessions (string hashing is randomised per process), which
# changes the feature order seen by the models and breaks reproducibility.
removed_cols = set(cols_to_remove)
num_attribs = [col for col in num_df.columns if col not in removed_cols]
cat_attribs = [col for col in cat_df.columns if col not in removed_cols]

# combine the numeric & categorical pipelines
preprocessor = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ], 
    remainder='drop' # the remainder features will be dropped [default]
)

# prepare the df for the ML models by calling the preprocessor
X_prepared = preprocessor.fit_transform(X)
# inspect the number of rows & columns of the prepared df
X_prepared.shape

## 5. Model the data

Training and Evaluating on the Training Set

In [None]:
# fixed seed handed to the models below, for reproducibility
random_state = 10    # 10 for MESSI

#### LinearRegression model

In [None]:
from sklearn.linear_model import LinearRegression

# The competition metric is RMSLE (Root Mean Squared Log Error),
# so the models are trained on the log of the sale price
log_y = np.log(y)

lin_reg = LinearRegression()
lin_reg.fit(X=X_prepared, y=log_y)

In [None]:
from sklearn.metrics import mean_squared_error

# create a function that scores the model for later usage
def score_model(model, X=None, y=None):
    """Return the RMSE between the model's predictions and the target.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : features to predict on. Defaults to the notebook-level ``X_prepared``.
    y : target values to compare against. Defaults to the notebook-level ``log_y``.

    Returns
    -------
    The square root of the mean squared error. With the defaults this is
    measured on the prepared training data against the log prices, i.e. a
    training-set score rather than an estimate of generalisation error.
    """
    # fall back to the prepared training data so that existing
    # score_model(model) calls keep behaving exactly as before
    if X is None:
        X = X_prepared
    if y is None:
        y = log_y
    y_pred = model.predict(X)
    model_mse = mean_squared_error(y, y_pred)
    return np.sqrt(model_mse)

In [None]:
# score the LinearRegression model (RMSE of the log prices on the data it was trained on)
score_model(lin_reg)

#### DecisionTreeRegressor model

In [None]:
from sklearn.tree import DecisionTreeRegressor

# single decision tree, seeded for repeatable splits, trained on the log prices
tree_reg = DecisionTreeRegressor(random_state=random_state)
tree_reg.fit(X=X_prepared, y=log_y)

In [None]:
# score the DecisionTreeRegressor model (RMSE of the log prices on the data it was trained on)
score_model(tree_reg)



The `DecisionTreeRegressor` model badly overfits the data, so we will skip it.


#### RandomForestRegressor model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# random forest of 100 trees, seeded for repeatability, trained on the log prices
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=random_state,
)
rf.fit(X=X_prepared, y=log_y)

In [None]:
# score the RandomForestRegressor model (RMSE of the log prices on the data it was trained on)
score_model(rf)

#### XGBRegressor model

In [None]:
import xgboost as xgb

# `random_state` is the documented seed argument of the scikit-learn style
# XGBRegressor wrapper; `seed` is the legacy spelling of the same setting.
xgb_reg = xgb.XGBRegressor(random_state=random_state)
xgb_reg.fit(X_prepared, log_y)

In [None]:
# score the XGBRegressor model (RMSE of the log prices on the data it was trained on)
score_model(xgb_reg)

#### Better Evaluation Using Cross-Validation

**cross-validation** allows you to get not only an estimate of the performance of your
model, but also a measure of how precise this estimate is (i.e., its standard
deviation). 

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the XGBRegressor on the prepared training data
scores = cross_val_score(
    estimator=xgb_reg,
    X=X_prepared,
    y=log_y,
    cv=10,
    scoring="neg_mean_squared_error",
)
# scikit-learn scorers follow a "greater is better" convention, so the MSE comes
# back negated; flip the sign before taking the root to get the per-fold RMSE
xgb_scores = np.sqrt(np.negative(scores))

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` (e.g. a NumPy array).
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

# report the per-fold cross-validation RMSE values of the XGBRegressor with their mean and spread
display_scores(xgb_scores)

In [None]:
# 10-fold cross-validation of the random forest on the prepared training data
scores = cross_val_score(
    estimator=rf,
    X=X_prepared,
    y=log_y,
    cv=10,
    scoring="neg_mean_squared_error",
)
# scikit-learn scorers follow a "greater is better" convention, so the MSE comes
# back negated; flip the sign before taking the root to get the per-fold RMSE
rf_scores = np.sqrt(np.negative(scores))
display_scores(rf_scores)

`RandomForestRegressor` has much better scores, so we will stick with it.

### Fine-Tune Your Model

#### Random Search

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint

# random_grid = {'bootstrap': [True, False],
#               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
#               'max_features': ['auto', 'sqrt'],
#               'min_samples_leaf': [1, 2, 4],
#               'min_samples_split': [2, 5, 10],
#               'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

# rnd_search = RandomizedSearchCV(rf, param_distributions=random_grid,
#                                 n_iter=100, cv=5, scoring='neg_mean_squared_error', 
#                                 random_state=random_state, verbose=1, n_jobs=-1)



# rnd_search.fit(X_prepared, log_y)

Let's look at the score of each hyperparameter combination tested during the random search:

In [None]:
# cvres = rnd_search.cv_results_
# for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#     print(np.sqrt(-mean_score), params)

In [None]:
# rnd_search.best_estimator_

In [None]:
# rnd_search.best_score_

### Grid Search

We will try to play around the best `RandomizedSearchCV` values.

In [None]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {'n_estimators': np.arange(1500, 1700, 50),
#              'min_samples_split': [1, 2, 3],
#              'min_samples_leaf': [1, 2],
#              'max_features': ['sqrt'],
#              'max_depth': [18, 20, 22],
#              'bootstrap': [False]}

# # train across 5 folds: 4*3*2*1*3*1 = 72 combinations, that's a total of 72*5=360 rounds of training 
# grid_search = GridSearchCV(rf, param_grid, cv=5,
#                            scoring='neg_mean_squared_error',
#                            return_train_score=True
#                           ,n_jobs = -1, verbose = 1)
# grid_search.fit(X_prepared, log_y)

Let's look at the score of each hyperparameter combination tested during the grid search:

In [None]:
# cvres = grid_search.cv_results_
# for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#     print(np.sqrt(-mean_score), params)

In [None]:
# final_model = grid_search.best_estimator_
# final_model

In [None]:
# Random forest with the hyperparameters kept from the search above.
# Seeded with the notebook-wide `random_state` instead of a repeated literal,
# so changing the seed in one place affects every model consistently.
final_model = RandomForestRegressor(bootstrap=False, max_depth=18, max_features='sqrt',
                                    n_estimators=1650, random_state=random_state)
final_model.fit(X_prepared, log_y)

In [None]:
# score the best model (RMSE of the log prices on the data it was trained on)
score_model(final_model)

### A full pipeline with both preparation and prediction

In [None]:
# chain the preprocessing and the tuned model so raw DataFrames can be fed in directly
full_pipeline = Pipeline(steps=[
    ("preparation", preprocessor),
    ("model", final_model),
])

# fit both stages on the raw features and the log-transformed target
full_pipeline.fit(X=X, y=log_y)

## 7. Evaluate Your System on the Test Set
Submitting the data

In [None]:
# read the competition test set
# (absolute path, consistent with how train.csv is loaded, so the cell does not
# depend on the notebook's current working directory)
X_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# save the model's predictions
# the competition loss is RMSLE, so the model was trained on log(SalePrice);
# exp-transform the predictions to get back to prices
final_predictions = np.exp(full_pipeline.predict(X_test))

In [None]:
# Save the predictions in the format used for competition scoring:
# one row per test house with its Id and the predicted SalePrice.
submission_columns = {'Id': X_test['Id'], 'SalePrice': final_predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

**For any suggestions, please let me know in the comments! Thanks.**