#      House Prices Prediction
![https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTdDMRs8OdF9DcJGPxxn8pNohVvLMGipNlzaQ&usqp=CAU](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTdDMRs8OdF9DcJGPxxn8pNohVvLMGipNlzaQ&usqp=CAU)




In [None]:
import os
import sys
import time
import random
import logging
import typing as tp
from pathlib import Path
from contextlib import contextmanager
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
import xgboost as xgb
from xgboost.sklearn import XGBRegressor


%matplotlib inline

#Load more packages
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns
# Select seaborn's 'whitegrid' style.
sns.set_style('whitegrid')
import warnings
# Silence FutureWarning messages ...
warnings.simplefilter(action='ignore', category=FutureWarning)
# ... and then every other warning category too. This blanket filter already
# covers FutureWarning, so the line above is redundant while this one is present.
warnings.filterwarnings('ignore')

In [None]:
# Directory layout, resolved relative to the parent of the current working
# directory.
ROOT = Path.cwd().parent
INPUT = ROOT.joinpath("input")
DATA = INPUT.joinpath("house-prices-advanced-regression-techniques")
WORK = ROOT.joinpath("working")

# Print the name of every entry found in the data directory.
for entry in DATA.iterdir():
    print(entry.name)

In [None]:
# Read the three CSV files from the data directory.
train = pd.read_csv(DATA / "train.csv")
test = pd.read_csv(DATA / "test.csv")
sample_sub = pd.read_csv(DATA / "sample_submission.csv")

# Report the (rows, columns) shape of each frame.
print(f"train: {train.shape}, test: {test.shape}, sample sub: {sample_sub.shape}")

In [None]:
# Preview the first rows of the training frame, transposed so that each
# column of the frame is displayed as a row.
train.head().T

# Preprocessing

In [None]:
# Label-encode every non-numeric column. Each encoder is fitted on the train
# and test values together (cast to str, so missing values become the string
# 'nan') and is kept in `labels`, keyed by column name.
labels = {}
categorical_cols = train.select_dtypes(exclude=np.number).columns.tolist()
for col in categorical_cols:
    train_as_str = train[col].astype(str)
    test_as_str = test[col].astype(str)
    encoder = LabelEncoder().fit(pd.concat([train_as_str, test_as_str]))
    train[col] = encoder.transform(train_as_str)
    test[col] = encoder.transform(test_as_str)
    labels[col] = encoder
print('Categorical columns:', list(labels.keys()))

# Check Missing Data



In [None]:
# Share of missing cells in each frame, as a real percentage:
# 100 * (number of NaN cells) / (total number of cells).
# The previous version divided the NaN count by the number of rows, which is
# the average number of NaNs per row rather than a percentage.
# Note: the non-numeric columns were cast to str and label-encoded in the
# preprocessing cell, so their missing values are no longer NaN and are not
# counted here.
train_nan_pct = 100 * train.isna().sum().sum() / train.size
test_nan_pct = 100 * test.isna().sum().sum() / test.size
print(f'Percent of Nans in Train Data : {round(train_nan_pct, 2)}')
print(f'Percent of Nans in Test  Data : {round(test_nan_pct, 2)}')

In [None]:
# Turn +/- infinity into NaN, then zero-fill every missing value in one chain.
non_finite = [np.inf, -np.inf]
train = train.replace(non_finite, np.nan).fillna(0)
train

In [None]:
# Turn +/- infinity into NaN, then zero-fill every missing value in one chain.
non_finite = [np.inf, -np.inf]
test = test.replace(non_finite, np.nan).fillna(0)
test

# Visualisation

In [None]:
# Count plots for one batch of features, with the bars split by building type.
# The non-numeric columns were label-encoded in the preprocessing cell, so the
# axes and the legend show integer codes rather than the original names.
plt.figure(figsize=[15,17])
fft = ['Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood']
hue_col = 'BldgType'
for n, f in enumerate(fft, start=1):
    plt.subplot(4, 2, n)  # 4 x 2 grid, one panel per feature
    sns.countplot(x=f, hue=hue_col, edgecolor="black", alpha=0.7, data=train)
    sns.despine()
    # Name the column that is actually passed as `hue`; the title used to say
    # "by SalePrice", a column this plot never reads.
    plt.title("Countplot of {}  by {}".format(f, hue_col))
plt.tight_layout()
plt.show()

In [None]:
# Count plots for one batch of features, with the bars split by building type.
# The non-numeric columns were label-encoded in the preprocessing cell, so the
# axes and the legend show integer codes rather than the original names.
# 'BldgType' is itself in this batch, so that panel uses it for both x and hue.
plt.figure(figsize=[15,17])
fft1 = ['Condition1', 'Condition2', 'BldgType', 'HouseStyle',
        'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd']
hue_col = 'BldgType'
for n, f in enumerate(fft1, start=1):
    plt.subplot(4, 2, n)  # 4 x 2 grid, one panel per feature
    sns.countplot(x=f, hue=hue_col, edgecolor="black", alpha=0.7, data=train)
    sns.despine()
    # Name the column that is actually passed as `hue`; the title used to say
    # "by SalePrice", a column this plot never reads.
    plt.title("Countplot of {}  by {}".format(f, hue_col))
plt.tight_layout()
plt.show()

In [None]:
# Count plots for one batch of features, with the bars split by building type.
# The non-numeric columns were label-encoded in the preprocessing cell, so the
# axes and the legend show integer codes rather than the original names.
plt.figure(figsize=[15,17])
fft2 = ['RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
        'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond']
hue_col = 'BldgType'
for n, f in enumerate(fft2, start=1):
    plt.subplot(4, 2, n)  # 4 x 2 grid, one panel per feature
    sns.countplot(x=f, hue=hue_col, edgecolor="black", alpha=0.7, data=train)
    sns.despine()
    # Name the column that is actually passed as `hue`; the title used to say
    # "by SalePrice", a column this plot never reads.
    plt.title("Countplot of {}  by {}".format(f, hue_col))
plt.tight_layout()
plt.show()

In [None]:
# Count plots for one batch of features, with the bars split by building type.
# The non-numeric columns were label-encoded in the preprocessing cell, so the
# axes and the legend show integer codes rather than the original names.
plt.figure(figsize=[15,17])
fft3 = ['Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
        'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2']
hue_col = 'BldgType'
for n, f in enumerate(fft3, start=1):
    plt.subplot(4, 2, n)  # 4 x 2 grid, one panel per feature
    sns.countplot(x=f, hue=hue_col, edgecolor="black", alpha=0.7, data=train)
    sns.despine()
    # Name the column that is actually passed as `hue`; the title used to say
    # "by SalePrice", a column this plot never reads.
    plt.title("Countplot of {}  by {}".format(f, hue_col))
plt.tight_layout()
plt.show()

In [None]:
# Count plots for one batch of features, with the bars split by building type.
# The non-numeric columns were label-encoded in the preprocessing cell, so the
# axes and the legend show integer codes rather than the original names.
plt.figure(figsize=[15,17])
fft4 = ['PoolQC', 'Fence', 'MiscFeature', 'MiscVal',
        'MoSold', 'YrSold', 'SaleType', 'SaleCondition']
hue_col = 'BldgType'
for n, f in enumerate(fft4, start=1):
    plt.subplot(4, 2, n)  # 4 x 2 grid, one panel per feature
    sns.countplot(x=f, hue=hue_col, edgecolor="black", alpha=0.7, data=train)
    sns.despine()
    # Name the column that is actually passed as `hue`; the title used to say
    # "by SalePrice", a column this plot never reads.
    plt.title("Countplot of {}  by {}".format(f, hue_col))
plt.tight_layout()
plt.show()

# Model

In [None]:
# Feature column names and the target variable used for training.
# 'Id' is deliberately NOT a feature: it is a row identifier, so letting the
# model split on it only fits noise. Code that needs the identifier (e.g. the
# submission frame) reads it straight from the data frame instead.
features = [
    # lot and location
    'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2',
    # building type, quality and age
    'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd',
    # roof and exterior
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation',
    # basement
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
    'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    # utilities
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
    # living area and rooms
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
    'Functional', 'Fireplaces', 'FireplaceQu',
    # garage
    'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea',
    'GarageQual', 'GarageCond', 'PavedDrive',
    # outdoor areas
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'PoolQC', 'Fence',
    # miscellaneous and sale details
    'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
]

target = 'SalePrice'

In [None]:
# Preview the first 10 rows of the feature columns; these columns are the
# input passed to the regression model that is fitted further down.
train[features].head(10)

# Xgboost

In [None]:
# Hyperparameters handed to XGBRegressor as keyword arguments.
params = {
    'objective': 'reg:squarederror',
    'n_estimators': 1000,
    'lambda': 7.610705234008646,
    'alpha': 0.0019377246932580476,
    'colsample_bytree': 0.5,
    'subsample': 0.7,
    'learning_rate': 0.012,
    'max_depth': 20,
    'min_child_weight': 229,
    # 'random_state' used to be listed twice (24, then 42). In a dict literal
    # the last duplicate silently wins, so 42 was always the effective seed;
    # only that entry is kept.
    'random_state': 42,
}

# Fit the regressor on the selected feature columns against the target column.
model = XGBRegressor(**params)
model.fit(train[features], train[target])


# Prediction

In [None]:
# Score the test set: select the same feature columns the model was fitted on
# and run them through the fitted model.
test_inputs = test[features]
predictions = model.predict(test_inputs)

predictions

In [None]:
# Assemble the submission table: the test-set Id next to its predicted price.
submission_columns = dict(Id=test['Id'], SalePrice=predictions)
submission = pd.DataFrame(submission_columns)

# Show the first 10 rows
submission.head(10)

In [None]:
# Write the submission table to a CSV file that can be uploaded. The path is
# relative, so the file is created in the current working directory.
filename = 'submission.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')

# Kolmogorov-Smirnov Test


The Kolmogorov-Smirnov goodness-of-fit test (K-S test) compares your data with a known distribution and tells you whether they have the same distribution. Although the test is nonparametric — it doesn’t assume any particular underlying distribution — it is commonly used as a test for normality, to see whether your data is normally distributed. It’s also used to check the assumption of normality in Analysis of Variance. The two-sample variant used below compares two samples with each other instead of comparing one sample with a reference distribution.




<img src='https://i.stack.imgur.com/4nNN2.png' width='700'>









### Here the Kolmogorov-Smirnov test is used to check whether the train and test sets are significantly different.

In [None]:
# Keep only the columns to be compared between train and test: drop the 'Id'
# identifier from both frames and the 'SalePrice' target from train.
# The frames are rebound instead of mutated in place, and errors='ignore'
# makes re-running this cell a no-op instead of raising KeyError once the
# columns are already gone.
train = train.drop(columns=['Id', 'SalePrice'], errors='ignore')
test = test.drop(columns=['Id'], errors='ignore')

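Run the two-sample KS test on every feature of the train and test sets, plot both distributions of that feature, and sort the features into two lists according to whether the null hypothesis (train and test follow the same distribution) is rejected.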


In [None]:

# Significance level for the two-sample Kolmogorov-Smirnov test.
KS_ALPHA = 0.05

hypothesisnotrejected = []
hypothesisrejected = []

for col in train.columns:
    # Null hypothesis: the train and test values of this feature are drawn
    # from the same distribution.
    statistic, pvalue = ks_2samp(train[col], test[col])

    # Reject the null hypothesis when the p-value falls below the significance
    # level. (The earlier rule compared the p-value with the KS statistic
    # itself; the two are different quantities, so that comparison is not a
    # valid decision rule.)
    if pvalue < KS_ALPHA:
        hypothesisrejected.append(col)
    else:
        hypothesisnotrejected.append(col)

    # Overlay the train and test density estimates of the feature.
    plt.figure(figsize=(8,4))
    plt.title("Kolmogorov-Smirnov test for train/test\n"
              "feature: {}, statistics: {:.5f}, pvalue: {:.5f}".format(col, statistic, pvalue))
    sns.kdeplot(train[col], color='blue', shade=True, label='Train')
    sns.kdeplot(test[col], color='green', shade=True, label='Test')
    # Draw the legend explicitly so the 'Train' / 'Test' labels are shown.
    plt.legend()

    plt.show()

In [None]:
# Number of features in each result list: (not rejected, rejected).
len(hypothesisnotrejected), len(hypothesisrejected)


# Hypothesis rejected

In [None]:
# Features collected in the "rejected" list by the KS-test loop.
print(hypothesisrejected)


# Hypothesis not rejected

In [None]:
# Features collected in the "not rejected" list by the KS-test loop.
print(hypothesisnotrejected)



## References: 
* https://www.statisticshowto.com/kolmogorov-smirnov-test/
* https://www.kaggle.com/bearstrikesback/adversarial-validation-plus-ks-test/notebook
* Chakravarti, Laha, and Roy, (1967). Handbook of Methods of Applied Statistics, Volume I, John Wiley and Sons, pp. 392-394.
* Ruppert, D. (2004). Statistics and Finance: An Introduction. Springer Science and Business Media.
* Stephens M.A. (1992) Introduction to Kolmogorov (1933) On the Empirical Determination of a Distribution. In: Kotz S., Johnson N.L. (eds) Breakthroughs in Statistics. Springer Series in Statistics (Perspectives in Statistics). Springer, New York, NY