In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from pandas_profiling import ProfileReport
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [None]:
import pandas as pd

# Read the three CSV files from the ../input/house-prices-dataset directory.
train = pd.read_csv("../input/house-prices-dataset/train.csv")
test = pd.read_csv("../input/house-prices-dataset/test.csv")
sample_submission = pd.read_csv("../input/house-prices-dataset/sample_submission.csv")

In [None]:
# Preview the first five training rows (head() returns 5 rows by default).
train.head()

In [None]:
# Preview the first five test rows (head() returns 5 rows by default).
test.head()

> Separating the target variable from the train dataset

In [None]:
# Keep the target as a one-column DataFrame, separate from the features.
target = train[['SalePrice']]

In [None]:
# Feature frame: every training column except the target.
train_df = train.drop(columns=['SalePrice'])

*Let's have a look at the available features*

In [None]:
# Print the dtype and non-null count of every training column.
train.info()

*There are many categorical as well as numerical features in the dataset*

*Let's combine the train and test datasets so that we can do the data processing for both in one pass*

In [None]:
# Stack the training features on top of the test rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so use
# pd.concat instead. ignore_index=True gives the combined frame a unique
# 0..n-1 row index rather than repeating the labels both CSVs start from.
comb_df = pd.concat([train_df, test], ignore_index=True)

In [None]:
# (rows, columns) of the combined train + test feature frame.
comb_df.shape

> *Identifying the list and number of numerical and categorical features*
* numerical features

In [None]:
# Empty collector for the names of the non-object (numerical) columns.
num_features=[]

In [None]:
# Empty collector for the names of the object-dtype (categorical) columns.
cat_features=[]

In [None]:
# Split the column names by dtype: object columns are categorical, all
# others numerical. Both lists are rebuilt from scratch, so re-running this
# cell cannot append duplicate names to lists filled by an earlier run.
# Column order follows comb_df.columns.
num_features = comb_df.select_dtypes(exclude='object').columns.tolist()
cat_features = comb_df.select_dtypes(include='object').columns.tolist()

*List of numerical features*

In [None]:
# Report how many columns were classified as numerical.
print("Total numerical features",len(num_features))

In [None]:
# List the numerical column names.
print(num_features)

*List of categorical features*

In [None]:
# Report how many columns were classified as categorical.
print("Total number of ctegorical features",len(cat_features))

In [None]:
# List the categorical column names.
print(cat_features)

*Considering the large number of features we have, we can think of reducing/dropping columns that are not of much use. A good starting point is the columns with the most missing values. We can drop columns where more than 50% of the data is missing.*
Let's use pandas profiling to get all the basic details.

In [None]:
# Build the pandas-profiling report for the combined frame; as the last
# expression of the cell it is shown as the cell output.
ProfileReport(comb_df)

*If you compare the counts of categorical and numerical features in the report above with our analysis, they don't match. The report's numerical count is 34, whereas ours was 37. I guess 3 variables have been incorrectly identified as numeric. Let's identify them.*

In [None]:
# Preview the columns currently classified as numerical.
comb_df[num_features].head()

*It seems MSSubClass, OverallQual and OverallCond are categorical codes rather than true numerical values. Let's convert them to the object dtype.*

In [None]:
# Cast the three integer-coded columns to object dtype so that the dtype
# check used in this notebook classifies them as categorical.
comb_df = comb_df.astype(
    dict.fromkeys(['MSSubClass', 'OverallQual', 'OverallCond'], 'object')
)

*Let's find out the new lists of numerical and categorical features*

In [None]:
# Reset the numerical-column list before it is recomputed below.
num_features=[]

In [None]:
# Reset the categorical-column list before it is recomputed below.
cat_features=[]

In [None]:
# Recompute the dtype split after the three coded columns were cast to
# object. Both lists are rebuilt from scratch, so re-running this cell cannot
# append duplicate names. Column order follows comb_df.columns.
num_features = comb_df.select_dtypes(exclude='object').columns.tolist()
cat_features = comb_df.select_dtypes(include='object').columns.tolist()

In [None]:
# Number of numerical columns after the dtype correction.
print(len(num_features))

In [None]:
# Number of categorical columns after the dtype correction.
print(len(cat_features))

*Now counts match with profile report*

*Let's use the profile report to identify the features with more than 50% missing values. We can drop these columns from our dataset. We can also drop the Id column, as it doesn't seem to contribute anything.*

In [None]:
# Columns to discard: the row identifier plus the four features that the
# notebook text describes as more than 50% missing.
col_drop=['Id','Alley','Fence','MiscFeature','PoolQC']

In [None]:
# Remove the columns listed in col_drop from the combined frame.
comb_df = comb_df.drop(columns=col_drop)

**Let's use Profile Report to identify few more columns which we can get rid of thus reducing the feature number**
* 3SsnPorch is filled with almost one value i.e. there is no variance in terms of values and one value has dominated - drop
* Condition2 is filled with almost one value i.e. there is no variance in terms of values and one value has dominated - drop
* LowQualFinSF is filled with almost one value i.e. there is no variance in terms of values and one value has dominated - drop
* MiscVal is filled with almost one value i.e. there is no variance in terms of values and one value has dominated - drop
* PoolArea - drop
* Utilities - drop
* We can insert a new column called Age, which is the difference between YrSold and YearBuilt. Age can have an impact on the selling price. We can then drop the 3 year columns - YearBuilt / YrSold / YearRemodAdd


In [None]:
# Age of the house at the time of sale: sale year minus construction year.
comb_df['Age'] = comb_df['YrSold'].sub(comb_df['YearBuilt'])

In [None]:
# Drop the near-constant columns and the year columns now summarised by Age.
comb_df = comb_df.drop(
    columns=[
        '3SsnPorch',
        'Condition2',
        'LowQualFinSF',
        'MiscVal',
        'PoolArea',
        'Utilities',
        'YearBuilt',
        'YrSold',
        'YearRemodAdd',
    ]
)

In [None]:
# (rows, columns) of the combined frame after the column drops.
comb_df.shape

**We have managed to bring down the feature number from 81 to 67**

**Let's impute the missing values in our dataset. We will use IterativeImputer for the numerical features. For the categorical features, we can replace NaNs with "Unknown".**

> Before that, lets refresh our latest list of numerical and categorical features

In [None]:
# Reset the numerical-column list before it is recomputed below.
num_features=[]

In [None]:
# Reset the categorical-column list before it is recomputed below.
cat_features=[]

In [None]:
# Recompute the dtype split for the reduced column set. Both lists are
# rebuilt from scratch, so re-running this cell cannot append duplicate
# names. Column order follows comb_df.columns.
num_features = comb_df.select_dtypes(exclude='object').columns.tolist()
cat_features = comb_df.select_dtypes(include='object').columns.tolist()

In [None]:
# Number of numerical columns that will be imputed.
print(len(num_features))

In [None]:
# Number of categorical columns that will be filled and label-encoded.
print(len(cat_features))

> Impute missing values in numerical features

In [None]:
# Numerical columns only; this is the input to the imputer.
comb_df_num = comb_df.loc[:, num_features]

In [None]:
# Iterative imputer with scikit-learn's default settings.
imputer=IterativeImputer()

In [None]:
# Fit the imputer on the numerical columns and wrap the resulting array in a
# frame that already carries the original column names and row index, so the
# result is correctly labelled even if the relabelling cells are not run.
comb_df_num_imp = pd.DataFrame(
    imputer.fit_transform(comb_df_num),
    columns=comb_df_num.columns,
    index=comb_df_num.index,
)

In [None]:
# Give the imputed frame the same column names as its input.
comb_df_num_imp.columns=comb_df_num.columns

In [None]:
# Give the imputed frame the same row index as its input, so it lines up
# with the categorical frame when the two are concatenated column-wise.
comb_df_num_imp.index=comb_df_num.index

**Imputing Missing Values in Categorical Features**

In [None]:
# Categorical columns only.
comb_df_cat = comb_df.loc[:, cat_features]

In [None]:
# Treat a missing category as its own level, named 'Unknown'.
comb_df_cat=comb_df_cat.fillna('Unknown')


> Let's LabelEncode our categorical variables in order to use them during model implementation

In [None]:
# One encoder instance, refitted for every categorical column in the loop below.
le=LabelEncoder()

In [None]:
# Replace every categorical column with integer codes. The single encoder is
# refitted per column, so afterwards it only holds the last column's classes.
for cat_col in list(comb_df_cat.columns):
    codes = le.fit_transform(comb_df_cat[cat_col])
    comb_df_cat[cat_col] = codes
    

**Creating one dataset by concatenating imputed numerical and categorical features**

In [None]:
# Placeholder empty frame; the following cell rebinds comb_new to the
# concatenated result, so this value is never used.
comb_new=pd.DataFrame()

In [None]:
# Join the encoded categorical columns and the imputed numerical columns
# side by side, aligned on the row index.
comb_new = pd.concat(
    [comb_df_cat, comb_df_num_imp],
    axis='columns',
)

In [None]:
# Preview the first two rows of the merged feature frame.
comb_new.head(2)

**Lets scale our dataset**
* It's not required to scale our target variable. Please refer to the discussion below:
https://stats.stackexchange.com/questions/111467/is-it-necessary-to-scale-the-target-value-in-addition-to-scaling-features-for-re

In [None]:
# Standard scaler with default settings.
scaler=StandardScaler()

In [None]:
# Scale every feature column. The array is wrapped without labels, so the
# result has a fresh 0..n-1 row index and integer column names.
# NOTE(review): the scaler is fitted on train and test rows together.
comb_new_scaled=pd.DataFrame(scaler.fit_transform(comb_new))

**We merged our train and test datasets at the beginning of this notebook in order to apply all preprocessing steps to both. As we now prepare for model building, let's split the dataset again.**

In [None]:
# The training rows come first in the combined frame. Take the row count
# from `train` instead of hard-coding it, so the split stays correct if the
# input file changes.
comb_new_scaled_train = comb_new_scaled.iloc[:len(train)]

> Renaming the scaled train dataset to X and target variable to y for clarity

In [None]:
# Feature matrix for modelling (alias of the scaled training rows).
X=comb_new_scaled_train

In [None]:
# Target frame for modelling (alias of the SalePrice frame).
y=target

In [None]:
# Everything after the training rows belongs to the test set. Take the
# boundary from `train` instead of hard-coding the row count.
comb_new_scaled_test = comb_new_scaled.iloc[len(train):]

> Above scaled test data will be used to predict and submit our result.Renaming it to test_data

In [None]:
# Scaled test rows used for the final prediction (alias).
test_data=comb_new_scaled_test

**Let's use our X and y datasets to create training and validation datasets for model implementation and accuracy tests**

In [None]:
# Hold out 20% of the training rows for validation; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

**We will be using XGBRegressor and LGBMRegressor for predicting house prices**

> Lets start with XGBRegressor

In [None]:
# Gradient-boosted regressor: 1000 shallow trees with a small learning rate
# and very light L1/L2 regularisation, using all available cores.
XGB = XGBRegressor(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=1000,
    reg_alpha=0.001,
    reg_lambda=0.000001,
    n_jobs=-1,
    min_child_weight=3,
)

In [None]:
# Train the model on the training part of the split.
XGB.fit(X_train,y_train)

> Let's check the XGB model performance on our train and test data set

> train score

In [None]:
# Score on the rows the model was fitted on (R^2, the default regressor score).
print(XGB.score(X_train,y_train))

> test score

In [None]:
# Score on the held-out validation rows (R^2, the default regressor score).
print(XGB.score(X_test,y_test))

> The difference between the train and test scores suggests that the model has high variance (it is overfitting). We might need more hyperparameter tuning. I will work on this in a few days.

**Predicting house prices for test_data**

In [None]:
# Label the predictions so the frame can be used as a submission: the column
# is named SalePrice and each row is indexed by the Id of its test row.
# test_data keeps the row order of `test`, so the Ids line up positionally.
y_pred = pd.DataFrame(
    XGB.predict(test_data),
    index=pd.Index(test['Id'], name='Id'),
    columns=['SalePrice'],
)

In [None]:
# Display the predicted sale prices for the test rows.
y_pred

**I will work on Parameter Tuning and LightGBM part**
Thank you.