In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install feature_engine

In [None]:
# Plotting libraries plus notebook-wide display and warning configuration
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')

# Show every column when a wide frame is displayed. The canonical option name is used here;
# the previous 'display.max.columns' spelling only worked because pandas matches option
# names as regular expressions.
pd.set_option('display.max_columns', None)

# NOTE: this silences *all* warnings for the rest of the notebook, including deprecation notices.
warnings.filterwarnings('ignore')

In [None]:
# Importing the train dataset

df=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
X_test=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
df.head()

In [None]:
X_test.shape

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.info()

# Starting With EDA

# Missing Features

In [None]:
# Collect the name of every column that contains at least one null value

missing_features = df.columns[df.isnull().any()].tolist()
len(missing_features)

In [None]:
# Share of missing values in each affected feature, reported as a percentage

missing_share = df[missing_features].isnull().mean()
for features, share in missing_share.items():
    print(features,'has :',np.round(share*100,4),'% missing Values')

In [None]:
# Relationship between missingness and the dependent feature SalePrice:
# one bar chart per feature comparing the median price of rows where the value
# is present (0) with rows where it is missing (1).
b = 2                                        # Number of plots in each row
a = max(1, -(-len(missing_features) // b))   # Rows needed (ceiling division): one panel per feature

# Size the figure from the number of rows actually used, so no blank space is left
fig, axes = plt.subplots(a, b, figsize=(14, 4 * a), squeeze=False)
axes = axes.ravel()

# Work on a copy so the 0/1 indicators do not overwrite the raw data
data = df.copy()

for ax, features in zip(axes, missing_features):
    # 1 where the value is missing, 0 where it is present
    data[features] = np.where(data[features].isnull(), 1, 0)

    # Median price for each of the two groups; rot=0 keeps the tick labels horizontal
    data.groupby(features)['SalePrice'].median().plot.bar(color='teal', ax=ax, rot=0)
    ax.set_ylabel('Median SalePrice')

# Hide the panels left over when the feature count does not fill the last row
for ax in axes[len(missing_features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

> **From the above figures we can see a clear relationship between the sale price and whether a feature's value is missing.**
   * Price is higher for missing values in the below features
                  1. Lot Frontage
                  2. Alley
                  3. MasVnrType
                  4. MasVnrArea
                  5. Electrical
                  6. Fence
                  7. Misc Features
   * Price is Lower for missing values in the below features
                  1. Bsmt Quality
                  2. Bsmt Condition
                  3. Bsmt Exposure
                  4. Bsmt FinType 1
                  5. Bsmt FinType 2
                  6. Fireplace Quality
                  7. Garage Type
                  8. Garage YrBuilt
                  9. Garage Finish
                 10. Garage Quality
                 11. Garage Condition
                 12. Pool QC

# Numerical Features

In [None]:
# Every non-object column except the row identifier is treated as numerical
numerical_features = [
    features
    for features, dtype in df.dtypes.items()
    if features != 'Id' and dtype != 'O'
]
numerical_features

> *A few of the numerical features are temporal variables (all the year features, i.e. datetime-like features).*
  **Let's extract the temporal variables.**

# Temporal Variables

In [None]:
# Temporal variables are recognised by 'Yr' or 'Year' appearing in the column name
year_markers = ('Yr', 'Year')
year_feature = [
    features
    for features in numerical_features
    if any(marker in features for marker in year_markers)
]
year_feature

In [None]:
# Relationship between each year feature and the dependent feature SalePrice
b = 2                                    # No. of plots in each row
a = max(1, -(-len(year_feature) // b))   # Rows needed (ceiling division): one panel per feature

# Size the figure from the rows actually used instead of reserving one row per feature
fig, axes = plt.subplots(a, b, figsize=(14, 5 * a), squeeze=False)
axes = axes.ravel()

data = df.copy()  # Making a dataset copy

for ax, features in zip(axes, year_feature):
    # Median price per year; rot=0 keeps the tick labels horizontal
    data.groupby(features)['SalePrice'].median().plot(color='green', ax=ax, rot=0)
    ax.set_ylabel('Median SalePrice')

# Hide the panels left over when the feature count does not fill the last row
for ax in axes[len(year_feature):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

> **Conclusion**
* The median price rises as the built, remodelled and garage-built years get more recent, whereas it falls with the year sold.
       1. One possible reason is that houses lose value as they get older.
 **We will try to verify this in the next cell.**

In [None]:
# Plot SalePrice against the age implied by each year feature (YrSold minus that year)
age_features = [features for features in year_feature if features != 'YrSold']

b = 2                                    # No. of plots in each row
a = max(1, -(-len(age_features) // b))   # Rows needed (ceiling division): one panel per feature

# Size the figure from the rows actually used instead of reserving one row per year feature
fig, axes = plt.subplots(a, b, figsize=(14, 5 * a), squeeze=False)
axes = axes.ravel()

data = df.copy()  # Making a dataset copy

for ax, features in zip(axes, age_features):
    # Number of years between this event and the sale
    data[features] = data['YrSold'] - data[features]
    data.groupby(features)['SalePrice'].median().plot(color='black', ax=ax)
    ax.set_xlabel('YrSold - ' + features)
    ax.set_ylabel('Median SalePrice')

# Hide the panels left over when the feature count does not fill the last row
for ax in axes[len(age_features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

> **This supports our finding that the sale price decreases as the age of the house increases.**

# Discrete Features (Numerical Features)

In [None]:
# A numerical feature counts as discrete when it has fewer than 25 distinct values
# (a missing value counts as one value); year features are handled separately.

discrete_features = [
    features
    for features in numerical_features
    if features not in year_feature and df[features].nunique(dropna=False) < 25
]
discrete_features

In [None]:
# For every discrete feature draw two panels side by side:
# median SalePrice per value (left) and a box plot of the feature itself (right).
fig = plt.figure(figsize=(14,100))

n_rows = len(discrete_features)   # One row per feature
n_cols = 2                        # Two panels per feature

data = df.copy()  # Work on a copy of the dataset

for position, features in enumerate(discrete_features):
    # Left panel: median price grouped by the feature's values, labels kept horizontal
    bar_ax = fig.add_subplot(n_rows, n_cols, 2 * position + 1)
    data.groupby(features)['SalePrice'].median().plot(kind='bar', color='red', ax=bar_ax, rot=0)

    # Right panel: box plot to check for outliers
    box_ax = fig.add_subplot(n_rows, n_cols, 2 * position + 2)
    sns.boxplot(x = data[features], palette = 'cubehelix', ax = box_ax)

fig.tight_layout()
plt.show()

> **Some features have a clear monotonic relationship with the sale price, while the others have a messy relationship. All the features except MoSold have a lot of outliers, which have to be taken care of in the feature engineering part.**

# Continuous Features (Numerical Features)

In [None]:
# Continuous features are the numerical features that are neither discrete, nor year features, nor the Id

non_continious = set(discrete_features) | set(year_feature) | {'Id'}
continious_features = [features for features in numerical_features if features not in non_continious]
continious_features

In [None]:
# For every continuous feature draw its distribution (left) and a box plot (right)
fig = plt.figure(figsize=(14,100))

n_rows = len(continious_features)   # One row per feature
n_cols = 2                          # Two panels per feature

data = df.copy()  # Work on a copy of the dataset

for position, features in enumerate(continious_features):
    # Left panel: histogram with a kernel density estimate
    hist_ax = fig.add_subplot(n_rows, n_cols, 2 * position + 1)
    sns.histplot(x = data[features], kde = True , palette = 'viridis', ax = hist_ax)

    # Right panel: box plot to check for outliers
    box_ax = fig.add_subplot(n_rows, n_cols, 2 * position + 2)
    sns.boxplot(x = data[features], color = 'gold', ax = box_ax)

fig.tight_layout()
plt.show()

> **The data is right skewed, with all the features having outliers. We need to apply a log transformation to these features.**

In [None]:
# Log transformation of the continuous features, plotted on their own and against log(SalePrice)
data = df.copy()

# The logarithm is undefined at 0, so features containing a 0 are left out
loggable = [features for features in continious_features if not (data[features] == 0).any()]

a = max(1, len(loggable))       # No. of rows in the subplot: one per plotted feature
b = 2                           # No. of columns in each row
fig = plt.figure(figsize=(14, 5 * a))

# Transform the target exactly once. Re-assigning log(SalePrice) inside the loop would
# take the log of an already-logged column on every iteration and soon produce NaN.
log_price = np.log(data['SalePrice'])

for position, features in enumerate(loggable):
    # Keep the transformed values in a local series so the copied frame stays untouched
    log_feature = np.log(data[features])

    # Left panel: distribution of the log-transformed feature
    hist_ax = fig.add_subplot(a, b, 2 * position + 1)
    sns.histplot(x = log_feature, kde = True, color = 'orange', ax = hist_ax)

    # Right panel: log-transformed feature against the log-transformed target
    scatter_ax = fig.add_subplot(a, b, 2 * position + 2)
    sns.scatterplot(x = log_feature, y = log_price, color = 'teal', ax = scatter_ax)
    scatter_ax.set_ylabel('log(SalePrice)')

fig.tight_layout()
plt.show()

> **After applying the log transformation we can conclude that there is a roughly linear relationship between the continuous features and Sale Price.**

# Categorical Features 

In [None]:
# Categorical features are the columns stored with the object dtype

categorical_features = [features for features, dtype in df.dtypes.items() if dtype == 'O']
categorical_features

In [None]:
df[categorical_features].head()

In [None]:
# Relationship between each categorical feature and the dependent feature SalePrice
b = 2                                            # No. of plots in each row
a = max(1, -(-len(categorical_features) // b))   # Rows needed (ceiling division): one panel per feature

# Size the figure from the rows actually used instead of reserving one row per feature
fig, axes = plt.subplots(a, b, figsize=(14, 4 * a), squeeze=False)
axes = axes.ravel()

data = df.copy()  # Making a dataset copy

for ax, features in zip(axes, categorical_features):
    # Median SalePrice for each category; rot=0 keeps the tick labels horizontal
    data.groupby(features)['SalePrice'].median().plot(color='maroon', kind='bar', ax=ax, rot=0)
    ax.set_ylabel('Median SalePrice')

# Hide the panels left over when the feature count does not fill the last row
for ax in axes[len(categorical_features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

> **As we can see in the above plots, a few features have a clear relationship with the sale price and a few have a messy relationship. We will use all of the information collected above in feature engineering.**

In [None]:
# Count the distinct subclasses of every categorical feature and report them

subclass_counts = {features: len(df[features].unique()) for features in categorical_features}
for features, n_subclasses in subclass_counts.items():
    print('There are {} subclasses in {}'.format(n_subclasses,features))
    

> **So there are a few features which have more than 10 subclasses, whereas the rest have around 3-8 subclasses, which can easily be dealt with using one-hot encoding.**

# Feature Engineering

In [None]:
temp = X_test.copy()
temp.head()

In [None]:
# Handling missing values, starting with the categorical features.
# Collect the object-dtype columns with at least one null, separately for train (df) and test (temp).

missing_categorical = [features for features in df.columns if df[features].isnull().sum()>=1 and df[features].dtype == 'O']
missing_categorical_temp = [features for features in temp.columns if temp[features].isnull().sum()>=1 and temp[features].dtype == 'O']

for features in missing_categorical:
    # isnull().mean() is a fraction; scale it by 100 so the value matches the "%" label
    print('{} : {} % Null Values'.format(features,np.round(df[features].isnull().mean()*100,4)))

In [None]:
# Replacing missing values in categorial_features with new label "missing"

def replace_cat(dataset,missing_categorical):
    """Return a copy of ``dataset`` with nulls in the given columns replaced by "missing".

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to clean; it is copied and left unmodified.
    missing_categorical : list
        Names of the columns whose null values are replaced.

    Returns
    -------
    pandas.DataFrame
        The filled copy.
    """
    filled = dataset.copy()
    for column in missing_categorical:
        filled[column] = filled[column].fillna("missing")
    return filled


df = replace_cat(df,missing_categorical)
temp = replace_cat(temp,missing_categorical_temp)

In [None]:
df[missing_categorical].isnull().sum()

In [None]:
temp[missing_categorical].isnull().sum()

In [None]:
# Numerical features with missing values, for train (df) and test (temp).
# A column qualifies as soon as it has a single null; the previous `> 1` test skipped
# columns with exactly one missing value, which then needed a separate clean-up pass.

missing_numerical = [features for features in numerical_features if df[features].isnull().any()]
missing_numerical_temp = [features for features in temp.columns if temp[features].dtype !='O' and temp[features].isnull().any()]

In [None]:
# Function to impute missing numerical features

def replace_num(dataset,missing,reference=None):
    """Return a copy of ``dataset`` with nulls in the given columns replaced by a median.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to impute. It is copied, so the caller's frame is left unmodified
        (the same contract as ``replace_cat``).
    missing : list
        Names of the numerical columns to impute.
    reference : pandas.DataFrame, optional
        Frame the medians are computed from. Defaults to ``dataset`` itself; pass the
        training frame here to impute test data with training medians.

    Returns
    -------
    pandas.DataFrame
        The imputed copy.
    """
    source = dataset if reference is None else reference
    imputed = dataset.copy()

    for features in missing:
        median = source[features].median()
        imputed[features] = imputed[features].fillna(median)
    return imputed

In [None]:
# Calling the impute function for train data
df = replace_num(df,missing_numerical)

df[missing_numerical].isnull().sum()

In [None]:
# Calling the impute function for test data
temp = replace_num(temp,missing_numerical_temp)

temp[missing_numerical_temp].isnull().sum()

In [None]:
missing_temp = [features for features in temp.columns if (temp[features].isnull().sum()>=1)]
temp[missing_temp]

In [None]:
temp = replace_num(temp,missing_temp)

temp[missing_temp].isnull().sum()

In [None]:
df.head()

In [None]:
temp.head()

In [None]:
# Dropping Id column
df = df.drop('Id',axis=1)

In [None]:
# Dropping Id column
ID = temp['Id']
temp = temp.drop('Id',axis=1)

# Log Normal Transformation

In [None]:
log_transform = [features for features in numerical_features if features not in year_feature]
log_transform

In [None]:
# Log Normal Transformnation of numerical features

def log_normal(dataframe,log_transform):
    """Return a copy of ``dataframe`` with the natural log applied to the eligible columns.

    A column is transformed only when all of its non-null values are strictly positive.
    Columns containing a 0 or a negative value are left unchanged, because the logarithm
    is undefined there and would silently yield -inf or NaN.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to transform. It is copied, so the caller's frame is left unmodified.
    log_transform : list
        Names of the candidate columns.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    transformed = dataframe.copy()
    for i in log_transform:
        column = transformed[i]
        # Comparisons with NaN are False, so missing values do not block the transform
        if (column <= 0).any():
            continue
        transformed[i] = np.log(column)
    return transformed
    

In [None]:
df = log_normal(df,log_transform)

In [None]:
log_transform_temp = [features for features in numerical_features if features not in year_feature and features!= 'SalePrice']

In [None]:
log_transform_temp

In [None]:
# Function to do log normal transformation

def log_normal_temp(dataframe,log_transform_temp):
    """Return a copy of ``dataframe`` with the natural log applied to the eligible columns.

    This is the test-data counterpart of ``log_normal`` and applies the same rule: a column
    is transformed only when all of its non-null values are strictly positive. Columns
    containing a 0 or a negative value are left unchanged.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to transform. It is copied, so the caller's frame is left unmodified.
    log_transform_temp : list
        Names of the candidate columns.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    transformed = dataframe.copy()
    for i in log_transform_temp:
        column = transformed[i]
        # Comparisons with NaN are False, so missing values do not block the transform
        if (column <= 0).any():
            continue
        transformed[i] = np.log(column)
    return transformed

In [None]:
temp = log_normal_temp(temp,log_transform_temp)

In [None]:
df.head()

In [None]:
temp.head()

# Handling Rare Categorical Features

In [None]:
# Making a copy of the dataset

data = df.copy()
data.head()

In [None]:
from feature_engine.encoding import RareLabelEncoder
encoder = RareLabelEncoder(tol = 0.01,n_categories=9,replace_with='Rare')

In [None]:
# Group infrequent categories of the training data under the label "Rare" (the encoder is
# configured with a 1% tolerance), then add a 0/1 flag per selected feature marking the
# rows that were relabelled.

data = encoder.fit_transform(data)
rare_flag_sources = ('Neighborhood', 'Condition1', 'Exterior1st', 'Exterior2nd' ,'SaleType')
for features in rare_flag_sources:
    data[features + "_rare"] = (data[features] == "Rare").astype(int)
    

In [None]:
# Apply the rare-label grouping to the test set using category frequencies learned from the
# TRAINING predictors. Refitting the encoder on the test set would let the test distribution
# decide which categories count as "Rare", so the same category could be encoded differently
# in train and test.
#
# A separate encoder is fitted on the training columns that also exist in the test set
# (i.e. without SalePrice), so the fitted and transformed frames have the same columns.
# This cell must run before `df` is replaced by the encoded training frame.
# NOTE(review): categories that occur only in the test set are expected to be mapped to
# 'Rare' by the encoder - confirm against the installed feature_engine version.

test_encoder = RareLabelEncoder(tol = 0.01,n_categories=9,replace_with='Rare')
test_encoder.fit(df[temp.columns])
temp = test_encoder.transform(temp)
for features in ['Neighborhood', 'Condition1', 'Exterior1st', 'Exterior2nd' ,'SaleType']:
    temp[features + "_rare"] = np.where(temp[features]=="Rare",1,0)
    

In [None]:
data.head()

In [None]:
df = data

In [None]:
temp.head()

# Temporal Variables

In [None]:
# As seen in earlier EDA prices of the house directly depends on age of the house
year = [features for features in year_feature if features != 'YrSold']
year

In [None]:
for features in year:
    df[features] = df['YrSold'] - df[features]

In [None]:
for features in year:
    temp[features] = temp['YrSold'] - temp[features]

In [None]:
df[year].head()

In [None]:
temp[year].head()

# One Hot Encoding & Label Encoding

In [None]:
# During EDA several categorical features showed a clear relationship with the dependent variable SalePrice.
# NOTE(review): the stated plan was to label-encode those features and one-hot encode the rest, but the cells
# below label-encode every categorical feature and build a fresh LabelEncoder per column. The two encoder
# objects created here are not referenced again in the visible notebook.

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

lebel_encoder = LabelEncoder()
One_encoder = OneHotEncoder(drop='first')

In [None]:
df.head()

In [None]:
# categorical features which only have Two sub categories
two_subcat = [features for features in categorical_features if len(df[features].unique())==2]
df[two_subcat]

In [None]:
# Getting all the elements which has only two unique categories inside them
two_subcat_temp = [features for features in categorical_features if len(temp[features].unique())==2]
temp[two_subcat_temp]

In [None]:
# Categorical features with exactly two sub categories become a 0/1 column:
# 1 for the category that appears first in the column, 0 for the other one.
data = df.copy()

for features in two_subcat:
    first_seen = data[features].iloc[0]
    data[features] = (data[features] == first_seen).astype(int)

In [None]:
# Binary-encode the test data with the SAME columns and the SAME positive category as the
# training data. Deriving both from the test set itself (its own two-category columns and its
# own first value) can flip the meaning of 0/1 between train and test, or encode different columns.
# `df` still holds the un-encoded training categories at this point (the encoded copy is `data`),
# so the category that appears first in the training column is the one mapped to 1.
for features in two_subcat:
    positive_label = df[features].unique()[0]
    temp[features] = np.where(temp[features]==positive_label,1,0)

In [None]:
# Label-encode every categorical feature of the training copy, one fresh encoder per column
for c in categorical_features:
    label_encoder = LabelEncoder()
    column_values = list(data[c].values)
    data[c] = label_encoder.fit(column_values).transform(column_values)

In [None]:
# Encode the test data with the integer codes that were assigned to the TRAINING data.
# Fitting a new LabelEncoder on the test set gives every category a code based on the test
# set's own category list, so the same category can receive different integers in train and test.
#
# At this point `df` still holds the original training categories and `data` holds their
# encoded values row by row, so pairing the two recovers the exact training mapping
# (for the binary 0/1 columns as well as the label-encoded ones).
# This cell must run before `df` is replaced by `data`.
for c in categorical_features:
    if temp[c].dtype != 'O':
        continue  # Already numeric (converted to 0/1 in the two-category step)
    train_codes = dict(zip(df[c], data[c]))
    # Categories never seen in training get the sentinel code -1
    temp[c] = temp[c].map(train_codes).fillna(-1).astype(int)

In [None]:
# Giving back the dataset after manupulation
df = data

In [None]:
df[categorical_features].head()

In [None]:
df.sample(10)

In [None]:
# Dropping Sale Price from X_train
X_train = df.drop('SalePrice',axis=1)

In [None]:
# Getting all the features except ID and SALE PRICE because we need them as column names after feature scaling
features = [features for features in df.columns if features not in ['SalePrice','Id']]

In [None]:
X_train.shape

In [None]:
temp.shape

# Feature Scaling

In [None]:
# Standardise the features with StandardScaler; the scaler is fitted on the training predictors (X_train)
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train)

In [None]:
# Converting the array returned by sclaer to dataframe
temp = pd.DataFrame(scaler.transform(temp),columns=features)

In [None]:
# Converting the array returned by sclaer to dataframe for X_train
X_train = pd.concat([df['SalePrice'].reset_index(drop=True),pd.DataFrame(scaler.transform(X_train),columns=features)],axis=1)

In [None]:
X_train.head()

In [None]:
# Getting all the dependent feture values
y_train = X_train['SalePrice'].values

In [None]:
# Dropping Sale Price from tain data
X_train = X_train.drop('SalePrice',axis=1)

# Feature Selection

In [None]:
# Using Lasso for feature selection
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
feature_sel_model = SelectFromModel(Lasso(alpha=0.005,random_state=0))
feature_sel_model.fit(X_train,y_train)

In [None]:
# Getting all the important features
feature_sel_model.get_support()

In [None]:
# A total of 23 features are selected
selected = X_train.columns[(feature_sel_model.get_support())]
len(selected)

In [None]:
# All the important features
selected

In [None]:
# Getting all the selected features in X_train
X_train = X_train[selected]

In [None]:
X_train.shape

In [None]:
X_train.head()

In [None]:
# Getting all the selected features in X_test variables
X_test = temp[selected]

In [None]:
X_test.shape

In [None]:
X_test.head()

In [None]:
# We will use cross validation for testing models
from sklearn.model_selection import KFold,cross_val_score
from sklearn.metrics import make_scorer,r2_score

def test_model(model, X_train=None, y_train=None):
    """Score ``model`` with 10-fold cross-validated R^2.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.
    X_train, y_train : optional
        Predictors and target. When omitted, the notebook-level ``X_train`` / ``y_train``
        are looked up at call time. Binding them as default arguments would freeze whatever
        they held when this cell was executed, and would fail if the cell ran before they existed.

    Returns
    -------
    list
        A one-element list holding the mean R^2 over the folds.
    """
    if X_train is None:
        X_train = globals()['X_train']
    if y_train is None:
        y_train = globals()['y_train']

    # Shuffled folds with a fixed seed, so every model is scored on the same splits
    cv = KFold(n_splits=10,shuffle=True,random_state=45)
    r2 = make_scorer(r2_score)
    r2_val_score = cross_val_score(model,X_train,y_train,cv=cv,scoring=r2)
    score = [r2_val_score.mean()]
    return score

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2, where passing it
# raises a TypeError. An ordinary least-squares fit gives the same predictions with or without
# column scaling, and the features were already standardised above, so it is simply dropped.
Lr = LinearRegression(n_jobs=-1)

In [None]:
test_model(Lr)

# Ridge Regression

In [None]:
# `normalize=True` was removed in scikit-learn 1.2. It used to centre each column and divide it by
# its l2 norm, which for the already standardised features equals sqrt(n_samples). Folding that
# rescaling into the penalty gives an equivalent ridge problem with alpha multiplied by n_samples
# (exact on the full training set, approximate inside the cross-validation folds).
Rdg = Ridge(alpha = 1e-4 * len(X_train))
test_model(Rdg)

# Lasso Regression

In [None]:
# `normalize=True` was removed in scikit-learn 1.2. It used to centre each column and divide it by
# its l2 norm, which for the already standardised features equals sqrt(n_samples). Folding that
# rescaling into the l1 penalty gives an equivalent lasso problem with alpha multiplied by
# sqrt(n_samples) (exact on the full training set, approximate inside the cross-validation folds).
Las = Lasso(alpha = 1e-4 * np.sqrt(len(X_train)))
test_model(Las)

# Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
DCR = DecisionTreeRegressor(max_depth=5)

In [None]:
test_model(DCR)

# Support Vector Machine

In [None]:
from sklearn.svm import SVR
Svm = SVR(kernel='rbf')

In [None]:
test_model(Svm)

# LightGBM (Light Gradient Boosting)

In [None]:
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor()
test_model(lgbm)

# XGBoost Regressor

In [None]:
import xgboost
# The keyword is `booster`; the misspelt `bbooster` is not a known parameter, so the intended
# setting was never applied under that name.
xgb = xgboost.XGBRegressor(booster='gbtree',random_state=0)

In [None]:
test_model(xgb)

# Building The Lasso Model

In [None]:
Las.fit(X_train,y_train)
y_pred = np.exp(Las.predict(X_test)).round(2)

In [None]:
y_pred

# HyperParameter Tuning

# SVM

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
params = {'kernel': ['rbf'],
         'gamma': [1,0.1,0.01,0.001,0.0001,0.0002,0.0003,0.0004],
         'C': [0.1,1,10,20,100,1000],
         'epsilon': [1,0.2,0.1,0.01,0.001,0.02,0.003,0.004,0.005,0.006,0.007,0.008,0.0001]}

In [None]:
# Randomised search over the SVR grid with 11-fold cross-validation.
# A fixed random_state makes the sampled parameter combinations, and therefore the reported
# best score, reproducible from one run to the next.
rand_search = RandomizedSearchCV(Svm , param_distributions=params , n_jobs=-1 , cv=11 , random_state=0)
rand_search.fit(X_train,y_train)
rand_search.best_score_

In [None]:
Svm2 = SVR(kernel='rbf',C= 20, epsilon= 0.008, gamma=0.0003)
test_model(Svm2)

# XGBoost

In [None]:
# Tuned XGBoost regressor.
# 'reg:squarederror' is the current name of the squared-error objective formerly called
# 'reg:linear' (deprecated). `n_jobs` and `random_state` are the scikit-learn style
# parameters that replace the older `nthread` and `seed` aliases.
xgb3 = xgboost.XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:squarederror', n_jobs=-1,
                                     scale_pos_weight=1, random_state=27,
                                     reg_alpha=0.00006)

test_model(xgb3)

In [None]:
xgb3.fit(X_train,y_train)
y_pred_xgb3 = np.exp(xgb3.predict(X_test)).round(2)

In [None]:
# Pair every test Id with its predicted sale price, side by side, under the submission column names
predicted_prices = pd.DataFrame(y_pred_xgb3)
submit_test1 = pd.concat([ID, predicted_prices], axis=1).set_axis(['Id','SalePrice'], axis=1)

In [None]:
submit_test1

In [None]:
# Submitting the output
submit_test1.to_csv('submission.csv',index=False)

# Give this notebook an upvote if it was helpful to you in any way.
# Thank you!