In [None]:
#importing all the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector
from statsmodels.stats.outliers_influence import variance_inflation_factor
import os # accessing directory structure
pd.set_option('display.max_columns',None)   #code to dispaly all the columns in the dataframe
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')
%matplotlib inline

In [None]:
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, fname)
    for folder, _subfolders, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Read kc_house_data.csv from the Kaggle input directory into the working DataFrame `df`
df = pd.read_csv('/kaggle/input/housesalesprediction/kc_house_data.csv')

In [None]:
# Preview the first rows of the raw data
df.head()

In [None]:
# (rows, columns) of the raw data
df.shape
# recorded output: 21613 rows, 21 columns

In [None]:
# Summary of the columns of the raw data
df.info()

In [None]:
# Show the rows for which no column is flagged as real-valued by np.isreal.
# `axis` is passed by keyword: in pandas 2.x the arguments of DataFrame.any are
# keyword-only, so the positional form `.any(1)` raises a TypeError there.
# NOTE(review): np.isreal is applied to each column as a whole (DataFrame.apply), and
# `.any` means a row is listed only when *none* of its columns is real - confirm that
# this is the check that was intended.
df[~df.apply(np.isreal).any(axis=1)]
# observation from the original run: there are no non-real values in any column

In [None]:
# Only the year is needed: it is the first four characters of the `date` string
year_text = df['date'].str[:4]
df['new_date'] = year_text

In [None]:
# `new_date` now carries the year, so `date` is dropped; `id` is dropped along with it
df.drop(columns=['date', 'id'], inplace=True)

In [None]:
# The year was sliced out of a string column; convert it to integers for arithmetic
df['new_date'] = df['new_date'].astype(int)

In [None]:
# Check the column dtypes after the date handling above
df.dtypes

In [None]:
# Add an empty `age` column (how old the house is); it is filled in by the next cell.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df['age'] = np.nan

In [None]:
# Age of the house at `new_date`: counted from `yr_renovated` when the house has been
# renovated (yr_renovated != 0), otherwise from `yr_built`.
# Computed column-wise instead of assigning row by row through `df['age'][i] = ...`:
# that chained-indexing assignment triggers SettingWithCopyWarning, does not write to
# `df` once pandas copy-on-write is active, and used the positional counter `i` as an
# index label (correct only for a default 0..n-1 index).
was_renovated = df['yr_renovated'] != 0
reference_year = df['yr_renovated'].where(was_renovated, df['yr_built'])
# cast to float64 so the column keeps the dtype it had when it was filled into a NaN column
df['age'] = (df['new_date'] - reference_year).astype('float64')

In [None]:
# yr_built, yr_renovated and new_date were only needed to derive `age`; drop them now
df.drop(columns=['yr_built', 'yr_renovated', 'new_date'], inplace=True)

In [None]:
# The location columns (lat, zipcode, long) are not used in this analysis
df.drop(columns=['lat', 'zipcode', 'long'], inplace=True)

In [None]:
# Plot the distribution of every column, one figure per column
for column_name, column_values in df.items():
    sns.distplot(column_values)
    plt.show()

In [None]:
# Skewness of every column
df.skew()

In [None]:
#Almost all the columns are skewed
#sqft_lot15 and sqft_lot in particular are highly right-skewed

In [None]:
# Pairwise scatter plots of all columns, with KDE plots on the diagonal.
# sns.pairplot is a figure-level function that builds its own figure, so no
# plt.figure(...) call precedes it: that would only leave an empty extra figure behind.
sns.pairplot(df, diag_kind='kde')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
corr_fig, corr_ax = plt.subplots(figsize=(20, 10))
correlations = df.corr()
sns.heatmap(correlations, annot=True, ax=corr_ax)
plt.show()

In [None]:
# sqft_living has a positive linear relationship with the price
# sqft_above has a positive linear relationship with the price

In [None]:
#sqft_living15, sqft_above, grade, sqft_living and bathrooms seem to be correlated

In [None]:
# Check every column for outliers with one box plot per column.
# The values are passed as `x=` explicitly: what the first positional argument of
# sns.boxplot means differs between seaborn versions, so the keyword keeps the same
# horizontal box plot everywhere.
for i in df.columns:
    sns.boxplot(x=df[i])
    plt.show()

In [None]:
#The bedroom column has some outliers
#The bathroom column has some outliers
#The sqft_living column has outliers
#The sqft_lot column has a lot of outliers
#The floor column has no outliers
#The grade column has many outliers
#The sqft_above column has a lot of outliers
#The sqft_basement column has a lot of outliers
#The sqft_living15 column has a lot of outliers
#The sqft_lot15 column has a lot of outliers
#The age column has no outliers

### Building a base model with all the features

In [None]:
# Target is `price`; every remaining column is used as a feature
y = df['price']
X = df.drop(columns='price')

In [None]:
# Empty DataFrame that will hold the variance inflation factor (VIF) of every feature column
vif_df=pd.DataFrame()

In [None]:
# One VIF per feature column, computed on the raw feature matrix.
# NOTE(review): no intercept column is added to X before the VIFs are computed - confirm
# that this is intended.
feature_matrix = X.values
n_features = len(X.columns)
vif_df['Columns'] = X.columns
vif_df['VIF'] = [
    variance_inflation_factor(feature_matrix, col_idx)
    for col_idx in range(n_features)
]

In [None]:
# Display the VIF table
vif_df

In [None]:
#Bedroom, bathroom, floors, condition, grade and sqft_living have high multicollinearity
#sqft_living, sqft_above and sqft_basement have very high multicollinearity

In [None]:
# Split the data into a training set (75%) and a test set (25%); the seed is fixed so the split is reproducible
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.25,random_state=10)

In [None]:
# (rows, columns) of the training features
xtrain.shape

In [None]:
# number of training targets
ytrain.shape

In [None]:
# (rows, columns) of the test features
xtest.shape

In [None]:
# number of test targets
ytest.shape

In [None]:
# Linear regression preceded by standard scaling and a Yeo-Johnson power transform.
# The preprocessing steps are fitted on the training set only; the score is computed on the test set.
scaled_lr_steps = [
    ('scale', StandardScaler()),
    ('transform', PowerTransformer(method='yeo-johnson')),
    ('lr', LinearRegression()),
]
pipe = Pipeline(steps=scaled_lr_steps).fit(xtrain, ytrain)
pipe.score(xtest, ytest)
# observation from the original run: the performance of the model is not good

In [None]:
# Plain linear regression on the untransformed features; the test-set score is kept in lr_score
pipe = Pipeline(steps=[('lr', LinearRegression())]).fit(xtrain, ytrain)
lr_score = pipe.score(xtest, ytest)
# observation from the original run: the model performs better without scaling and transformation

In [None]:
# Lasso regression with default settings; the test-set score is kept in lasso_score
pipe = Pipeline(steps=[('lasso', Lasso())]).fit(xtrain, ytrain)
lasso_score = pipe.score(xtest, ytest)
# observation from the original run: no improvement with the lasso model either

In [None]:
# Ridge regression with default settings; the test-set score is kept in ridge_score.
# The pipeline step is named 'ridge' to match the estimator it holds (it was labelled
# 'lasso', a leftover from the previous cell).
pipe = Pipeline([
    ('ridge', Ridge()),
])
pipe.fit(xtrain, ytrain)
ridge_score = pipe.score(xtest, ytest)
# observation from the original run: no improvement with the ridge model either

In [None]:
# Decision tree regression on power-transformed features; the test-set score is kept in dt_score.
# random_state is fixed so that re-running the notebook reproduces the same score
# (10 matches the seed already used for train_test_split).
pipe = Pipeline([
    ('pt', PowerTransformer()),
    ('dt', DecisionTreeRegressor(random_state=10)),
])
pipe.fit(xtrain, ytrain)
dt_score = pipe.score(xtest, ytest)
# observation from the original run: no improvement with the decision tree model either

In [None]:
# Random forest regression on power-transformed features; the test-set score is kept in rf_score.
# random_state is fixed so that re-running the notebook reproduces the same score
# (10 matches the seed already used for train_test_split).
pipe = Pipeline([
    ('pt', PowerTransformer()),
    ('rf', RandomForestRegressor(random_state=10)),
])
pipe.fit(xtrain, ytrain)
rf_score = pipe.score(xtest, ytest)
# observation from the original run: the model performed better with random forest

In [None]:
# Gradient boosting regression on power-transformed features; the test-set score is kept in gb_score.
# random_state is fixed so that re-running the notebook reproduces the same score
# (10 matches the seed already used for train_test_split).
pipe = Pipeline([
    ('pt', PowerTransformer()),
    ('gb', GradientBoostingRegressor(random_state=10)),
])
pipe.fit(xtrain, ytrain)
gb_score = pipe.score(xtest, ytest)
# observation from the original run: there is an increase in the performance

In [None]:
# AdaBoost regression on the untransformed features; the test-set score is kept in adgb_score.
# random_state is fixed so that re-running the notebook reproduces the same score
# (10 matches the seed already used for train_test_split).
pipe = Pipeline([
    ('adaboost', AdaBoostRegressor(random_state=10)),
])
pipe.fit(xtrain, ytrain)
adgb_score = pipe.score(xtest, ytest)
# observation from the original run: the performance degraded with the AdaBoost model

In [None]:
# K-nearest-neighbours regression on the untransformed features; the test-set score is kept in knn_score.
# The pipeline step is named 'knn' to match the estimator it holds (it was labelled
# 'adaboost', a leftover from the previous cell).
# NOTE(review): K-nn is distance based and is fitted here on unscaled features - consider
# whether a scaling step should precede it.
pipe = Pipeline([
    ('knn', KNeighborsRegressor()),
])
pipe.fit(xtrain, ytrain)
knn_score = pipe.score(xtest, ytest)
# observation from the original run: the performance degraded with the K-nn model

In [None]:
# Model labels (l1) and their test-set scores (l2), in matching order.
# The first label is spelled 'LinearRegression' (it read 'LinerRegression'); it is shown
# in the score table and on the bar plot axis.
l1 = ['LinearRegression', 'Lasso', 'Ridge', 'DecisionTree', 'RandomForest', 'GradientBoost', 'AdaBoost', 'K-nn']
l2 = [lr_score, lasso_score, ridge_score, dt_score, rf_score, gb_score, adgb_score, knn_score]

In [None]:
# One row per model: its label and its test-set score
score_rows = list(zip(l1, l2))
score_df = pd.DataFrame(score_rows, columns=['Models', 'Score'])

In [None]:
# Display the model comparison table
score_df

In [None]:
# Bar plot of the test-set score of every model.
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally
# (the positional call raises a TypeError there); the keyword form works on older
# versions as well.
plt.figure(figsize=(10, 5))
sns.barplot(x=score_df['Models'], y=score_df['Score'])
plt.xticks(rotation=90)  # the model names are long, so the tick labels are rotated
plt.show()

In [None]:
#as we can see from the graph, random forest and gradient boosting perform better than the other models, with almost equal scores
#so we perform cross-validation to find the better of the two

In [None]:
# 10-fold cross-validation (R^2 scoring) of a default RandomForestRegressor on the full data set.
# random_state is fixed so the fold scores are reproducible between runs
# (10 matches the seed already used for train_test_split).
randomforest_score = cross_val_score(
    estimator=RandomForestRegressor(random_state=10),
    X=X,
    y=y,
    cv=10,
    scoring='r2',
)

In [None]:
# per-fold R^2 scores for RandomForestRegressor
randomforest_score

In [None]:
# mean cross-validation score for RandomForestRegressor
np.mean(randomforest_score)

In [None]:
# 10-fold cross-validation (R^2 scoring) of a default GradientBoostingRegressor on the full data set.
# random_state is fixed so the fold scores are reproducible between runs
# (10 matches the seed already used for train_test_split).
gradientboosting_score = cross_val_score(
    estimator=GradientBoostingRegressor(random_state=10),
    X=X,
    y=y,
    cv=10,
    scoring='r2',
)

In [None]:
# per-fold R^2 scores for GradientBoostingRegressor
gradientboosting_score

In [None]:
# mean cross-validation score for GradientBoostingRegressor
np.mean(gradientboosting_score)

In [None]:
#Of RandomForestRegressor and GradientBoostingRegressor, RandomForestRegressor performs better, so we tune its hyperparameters for a better result

In [None]:
# Grid search over the number of trees and the maximum tree depth of a random forest,
# using 10-fold cross-validation with R^2 scoring on all available cores.
# random_state is fixed on the estimator so that differences between grid points come
# from the hyperparameters, not from random variation between fits
# (10 matches the seed already used for train_test_split).
params = {
    'n_estimators': [10, 20, 50, 100, 200, 500],
    'max_depth': [2, 5, 8, 9, 12],
}
grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=10),
    param_grid=params,
    cv=10,
    scoring='r2',
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training set only (the test set stays untouched)
grid.fit(xtrain, ytrain)

In [None]:
# best mean cross-validated score found by the grid search
grid.best_score_

In [None]:
# hyperparameter combination that achieved the best score
grid.best_params_

In [None]:
#final model with RandomForestRegressor

In [None]:
# Features and target for the final model: `price` is the target, all other columns are features
y1 = df['price']
X1 = df.drop(columns=['price'])

In [None]:
# Same 75/25 split and seed as the base-model split above
xtrain1,xtest1,ytrain1,ytest1=train_test_split(X1,y1,test_size=0.25,random_state=10)

In [None]:
# Final model: random forest on power-transformed features, scored on the test set.
# NOTE(review): n_estimators and max_depth are hard-coded - presumably taken from
# grid.best_params_; confirm they still match the grid search result.
# random_state is fixed so the reported score is reproducible between runs
# (10 matches the seed already used for train_test_split).
pipe = Pipeline([
    ('pt', PowerTransformer()),
    ('rf', RandomForestRegressor(n_estimators=200, max_depth=12, random_state=10)),
])
pipe.fit(xtrain1, ytrain1)
pipe.score(xtest1, ytest1)

In [None]:
#Final model: RandomForestRegressor, with a test-set R² score of about 0.7448 (pipe.score returns R² for regressors, not a classification accuracy)

In [None]:
# Preview the data the final model was built on
df.head()

In [None]:
# Predict the prices of the test-set houses with the fitted final pipeline
price_pred=pipe.predict(xtest1)

In [None]:
# Side-by-side comparison of the actual and the predicted prices on the test set.
# The table gets its own name so that the housing DataFrame `df` is not overwritten:
# rebinding `df` here would break every earlier cell that is re-run afterwards.
pred_df = pd.DataFrame({'price_actual': ytest1, 'price_predicted': price_pred})
pred_df.head()