In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation notices emitted by pandas / seaborn / scikit-learn.
warnings.filterwarnings(action='ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns',None)

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Number of rows to load; specify None to read the whole file.
# kc_house_data.csv has 21613 rows in reality, but only the first 1000 rows
# are loaded/previewed here, so everything downstream runs on that subset.
nRowsRead = 1000
df = pd.read_csv(
    '/kaggle/input/housesalesprediction/kc_house_data.csv',
    delimiter=',',
    nrows=nRowsRead,
)
# Tag the frame with its source file name (plain Python attribute, not a column).
df.dataframeName = 'kc_house_data.csv'
nRow = df.shape[0]
nCol = df.shape[1]
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
#calculating types of dtypes

In [None]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

In [None]:
#extracting the date part from the datetime string

In [None]:
# Keep only the first eight characters of every date string.
# NOTE: not idempotent - re-running this cell after `date` has been converted
# to datetime fails because `.str` only works on string data.
df['date'] = df['date'].str.slice(0, 8)

In [None]:
# Display the `date` column to check the result of the previous step.
df['date']

In [None]:
#converting into datetime for further analysis

In [None]:
# The date strings were truncated to their first 8 characters, i.e. a compact
# year-month-day stamp with no separators (assumes raw values such as
# '20141013T000000' - confirm against the CSV). The format must therefore be
# "%Y%m%d": the previous "%Y/%m/%d" only worked on pandas < 2.0, which parsed
# leniently; pandas >= 2.0 enforces the format and raises ValueError.
df['date']=pd.to_datetime(df['date'],format="%Y%m%d")

In [None]:
#AFTER THE CONVERSION, CHECK THE DATATYPES AGAIN

In [None]:
# Re-inspect the column dtypes now that `date` has been reassigned.
df.dtypes

In [None]:
#CHECKING FOR ANY SYMBOLS OR SPECIAL CHARACTERS

In [None]:
# Show the rows in which at least one cell fails `np.isreal`
# (e.g. stray symbols or text in a numeric column).
# NOTE(review): `DataFrame.applymap` is deprecated in recent pandas releases.
df.loc[~df.applymap(np.isreal).all(axis=1)]

In [None]:
# Pairwise scatter plots of the frame's columns, with KDE curves on the diagonal.
sns.pairplot(df,diag_kind='kde')

In [None]:
#INFERENCES FROM THE PAIRPLOT

In [None]:
#price and #sqft living are almost linearly related
#price and #sqft above are almost linearly related
#sqft above and #sqft living are almost linearly related
#sqft living and sqft_basement

In [None]:
#CHECKING FOR OUTLIERS

In [None]:
# Box plots of all columns on one shared axis; the tick labels are
# rotated so the column names stay readable.
df.plot(kind='box', figsize=(20, 10))
plt.xticks(rotation=45)
plt.show()

In [None]:
# One separate box plot per column, skipping the first two columns,
# so each variable is drawn on its own scale.
for column in df.columns[2:]:
    df[column].plot(kind='box')
    plt.show()

In [None]:
#FROM ABOVE WE CAN SAY THAT WATERFRONT AND VIEW ARE CATEGORICAL COLUMNS
#BATHROOM HAS A VALUE OF 33 THAT CAN BE DROPPED WHICH IS A EXTREME OUTLIER
#OTHER COLUMNS HAS OUTLIERS TOO

In [None]:
#Checking for skewness

In [None]:
# Skewness of every numeric column. `numeric_only=True` is required because
# the frame now contains a datetime `date` column: pandas >= 2.0 no longer
# drops non-numeric columns silently and raises TypeError instead.
df.skew(numeric_only=True)

In [None]:
#sqft_lot and sqft_lot15 are highly right-skewed

In [None]:
#CHECKING FOR NULL VALUES

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# EXTRACTING YEAR FROM THE DATE COLUMN AND CREATING ANOTHER COLUMN CONTAINING ONLY YEAR 

In [None]:
# `present` holds the calendar year taken from each row's `date` value
# (requires `date` to already be a datetime column).
df['present']=df['date'].dt.year

In [None]:
# Preview the frame again to confirm the new `present` column was added.
df.head()

In [None]:
#CREATING A NEW COLUMN AGE TO CHECK HOW OLD THE HOUSE IS, FOR FURTHER ANALYSIS

In [None]:
# First pass at the age: years elapsed between `yr_renovated` and `present`.
df['age'] = df['present'].sub(df['yr_renovated'])

In [None]:
# An "age" above 500 years is not plausible (presumably rows whose
# `yr_renovated` is 0 - confirm against the data), so for those rows the age
# is measured from `yr_built` instead.
# A single boolean-mask `.loc` assignment replaces the former row-by-row
# chained assignment `df['age'][i] = ...`, which is not guaranteed to write
# into `df` (it is a no-op under pandas copy-on-write) and was slow.
implausible_age = df['age'] > 500
df.loc[implausible_age, 'age'] = (
    df.loc[implausible_age, 'present'] - df.loc[implausible_age, 'yr_built']
)
   

In [None]:
# Annotated correlation heatmap of the numeric columns.
# The numeric columns are selected explicitly because the frame contains a
# datetime `date` column and pandas >= 2.0 no longer restricts `corr()` to
# numeric data by default; `select_dtypes` behaves the same on every version.
plt.figure(figsize=(20,10))
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)
plt.show()

In [None]:
#'grade' and 'sqft_above' are highly correlated with price, but they are also correlated with other columns of the dataset
#sqft_living is highly correlated with price

In [None]:
## STARTING BUILDING THE MODEL 

In [None]:
# Modelling frame: drop the identifier, the raw date, the location columns and
# the two year columns that were already folded into `age`.
df1 = df.drop(
    columns=['yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'date', 'id']
)
df1.head()

In [None]:
# Target is `price`; every remaining column of `df1` is a predictor.
y = df1['price']
X = df1.drop(columns='price')

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every predictor in X, one row per column.
vif = pd.DataFrame(
    {
        'columns': X.columns,
        'vif': [
            variance_inflation_factor(X.values, position)
            for position in range(X.shape[1])
        ],
    }
)
vif

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,PowerTransformer
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.pipeline import Pipeline

In [None]:
# 75 % / 25 % hold-out split with a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, train_size=0.75, random_state=1
)
# Standardised copies of both splits; the scaler is fitted on the training
# part only and then applied to the test part.
ss = StandardScaler()
ss.fit(xtrain)
xtrains = ss.transform(xtrain)
xtests = ss.transform(xtest)

In [None]:
#present and grade are having maximum multicollinearity also,sqft_living,sqft_above,sqft_basement having multicollinearity
#inf 

In [None]:
# Baseline: plain linear regression on the unscaled features.
# `a` is the score on the held-out split (R^2 for scikit-learn regressors).
pipe = Pipeline(steps=[('lr', LinearRegression())])
pipe.fit(xtrain, ytrain)
a = pipe.score(xtest, ytest)
a

In [None]:
# Linear regression preceded by standardisation and a power transform;
# the held-out score is displayed but not stored.
pipe = Pipeline(
    steps=[
        ('ss', StandardScaler()),
        ('pt', PowerTransformer()),
        ('lr', LinearRegression()),
    ]
)
pipe.fit(xtrain, ytrain)
pipe.score(xtest, ytest)

In [None]:
# using PowerTransformer the score is getting worse

In [None]:
#so via linear model highest score that we get is 0.64 
#lets try to improve that score via various models

In [None]:
# L1-regularised linear model (Lasso) on standardised features.
# `b` is the score on the held-out split.
pipe = Pipeline(
    steps=[
        ('ss', StandardScaler()),
        ('lasso', Lasso()),
    ]
)
pipe.fit(xtrain, ytrain)
b = pipe.score(xtest, ytest)
b

In [None]:
# L2-regularised linear model (Ridge) on standardised features.
# `c` is the score on the held-out split.
pipe = Pipeline(
    steps=[
        ('ss', StandardScaler()),
        ('ridge', Ridge()),
    ]
)
pipe.fit(xtrain, ytrain)
c = pipe.score(xtest, ytest)
c

In [None]:
#regularization is not helping much
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor

In [None]:
# Support vector regression with default settings on standardised features.
# `d` is the score on the held-out split.
pipe = Pipeline(
    steps=[
        ('ss', StandardScaler()),
        ('svr', SVR()),
    ]
)
pipe.fit(xtrain, ytrain)
d = pipe.score(xtest, ytest)
d

In [None]:
# Decision tree on standardised, power-transformed features.
# `random_state=1` (the seed used by the other seeded calls in this notebook)
# makes the score reproducible; an unseeded tree can change between runs.
# `e` is the score on the held-out split.
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('dtr',DecisionTreeRegressor(random_state=1))
])
pipe.fit(xtrain,ytrain)
e=pipe.score(xtest,ytest)
e

In [None]:
# Random forest on power-transformed features.
# Seeded with `random_state=1` so the score is reproducible from run to run;
# the forest's bootstrap sampling is otherwise random.
# `f` is the score on the held-out split.
pipe=Pipeline([
('pt',PowerTransformer()),
('rfr',RandomForestRegressor(random_state=1))
])
pipe.fit(xtrain,ytrain)
f=pipe.score(xtest,ytest)
f

In [None]:
# AdaBoost on power-transformed features.
# Seeded with `random_state=1` so the score is reproducible from run to run.
# `g` is the score on the held-out split.
pipe=Pipeline([
('pt',PowerTransformer()),
('adr',AdaBoostRegressor(random_state=1))
])
pipe.fit(xtrain,ytrain)
g=pipe.score(xtest,ytest)
g

In [None]:
# Gradient boosting on power-transformed features.
# The step is named 'gbr' (it was a copy-pasted 'adr', the AdaBoost label) and
# the estimator is seeded with `random_state=1` so the score is reproducible.
# `h` is the score on the held-out split.
pipe=Pipeline([
('pt',PowerTransformer()),
('gbr',GradientBoostingRegressor(random_state=1))
])
pipe.fit(xtrain,ytrain)
h=pipe.score(xtest,ytest)
h

In [None]:
# Collect the held-out scores of all fitted models into one table;
# the labels in `l1` are listed in the same order as the scores in `l`.
l1 = [
    'LinearRegression',
    'Lasso',
    'Ridge',
    'SVR',
    'DecisionTreeRegressor',
    'RandomForestRegressor',
    'AdaBoostRegressor',
    'GradientBoostingRegressor',
]
l = [a, b, c, d, e, f, g, h]
model = pd.DataFrame({'models': l1, 'score': l})
model

In [None]:
# Bar chart of the model scores, sorted from worst to best.
# x / y are passed by keyword: seaborn >= 0.12 rejects positional x and y
# with a TypeError, while the keyword form works on every seaborn version.
plt.figure(figsize=(15,10))
model=model.sort_values(ascending=True,by='score')
sns.barplot(data=model,x='models',y='score')
plt.xticks(rotation=60)
plt.show()

In [None]:
# from here we can say gradient boosting and random forest are performing better, so we can do hyperparameter tuning on them

In [None]:
from sklearn.model_selection import cross_val_score,GridSearchCV

In [None]:
# Estimators used for the cross-validation comparison. Both are seeded with
# `random_state=1` so the cross-validated scores are reproducible.
rfr=RandomForestRegressor(random_state=1)
gb=GradientBoostingRegressor(random_state=1)

In [None]:
# 10-fold cross-validated R^2 of the random forest on the full X / y,
# using all available CPU cores.
cross = cross_val_score(
    estimator=rfr, X=X, y=y, scoring='r2', cv=10, n_jobs=-1
)
cross

In [None]:
# Average of the per-fold scores currently stored in `cross`.
cross.mean()

In [None]:
# 10-fold cross-validated R^2 of gradient boosting on the full X / y
# (overwrites the previous contents of `cross`).
cross = cross_val_score(
    estimator=gb, X=X, y=y, scoring='r2', cv=10, n_jobs=-1
)
cross

In [None]:
# Average of the per-fold scores currently stored in `cross`.
cross.mean()

In [None]:
# CV score of Random forest is much better so its hyperparameter tuning should be done

In [None]:
# Exhaustive grid search over forest size and tree depth for a seeded random
# forest, scored with 8-fold cross-validation on the training split.
# 6 x 6 combinations x 8 folds: expect this cell to take a while.
rfr = RandomForestRegressor(random_state=1)
param = [
    {
        'n_estimators': [20, 50, 100, 500, 800, 1000],
        'max_depth': [2, 4, 5, 7, 8, 10],
    }
]
grid = GridSearchCV(rfr, param, cv=8, n_jobs=-1)
grid.fit(xtrain, ytrain)

In [None]:
# Best mean cross-validated score found by the grid search.
grid.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid.best_params_

In [None]:
#building the model with the best parameters

In [None]:
# Re-create the same 75 % / 25 % split (same seed as before) and fit a seeded
# random forest with the chosen hyperparameters.
# NOTE(review): n_estimators / max_depth are hard-coded rather than read from
# `grid.best_params_` - confirm they match the grid-search result.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, train_size=0.75, random_state=1
)
rfr1 = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=1)
rfr1.fit(xtrain, ytrain)

In [None]:
# Score of the fitted forest on the held-out split.
rfr1.score(xtest,ytest)

In [None]:
#It is not improving score

In [None]:
#Now doing feature selection

In [None]:
# Reduced feature set: the predictors without `condition` and `present`.
# The target is re-read from `df1` unchanged.
y = df1['price']
X1 = X.drop(columns=['condition', 'present'])

In [None]:
# Same 75 % / 25 % seeded split as before, applied to the reduced feature set.
xtrain1, xtest1, ytrain1, ytest1 = train_test_split(
    X1, y, train_size=0.75, random_state=1
)

In [None]:
# Same seeded forest configuration as the previous model, fitted on the
# reduced feature set so the two scores can be compared.
rfr = RandomForestRegressor(random_state=1, max_depth=10, n_estimators=100)
rfr.fit(xtrain1, ytrain1)

In [None]:
# Score of the reduced-feature forest on its held-out split.
rfr.score(xtest1,ytest1)

In [None]:
#dropping features is not improving score,so we will stick to initial features

In [None]:
# Final model: power transform followed by a random forest, fitted on the full
# feature set. The forest is seeded with `random_state=1` so the predictions
# in `ypred` are reproducible from run to run.
pipe=Pipeline([
('pt',PowerTransformer()),
('rfr',RandomForestRegressor(random_state=1))
])
pipe.fit(xtrain,ytrain)
ypred=pipe.predict(xtest)

In [None]:
# Side-by-side table of actual and predicted prices for the held-out rows.
# NOTE: `l` and `l1` are reused here and overwrite the earlier score/label lists.
l1 = list(ytest)
l = list(ypred)
new_price = pd.DataFrame({'Actual_price': l1, 'Predicted_price': l})
new_price.head(10)