In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Installs a blanket "ignore" filter: every warning category is silenced from
# here on, including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read the house sales CSV and display its (rows, columns) shape.
DATA_PATH = '/kaggle/input/housesalesprediction/kc_house_data.csv'
df = pd.read_csv(DATA_PATH)
df.shape

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

#### Few of the features are not identified with the right datatype, so changing them

In [None]:
# Store the coded columns with object dtype so they are handled as categoricals.
categorical_columns = ['waterfront', 'view', 'condition', 'grade', 'zipcode']
df = df.astype({column: 'object' for column in categorical_columns})

In [None]:
# Remove the date column from the frame.
df = df.drop(columns=['date'])

In [None]:
# Split the columns by dtype: numeric features vs. object-typed (categorical) ones.
df_num = df.select_dtypes(include=[np.number])
df_cat = df.select_dtypes(include=['object'])

In [None]:
# Count missing values per column
df.isna().sum()

#### So, there are no null values in our dataset

In [None]:
# Count, per column, the cells that hold the placeholder string '?'.
(df == '?').sum()

#### There are no null values in the form of '?' too.

In [None]:
# Render floats with two decimals in every frame displayed from here on.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Summary statistics for the columns pandas treats as describable by default.
df.describe()

#### Inferences from describe:

* The target variable varies between 75000 and 77 lakh. The mean is greater than median, so it is right skewed. We can also observe that there are few luxury houses which have price more than 7 lakh, as we can see by comparing the 75th percentile and the maximum
* The number of bedrooms vary between 0 and 33. The mean and median are almost same. We can see that, the maximum and 75th percentile are very much different, so most of the houses have 3-4 bedrooms and luxury flats alone have in two digits.
* The number of bathrooms vary between 0 and 8. The mean and median are slightly  different, so the distribution should be skewed.
* sqft_living varies between 290 and 13540, so there are small and luxury houses in our dataset And, here too the mean and median are very much different, so the data is skewed.
* sqft_lot varies between 520 to 1 crore, it definitely confirms luxury houses and mansions in our dataset, the mean and median are different, so it is also skewed.
* Number of floors varies between 1 to 3.5 and the mean and median are almost equal
* sqft_above varies between 290 to 9410, and the mean and median are very much different So, the distribution is skewed.
* sqft_basement varies between 0 and 4820. So, very few houses have underground levels and few of them are small and few are big.
* yr_built varies between 1900 to 2015, so there are very old to new houses. But, most of them seem to be little  old. The mean and median are different here too.
* yr_renovated varies between 0 to 2015. So, many of the houses are not renovated and only a few of them have been renovated in recent times.
* sqft_living15 varies between 399 to 6210. The mean and median are highly different, so the distribution is skewed
* sqft_lot15 varies between 651 to 871200, which is clearly right skewed.

### Target Variable

In [None]:
# Number of distinct values in the target column.
df['price'].nunique()

In [None]:
# Distribution of the target shown three ways: histogram/KDE, box plot, violin plot.
plt.rcParams['figure.figsize'] = 18,5
fig, ax = plt.subplots(1, 3)
# NOTE(review): distplot is deprecated in recent seaborn releases; kept so the
# figure is unchanged on the seaborn version this notebook was written for.
sns.distplot(df['price'], ax=ax[0])
# Pass the series as `x=`: newer seaborn treats the first positional argument
# as `data`, which would change how the plot is drawn.
sns.boxplot(x=df['price'], ax=ax[1])
sns.violinplot(x=df['price'], ax=ax[2])
plt.show()

Our target variable is highly skewed.

### Univariate Analysis

In [None]:
# Numerical columns: histogram/KDE, box plot and violin plot for each feature
for num_col in df_num:
    fig, ax = plt.subplots(1, 3)
    print(num_col)
    sns.distplot(df_num[num_col], ax=ax[0], color='Green')
    # Keyword `x=` keeps the horizontal layout on every seaborn version; newer
    # releases would read a positional series as `data` instead.
    sns.boxplot(x=df_num[num_col], ax=ax[1], palette='Greens')
    sns.violinplot(x=df_num[num_col], ax=ax[2], palette='Greens')
    plt.show()

#### Inferences from the univariate analysis of Numerical columns:
* Bedrooms are mostly between 0 and 10; only one house has more than 30, which is an outlier
* Number of bathrooms is mainly between 0 to 4, and it varies till 8
* sqft_living is right skewed too, with few observations having higher value of sqft_living
* sqft_lot is highly right skewed, with most of the data in the outliers
* Number of floors is mostly between 1 and 2, with a maximum of 3.5
* sqft_above has most of the values between 0 to 4000 and the values lead till 10000
* sqft_basement is right skewed too, with most of the houses with less than 1500 and few houses with more than that
* yr_built, as seen from the describe function, varies from 1900 to 2015
* yr_renovated has only two values, which is 0 and 2000, which means that most of the houses are not renovated and few were renovated in 2000
* lat is varying between 47.1 to 47.8, so we can say that these houses are in a particular region
* long varies between -122.6 and -121.2; this too confirms that these houses are from nearby regions
* sqft_living15 is varying between 0 to 6000 and mainly in the range till 4000, it is also right skewed
* sqft_lot15 is highly right skewed.


In [None]:
# Categorical columns: level frequencies as a bar chart and as a pie chart
plt.rcParams['figure.figsize'] = 9,4
for cat_col in df_cat:
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2)
    print(cat_col)
    level_counts = df_cat[cat_col].value_counts()
    level_counts.plot(kind='bar', rot=0, ax=bar_ax, cmap='Spectral')
    level_counts.plot(kind='pie', autopct='%.1f%%', ax=pie_ax, cmap='Spectral')
    plt.show()

#### Inferences from the categorical features:
* In waterfront column, most of the values is 0 and only 0.8% of people's apartment was overlooking the waterfront, so it is a good factor
* In view column too, most of the values are 0 and 2 is second highest, so the houses are not having that good of a view
* In condition column, most of the values are 3, followed by 4, so almost all the houses are in good condition, only around 10% of houses are not in good condition
* In grade column, most of the values, around 40% have given grade of 7 followed by 8, and 9. So, these are good houses
* zipcode has around 70 unique values and most of them contribute almost equally.

### Bivariate Analysis

In [None]:
# Numerical with Target variable: one scatter plot per numeric column,
# price on the x-axis and the feature on the y-axis.
for col in df_num:
    print(col,'Vs price')
    # x/y must be keywords: seaborn releases with keyword-only x and y raise a
    # TypeError when two vectors are passed positionally.
    sns.scatterplot(x=df_num['price'], y=df_num[col])
    plt.show()

#### Inferences from the bivariate analysis of the numerical features:
* The number of bedrooms doesn't seem to have a linear relationship with the price feature
* The number of bathrooms is having almost linear relationship with the price, with increase in number of bathrooms, the price is increasing too
* The sqft_living feature is having almost perfect linear relationship with price
* The sqft_lot feature doesn't seem to have a relation with price
* The number of floors too is not contributing much to the price feature
* sqft_above is having a linear relationship with price
* sqft_basement is not having much effect on price feature
* yr_built has very little relation with the price feature
* yr_renovated with only two values in it, does not have an effect on price feature
* lat seems to have very little effect on price
* long doesn't seem to have much effect on price
* sqft_living15 has a slightly linear relationship with price
* sqft_lot15 is not having any relation with price

In [None]:
# categorical with target variable: price distribution per category level,
# as a box plot (left) and a violin plot (right).
plt.rcParams['figure.figsize']= 10,4
for col in df_cat:
    fig, ax = plt.subplots(1, 2)
    print(col,'Vs price')
    # x/y given by keyword: seaborn releases with keyword-only x and y reject
    # two positional vectors.
    sns.boxplot(x=df_cat[col], y=df['price'], ax=ax[0], palette='coolwarm')
    sns.violinplot(x=df_cat[col], y=df['price'], ax=ax[1], palette='coolwarm')
    plt.show()

#### Inferences from the bivariate analysis of categorical features:
* For both the values of waterfront the price follows same distribution, but the highest price is where the waterfront value is 0
* For views having values 2 or more is having very good price compared to 0 and 1
* Condition also is having direct relationship with price, houses with condition of 3 or 4 are having very good price and with values 1 and 2 are having very less price
* Grade is having exact direct relationship with price, when the grade is less price is less and when grade is high, price is also very high, for houses with grade 11 and above are having very high price
* zipcode is not distributed uniformly, but few zipcodes are having very high prices, so based on region price is also varying

### Hypothesis testing

#### Normality test for the target variable

In [None]:
import scipy.stats as st
# Shapiro-Wilk test; the null hypothesis is that price is normally distributed.
# NOTE(review): SciPy documents that the p-value may not be accurate for
# N > 5000 — confirm the sample size before relying on it.
st.shapiro(df['price'])

#### The target variable is skewed, so spearman correlation should be used between numerical features.

In [None]:
plt.rcParams['figure.figsize'] = 15,8
# Annotated heatmap of the Spearman (rank) correlation matrix of df.
# NOTE(review): whether the object-typed columns take part in corr() depends on
# the pandas version's numeric_only default — confirm against the environment.
sns.heatmap(df.corr(method='spearman'),annot=True,cmap='cubehelix')
plt.show()

* The features id, sqft_lot, sqft_lot15 and long have very little correlation with the target variable.
* Similarly yr_renovated also does not have any relation with price as observed from the bivariate analysis too, since it has only 2 values. These 5 variables should be dropped.

In [None]:
# Remove the five columns judged uninformative for price.
weak_features = ['id', 'sqft_lot', 'sqft_lot15', 'yr_renovated', 'long']
df = df.drop(columns=weak_features)

#### For categorical features with a numerical feature which is skewed, we should use kruskal test

In [None]:
# Kruskal-Wallis H-test per categorical feature: does the price distribution
# differ between the levels of that feature?
for col in df_cat:
    print(col,'Vs price')
    # Build one price sample per category level. Passing price and the raw
    # category codes as the two samples would compare unrelated quantities
    # rather than testing the feature's effect on price.
    price_by_level = [
        prices.to_numpy()
        for level, prices in df['price'].groupby(df_cat[col])
    ]
    print(st.kruskal(*price_by_level))
    print('\n')

##### So, all the categorical features seem to have an effect on Price column.

### Transformation and Encoding

In [None]:
# Separate the target (price) from the predictor columns.
y = df['price']
X = df.drop(columns=['price'])

In [None]:
from sklearn.preprocessing import PowerTransformer
# Apply a power transform to each numeric predictor, one column at a time.
# NOTE(review): the transformer is fitted on all rows, before the train/test
# split performed further down — confirm this is intended.
pt = PowerTransformer()
numeric_features = X.select_dtypes(np.number).columns
for feature in numeric_features:
    X[feature] = pt.fit_transform(X[[feature]])

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each object-typed column with integer codes.
le = LabelEncoder()
for i in X.select_dtypes('object'):
    # LabelEncoder expects a 1-D array-like; pass the Series rather than a
    # one-column DataFrame, which is only accepted via a conversion warning.
    X[i] = le.fit_transform(X[i])

### Model building

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
xtrain, xtest, ytrain, ytest = train_test_split(X,y,train_size=0.7,random_state=42)

In [None]:
# Shapes of the four pieces produced by the split.
xtrain.shape , xtest.shape , ytrain.shape , ytest.shape

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the predictors. The scaler learns its mean and standard deviation
# from the training rows only; the test rows are transformed with those same
# statistics so that both splits share one scale and nothing about the test
# set leaks into preprocessing.
sc = StandardScaler()
# Rebuild the frames instead of assigning column by column, which would write
# into the slices returned by train_test_split.
xtrain = pd.DataFrame(sc.fit_transform(xtrain), columns=xtrain.columns, index=xtrain.index)
xtest = pd.DataFrame(sc.transform(xtest), columns=xtest.columns, index=xtest.index)

In [None]:
from sklearn.metrics import adjusted_rand_score, r2_score, mean_squared_error, mean_absolute_error

In [None]:
import statsmodels.api as sm
# Add an intercept column; note that this rebinds X for any later cell.
X = sm.add_constant(X)
# Ordinary least squares on the full (unsplit) X and y; the fitted model is
# only used here to display its summary table.
model = sm.OLS(y, X).fit()
model.summary()

In [None]:
# Remove sqft_above and sqft_basement from both splits.
dropped_cols = ['sqft_above', 'sqft_basement']
xtrain = xtrain.drop(columns=dropped_cols)
xtest = xtest.drop(columns=dropped_cols)

In [None]:
# Linear regression: fit on the training split, report R^2 on both splits
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(xtrain, ytrain)
ypred = lr.predict(xtest)
lr_train_r2 = lr.score(xtrain, ytrain)
lr_test_r2 = lr.score(xtest, ytest)
print('Training r2_Score', lr_train_r2)
print('Testing r2_Score ', lr_test_r2)

In [None]:
# Decision Tree Regressor: depth, split size and leaf count are capped to limit
# overfitting; random_state makes the fit reproducible.
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(max_depth=10,min_samples_split=15,max_leaf_nodes=50, random_state=42)
dt = dt.fit(xtrain,ytrain)
# Predict with the tree itself (this previously called the linear model `lr`).
ypred_dt = dt.predict(xtest)
print('Training r2_score', dt.score(xtrain,ytrain))
print('Testing r2_score', dt.score(xtest,ytest))

In [None]:
# Random Forest Regressor: 150 trees of depth at most 7.
from sklearn.ensemble import RandomForestRegressor
# Seeded like the other models in this notebook so the reported scores are
# reproducible from one run to the next.
rf = RandomForestRegressor(n_estimators=150,max_depth=7,random_state=42)
rf = rf.fit(xtrain,ytrain)
ypred_rf = rf.predict(xtest)
print('Training r2_score',rf.score(xtrain,ytrain))
print('Testing r2_score',rf.score(xtest,ytest))

In [None]:
# AdaBoost Regressor: 10 boosting rounds, seeded for reproducibility
from sklearn.ensemble import AdaBoostRegressor
adb = AdaBoostRegressor(n_estimators=10, random_state=42)
adb.fit(xtrain, ytrain)
ypred_adb = adb.predict(xtest)
adb_train_r2 = adb.score(xtrain, ytrain)
adb_test_r2 = adb.score(xtest, ytest)
print('Training r2_score', adb_train_r2)
print('Testing r2_score', adb_test_r2)

In [None]:
# Gradient Boost Regressor: library defaults, seeded for reproducibility
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor(random_state=42)
gb.fit(xtrain, ytrain)
ypred_gb = gb.predict(xtest)
gb_train_r2 = gb.score(xtrain, ytrain)
gb_test_r2 = gb.score(xtest, ytest)
print('Training r2_score', gb_train_r2)
print('Testing r2_score', gb_test_r2)

In [None]:
# XGBoost Regressor: 8 boosting rounds, seeded for reproducibility
import xgboost
from xgboost import XGBRegressor
xgb = XGBRegressor(n_estimators=8, random_state=42)
xgb.fit(xtrain, ytrain)
ypred_xgb = xgb.predict(xtest)
xgb_train_r2 = xgb.score(xtrain, ytrain)
xgb_test_r2 = xgb.score(xtest, ytest)
print('Training r2_score', xgb_train_r2)
print('Testing r2_score', xgb_test_r2)

#### Gradient Boost Regressor gives the highest R-squared value, so it is the best model.

In [None]:
# Pair each training column with its importance in the gradient boosting model
# and list them from most to least important.
important_features = (
    pd.DataFrame({'Features': xtrain.columns,
                  'Importance': gb.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# display the dataframe
important_features

In [None]:
plt.rcParams['figure.figsize'] = 8,5
# Horizontal bars: one per feature, bar length is its importance.
importance_ax = sns.barplot(data=important_features, x='Importance', y='Features')

# title and axis labels, all at font size 15
importance_ax.set_title('Feature Importance', fontsize=15)
importance_ax.set_xlabel('Importance', fontsize=15)
importance_ax.set_ylabel('Features', fontsize=15)
plt.show()

#### Conclusion (Things to be considered mainly for knowing the house price):
* sqft_living influences price the most, so the price of a house is largely determined by the square footage of its living space
* It is followed by grade, as we saw from bivariate analysis, it is having a direct and strong relationship with price
* Lat is also influencing price, so it depends upon the area to determine the price
* yr_built is having good effect on price, even though it is not having a linear relationship with price, it is influencing price too
* zipcode, same as latitude is having some influence
* waterfront and view are also having effect on price, which is observed from bivariate too
* The number of bathrooms is having little effect on price as we saw from bivariate, but number of bedrooms is not having that much effect.