In [None]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
import statsmodels.api as sm  
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
import os
# Print the names of the files in the dataset input directory.
print(os.listdir("../input/housesalesprediction"))

# Data Loading, Processing and Visualization

In [None]:
# Load kc_house_data.csv into the DataFrame `df`.
df = pd.read_csv("..//input//housesalesprediction//kc_house_data.csv")

In [None]:
# Show column names, non-null counts and dtypes of the data.
df.info()

In [None]:
# Show summary statistics for the data.
df.describe()

In [None]:
# Preview the first rows of the data.
df.head()

In [None]:
# Preview the last rows of the data.
df.tail()

In [None]:
# Check for missing values: prints, per column, whether any entry is null.
print(df.isna().any())

In [None]:
# The data is pretty clean. Drop the two columns that are NOT going to be used
# for prediction: id and date.
df = df.drop(columns=['id', 'date'])

In [None]:
# Let's visualize the data 
#sns.pairplot(df)

In [None]:
# Pairwise plots for price and a handful of the numeric columns.
sns.pairplot(
    df[['sqft_lot', 'sqft_above', 'price', 'sqft_living', 'bedrooms']],
    palette='afmhot',
    height=1.6,
)

In [None]:
# Rescaling the features
# Mean normalisation: centre on the mean and scale by the value range.
def normalize(x):
    """Return ``x`` centred on its mean and divided by its range (max - min).

    Parameters
    ----------
    x : pandas.Series
        Numeric column to rescale (this is what ``DataFrame.apply`` passes in).

    Returns
    -------
    pandas.Series
        ``(x - mean) / (max - min)``. Missing values are skipped when the
        mean, max and min are computed and stay missing in the result.
        A constant column has zero range; it is returned as all zeros
        instead of the NaNs that a 0/0 division would produce.
    """
    centred = x - x.mean()
    # Use the vectorised Series reductions rather than the Python builtins:
    # builtin max()/min() iterate element by element and give NaN whenever
    # the first value is NaN, which would wipe out the whole column.
    value_range = x.max() - x.min()
    if value_range == 0:
        return centred
    return centred / value_range
                                            
                                              
# Apply normalize() to every column of df and replace df with the rescaled frame.
# NOTE(review): this also rescales the 'price' column, so the target y and the
# RMSE reported later are in normalised units, not the original price units.
# NOTE(review): the mean and range are computed on the full dataset, i.e. before
# the train/test split below.
df = df.apply(normalize)

# Splitting Data into Train and Test Sets

In [None]:
# List the remaining column names.
df.columns

In [None]:
# Feature matrix X: the predictor columns.
X = df.loc[:, [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
    'sqft_living15', 'sqft_lot15',
]]
# Response variable y: the price column.
y = df.loc[:, 'price']

In [None]:
# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

# Running RFE

In [None]:
# Run recursive feature elimination (RFE) with a linear regression estimator,
# keeping 9 features.
lm = LinearRegression()
# n_features_to_select must be passed by keyword: current scikit-learn releases
# no longer accept it positionally, and the keyword form also works on old ones.
rfe = RFE(lm, n_features_to_select=9)
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)   # boolean mask over X_train's columns: True = selected
print(rfe.ranking_)   # feature ranking; selected features have rank 1

In [None]:
# Names of the columns that RFE selected (rfe.support_ is used as a boolean mask).
col = X_train.columns[rfe.support_]

In [None]:
# Display the RFE-selected column names.
col

# Building Model

In [None]:
# Build the training frame restricted to the RFE-selected variables.
X_train_rfe = X_train.loc[:, col]

In [None]:
# Display the columns of the reduced training frame.
X_train_rfe.columns

In [None]:
# Add a constant column to the training features; sm.OLS does not add an
# intercept on its own.
X_train_rfe = sm.add_constant(X_train_rfe)

In [None]:
# Fit the first OLS model on the RFE-selected features plus the constant.
# Note: this rebinds `lm`, which previously held the scikit-learn estimator.
lm = sm.OLS(endog=y_train, exog=X_train_rfe).fit()

In [None]:
# Print the summary of the first linear model.
print(lm.summary())

In [None]:
# Empty frame that the following cells fill with variance inflation factors (VIF).
vif=pd.DataFrame()

In [None]:
# One VIF per column of the training design matrix (the constant included).
vif['VIF'] = [
    variance_inflation_factor(X_train_rfe.values, column_idx)
    for column_idx in range(len(X_train_rfe.columns))
]

In [None]:
 # Label each VIF row with the matching design-matrix column name.
 # NOTE(review): this cell starts with a stray leading space; IPython appears to
 # tolerate it, but it would be an IndentationError in a plain .py export.
 vif["variables"] = X_train_rfe.columns

In [None]:
# Display the VIF table for the first model.
vif

# Dropping the Variable and Updating the Model

In [None]:
# Drop sqft_living from the design matrix (dropping highly correlated /
# insignificant variables).
# axis is passed by keyword: pandas 2.0 removed the positional form
# drop(label, 1), and the keyword form matches the other drop calls in this notebook.
X_train_new = X_train_rfe.drop('sqft_living', axis=1)

In [None]:
# Fit the second OLS model on the reduced design matrix.
lm_2 = sm.OLS(endog=y_train, exog=X_train_new).fit()

In [None]:
# Print the summary of the second linear model.
print(lm_2.summary())

In [None]:
# VIF table for the second model's design matrix, built in one step.
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(X_train_new.values, column_idx)
        for column_idx in range(len(X_train_new.columns))
    ],
    "variables": X_train_new.columns,
})
vif

In [None]:
# All VIF values are now under 5 and the p-values of the remaining variables are 0.000.
# R-squared is approximately 69%, and there is only a very small difference between R-squared and adjusted R-squared.
# We can say these are the important features, i.e. the ones most useful for predicting the house price.

# Prediction using Model 2

In [None]:
# Add the constant column to the test features, as was done for the training features.
X_test_m2 = sm.add_constant(X_test)

In [None]:
# Restrict X_test_m2 to exactly the columns, in the same order, that the second
# model was fitted on. Selecting by X_train_new.columns replaces a hand-written
# drop list: that list always leaves the same number of columns, so if RFE ever
# selected different features the shapes would still match and the predictions
# would be silently wrong.
X_test_m2 = X_test_m2[X_train_new.columns]

In [None]:
# Predict the (normalised) target for the test set with the second model.
y_pred_m2 = lm_2.predict(X_test_m2)

# Model Evaluation

In [None]:
# Measured vs. predicted target on the test set. The dashed diagonal spans the
# full range of y and marks where predicted equals measured.
fig, ax = plt.subplots()
ax.scatter(x=y_test, y=y_pred_m2)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', linewidth=4)
ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

In [None]:
# Plot the distribution of the error terms (test-set residuals: actual - predicted).
fig = plt.figure()
residuals = y_test - y_pred_m2
# sns.distplot is deprecated, so prefer histplot where the installed seaborn has
# it and fall back to distplot on older releases. stat='density' with a KDE
# overlay matches what distplot draws.
if hasattr(sns, 'histplot'):
    sns.histplot(residuals, bins=50, kde=True, stat='density')
else:
    sns.distplot(residuals, bins=50)
fig.suptitle('Error Terms', fontsize=20)   # Plot heading
plt.xlabel('Residual', fontsize=18)        # X-label
# The y-axis shows the density of the residuals, not an index.
plt.ylabel('Density', fontsize=16)
plt.show()

In [None]:
from sklearn import metrics

# Root-mean-squared error of the second model's predictions on the test set.
print(
    'RMSE :',
    np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_pred_m2)),
)

In [None]:
#Lower values of RMSE indicate better fit