In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
from sklearn import metrics
import scipy.stats as stats
import pylab
# Notebook-wide matplotlib defaults: 10 x 7.5 figures with grid lines on.
plt.rcParams.update({
    'figure.figsize': (10, 7.5),
    'axes.grid': True,
})

In [None]:
# Read the house-sales CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="../input/housesalesprediction/kc_house_data.csv")

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Parse the sale date into a proper datetime column.
#
# The previous implementation used `str.strip('T000000')`, but `strip` treats
# its argument as a *set of characters* ('T' and '0') to remove from both
# ends, not as a literal suffix. Any day ending in 0 (the 10th, 20th, 30th)
# therefore lost its final digit, e.g. '20141010T000000' -> '2014101', and was
# then parsed as the wrong day. Parsing the full timestamp with an explicit
# format avoids any string trimming.
# Assumes raw values look like 'YYYYMMDDT000000', as implied by the original
# strip argument -- confirm against the CSV.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%dT%H%M%S')

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Summary statistics, transposed so each column of df becomes a row.
df.describe().transpose()

In [None]:
# Distribution of the raw sale price.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot(..., kde=True)` once the target seaborn version is confirmed.
sns.distplot(a=df['price'])

In [None]:
# Add the natural log of price as a column; it is the response used by the
# regression fitted later in the notebook.
df['log_price'] = np.log(df['price'])

In [None]:
# Distribution of the log-transformed price.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot(..., kde=True)` once the target seaborn version is confirmed.
sns.distplot(a=df['log_price'])

In [None]:
# Pairwise correlations, rendered as a colour-graded table.
# Only numeric columns are passed to `corr()`: the frame still contains the
# non-numeric `date` column at this point, and recent pandas versions no
# longer drop such columns silently (they raise instead). `select_dtypes`
# behaves the same on old and new pandas.
corr = df.select_dtypes(include='number').corr()
corr.style.background_gradient()

In [None]:
# Annotated heatmap of the correlation matrix.
# The correlation is computed on numeric columns only, because the frame
# still holds the non-numeric `date` column here and recent pandas versions
# raise on it instead of silently dropping it.
fig, ax = plt.subplots(figsize=(17, 14))
sns.heatmap(
    df.select_dtypes(include='number').corr(),
    annot=True,
    linewidths=0.5,
    fmt="1.1f",
    ax=ax,
)
ax.set_title("Data Correlation", fontsize=50)
plt.show()

In [None]:
# Remove the columns excluded from modelling (per the original note: chosen
# for their low correlation).
df = df.drop(
    columns=['id', 'condition', 'yr_built', 'yr_renovated', 'zipcode', 'long', 'date']
)

In [None]:
# Preview the frame after the column drop.
df.head(n=5)

In [None]:
# Every remaining column except the two price columns.
excluded = ['price', 'log_price']
feature_columns = df.columns.difference(excluded)
feature_columns

In [None]:
# 70/30 train/test split with a fixed seed for reproducibility.
# Each part is copied explicitly because later cells add columns to `train`
# and `test`; on independent copies those assignments cannot trigger pandas'
# SettingWithCopyWarning and cannot touch `df`.
train, test = (
    part.copy()
    for part in train_test_split(df, test_size=0.3, random_state=12345)
)

In [None]:
# Report the (rows, columns) shape of each split.
for label, frame in (('train data :: ', train), ('test data :: ', test)):
    print(label, frame.shape)

In [None]:
# Fit an ordinary-least-squares model of log_price on the listed predictors,
# using the training split only, then show the regression summary.
predictors = [
    'bathrooms', 'bedrooms', 'floors', 'grade', 'lat',
    'sqft_above', 'sqft_basement', 'sqft_living', 'sqft_living15',
    'sqft_lot', 'sqft_lot15', 'view', 'waterfront',
]
formula = 'log_price ~ ' + ' + '.join(predictors)
lm = smf.ols(formula, data=train).fit()
lm.summary()

In [None]:
# Predict on the training split. The model predicts log_price, so the
# prediction is exponentiated back to the price scale before the error
# (actual minus predicted) is computed.
train_pred = np.exp(lm.predict(train))
train['pred_price'] = train_pred
train['error'] = train['price'] - train_pred
train.head()

In [None]:
# Predict on the held-out split. The model predicts log_price, so the
# prediction is exponentiated back to the price scale before the error
# (actual minus predicted) is computed.
test_pred = np.exp(lm.predict(test))
test['pred_price'] = test_pred
test['error'] = test['price'] - test_pred
test.head()

In [None]:
# Accuracy metrics: mean absolute percentage error (in %) on each split.
MAPE_train = (train['error'].abs() / train['price']).mean() * 100
MAPE_test = (test['error'].abs() / test['price']).mean() * 100
print(MAPE_train)
print(MAPE_test)

In [None]:
# Histogram of the fitted model's residuals in 10 bins.
residuals = lm.resid
residuals.hist(bins=10)

In [None]:
# Mean of the fitted model's residuals.
np.mean(lm.resid)

In [None]:
# Distribution of the fitted model's residuals.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot(..., kde=True)` once the target seaborn version is confirmed.
sns.distplot(a=lm.resid)

In [None]:
# Distribution of the price-scale prediction error on the held-out split.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot(..., kde=True)` once the target seaborn version is confirmed.
sns.distplot(a=test['error'])

In [None]:
# Joint plot of training error against actual price.
# `x` and `y` are passed by keyword: recent seaborn versions reject them as
# positional arguments, while the keyword form is accepted by old and new
# releases alike.
sns.jointplot(x=train['price'], y=train['error'])

In [None]:
# Normal Q-Q (probability) plot of the training errors.
stats.probplot(train['error'], dist='norm', plot=plt)
plt.show()