**Predicting housing prices with linear regression and visualizing the data.**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the housing CSV into the working DataFrame used by every later cell.
DATA_PATH = "../input/kc_house_data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Parse the `date` column into datetimes.
df["date"] = pd.to_datetime(df["date"])

# `yr_built` is assumed to hold a four-digit year (as its name suggests).
# Without an explicit format pandas interprets plain integers as nanoseconds
# since 1970-01-01, which would make every house look as if it was built in
# 1970 and would corrupt the `age_house` feature derived from it later.
df["yr_built"] = pd.to_datetime(df["yr_built"], format="%Y")

# `yr_renovated` is deliberately left numeric: further down it is compared
# against 0 and collapsed into a 0/1 flag, so a datetime round-trip adds
# nothing (and 0 is not a parseable year).

In [None]:
# Re-inspect dtypes to confirm the conversions from the previous cell.
df.info()

**So far we have inspected the column datatypes and converted the date-like columns to datetime.**

**Now let us explore further: let's check whether there is any missing data in our DataFrame.**

In [None]:
# Discard fully duplicated rows (keeping the first occurrence), then count
# the missing values in each column.
df = df.drop_duplicates(keep="first")
df.isna().sum()

**As there are no missing values, let's move on to the next step, i.e. checking the correlations. Before that, let's look at our target variable, price.**

In [None]:
# Skewness of the raw target variable.
df["price"].skew()

**This is a lot of skewness, and it may distort our results, so let us fix it.**

In [None]:
# Histogram of the raw target variable.
df["price"].plot.hist()

In [None]:
# Skewness of the target after a natural-log transform (not yet stored).
np.log(df["price"]).skew()

In [None]:
# Histogram of the log-transformed target (not yet stored).
np.log(df["price"]).plot.hist()

In [None]:
# Replace price with its natural log. NOTE: re-running this cell applies the
# log a second time.
df["price"] = np.log(df["price"])

**Now we have converted the price target variable to log form, where the data is much closer to normally distributed, with far less skewness.**

**Let us check the correlations to see which features contribute most to, or could bias, our result.**

In [None]:
# Pairwise Pearson correlations between the numeric columns only. The frame
# also holds datetime columns at this point, and pandas >= 2.0 raises on
# non-numeric columns unless numeric_only=True is passed.
corr = df.corr(numeric_only=True)

In [None]:
# Boolean mask that hides the upper triangle (diagonal included) so each
# pair of variables is drawn only once. The builtin `bool` is used because
# the `np.bool` alias was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

***Next, we will drop one feature from every pair whose absolute correlation is greater than 0.95.***

In [None]:
# Create the absolute correlation matrix over numeric columns only
# (pandas >= 2.0 raises on the datetime columns otherwise).
corr_matrix = df.corr(numeric_only=True).abs()

# Select the strict upper triangle (k=1 excludes the diagonal) so each pair is
# examined once. `bool` replaces the `np.bool` alias removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Names of the columns that correlate above 0.95 with some earlier column.
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

In [None]:
# `to_drop` already holds column *names*; indexing `df.columns` with names
# raises IndexError as soon as the list is non-empty, so drop by name directly.
df = df.drop(columns=to_drop)

**Let us see the 5 variables most positively correlated with price and the 5 at the bottom of the ranking (weakest or negative correlation).**

In [None]:
# Rank every variable by its correlation with price (highest first) and show
# both ends of the ranking.
price_corr_ranked = corr['price'].sort_values(ascending=False)
print(price_corr_ranked.head(5), '\n')
print(price_corr_ranked.tail(5))

Let us start with **grade**.

In [None]:
# Distinct grade values present in the data.
df["grade"].unique()

In [None]:
# Median price for each grade. The aggregation is named by string because
# recent pandas versions emit a FutureWarning when a NumPy callable such as
# np.median is passed as `aggfunc`; the result is the same.
grade_pivot = df.pivot_table(index='grade',
                             values='price', aggfunc='median')

In [None]:
# Display the per-grade median price table.
grade_pivot

In [None]:
# Bar chart of the median price for each grade.
grade_pivot.plot(kind='bar', color='blue')
plt.xlabel('Grade')
plt.ylabel('Median Sale Price')
plt.xticks(rotation=0)

# Overlay a least-squares trend line through the bars. Bars are drawn at the
# integer positions 0..n-1, so the fit uses those positions and the plotted
# medians. The previous version fitted sqft_living against price, which
# belongs to a different chart and a completely different x range.
x = np.arange(len(grade_pivot))
y = grade_pivot['price'].to_numpy()

z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(x, p(x), "r--")

plt.show()

**We can clearly see that as the grade increases, the median sale price also increases: not drastically, but steadily, with a decent slope.**

In [None]:
# Distribution of the grade column.
df["grade"].plot.hist()
plt.show()

In [None]:
# Skewness of grade.
df["grade"].skew()

**Now let's target sqft_living.**

In [None]:
# Skewness of sqft_living before any transform.
df["sqft_living"].skew()

In [None]:
# Store the natural log of sqft_living in place and plot its histogram.
# NOTE: re-running this cell applies the log a second time.
df["sqft_living"] = np.log(df["sqft_living"])
df["sqft_living"].plot.hist(color='blue')

In [None]:
# Skewness of sqft_living after the log transform.
df["sqft_living"].skew()

In [None]:
# Scatter of (log) sqft_living against (log) price with a fitted straight line.
x = df["sqft_living"]
y = df["price"]

plt.scatter(x=x, y=y)
plt.xlabel('Sqft_Living')
plt.ylabel('Sale_Price')
plt.xticks(rotation=0)

# Degree-1 least-squares fit, evaluated at the observed x values.
line_coeffs = np.polyfit(x, y, 1)
plt.plot(x, np.polyval(line_coeffs, x), "r--")

plt.show()

**A clear upward trend, which is what we would expect.**

In [None]:
# Skewness of sqft_living15 before any transform.
df["sqft_living15"].skew()

In [None]:
# Skewness sqft_living15 would have after a natural-log transform.
np.log(df["sqft_living15"]).skew()

In [None]:
# Store the natural log of sqft_living15 in place and plot its histogram.
# NOTE: re-running this cell applies the log a second time.
df["sqft_living15"] = np.log(df["sqft_living15"])

df["sqft_living15"].plot.hist()

In [None]:
# Scatter of (log) sqft_living15 against (log) price with a fitted straight line.
x = df["sqft_living15"]
y = df["price"]

plt.scatter(x=x, y=y)
plt.xlabel('Sqft_Living15')
plt.ylabel('Sale_Price')
plt.xticks(rotation=0)

# Degree-1 least-squares fit, evaluated at the observed x values.
line_coeffs = np.polyfit(x, y, 1)
plt.plot(x, np.polyval(line_coeffs, x), "r--")

plt.show()

**Once again we can see a definite relationship between sale price and sqft_living15.**

**Next on the list is sqft_above.**

In [None]:
# Skewness of sqft_above before any transform.
df["sqft_above"].skew()

In [None]:
# Skewness sqft_above would have after a natural-log transform.
np.log(df["sqft_above"]).skew()

In [None]:
# Store the natural log of sqft_above in place.
# NOTE: re-running this cell applies the log a second time.
df["sqft_above"] = np.log(df["sqft_above"])

In [None]:
# Histogram of the log-transformed sqft_above.
df["sqft_above"].plot.hist()

In [None]:
# Scatter of (log) sqft_above against (log) price with a fitted straight line.
x = df["sqft_above"]
y = df["price"]

plt.scatter(x=x, y=y)
plt.xlabel('Sqft Above')
plt.ylabel('Sale_Price')
plt.xticks(rotation=0)

# Degree-1 least-squares fit, evaluated at the observed x values.
line_coeffs = np.polyfit(x, y, 1)
plt.plot(x, np.polyval(line_coeffs, x), "r--")

plt.show()

**This looks pretty decent**

**Let us check for correlations again**

In [None]:
# Recompute the correlation matrix so it reflects the log-transformed
# columns; the existing `corr` was calculated before those transforms, so
# printing it again would only repeat the earlier numbers.
# numeric_only=True skips the datetime columns still present in the frame.
corr = df.corr(numeric_only=True)
price_corr_ranked = corr['price'].sort_values(ascending=False)
print(price_corr_ranked[:7], '\n')
print(price_corr_ranked[-5:])

We have pretty much covered most of the things

In [None]:
# age_house: time elapsed between the `date` and `yr_built` columns,
# expressed in years (whole days divided by 365).
elapsed = df["date"] - df["yr_built"]
df["age_house"] = pd.to_numeric(elapsed.dt.days / 365)
df["age_house"].head()

In [None]:
# The two datetime columns are no longer needed once age_house exists.
df = df.drop(columns=['date', 'yr_built'])

In [None]:
# Keep only the rows with more than one bedroom.
df = df.loc[df["bedrooms"] > 1]

In [None]:
# Make sure yr_renovated is numeric before comparing it with 0 below.
df["yr_renovated"] = pd.to_numeric(df["yr_renovated"])

In [None]:
# True wherever yr_renovated holds a non-zero value.
c = df["yr_renovated"].ne(0)

In [None]:
# Turn the boolean mask into a 0/1 indicator and store it over yr_renovated.
c = c.map({True: 1, False: 0})
df["yr_renovated"] = c

In [None]:
# Final check of the columns and dtypes that will feed the model.
df.info()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with default settings; reused for both
# cross-validation and the train/test fit below.
reg = LinearRegression()

In [None]:
# Target is price; features are every remaining column except the `id` column.
y = df["price"]
X = df.drop(columns=["price", "id"])

In [None]:
# 5-fold cross-validation scores using the estimator's default scorer.
cv = cross_val_score(estimator=reg, X=X, y=y, cv=5)

In [None]:
# Average score across the folds.
cv.mean()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Fit the regressor on the training split only.
reg.fit(X_train,y_train)

In [None]:
# `score` on a regressor returns the coefficient of determination (R^2), not
# a classification accuracy, so it is labelled as such. The held-out score is
# printed too, because the training score alone says nothing about how well
# the model generalises.
result = reg.score(X_train, y_train)
test_r2 = reg.score(X_test, y_test)
print("R^2 (train): %.3f%%" % (result*100.0))
print("R^2 (test):  %.3f%%" % (test_r2*100.0))

In [None]:
import statsmodels.api as sm

# Fit OLS on the training split with an added constant column, then report
# R^2 and adjusted R^2 as percentages.
X1 = sm.add_constant(X_train)
result = sm.OLS(y_train, X1).fit()
r2_pct = result.rsquared * 100
adj_r2_pct = result.rsquared_adj * 100
print('The R2 and Adjusted R2 are : {0} %, {1} % ;respectively'.format(r2_pct, adj_r2_pct))

In [None]:
# Predictions for the held-out rows (same log scale as the price target).
y_pred = reg.predict(X_test)

In [None]:
from sklearn import metrics

# Mean absolute error between held-out targets and predictions.
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error is: ", mae)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Root of the mean squared error on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
rms = sqrt(mse)
print("Root Mean Square Error is: ", rms)

In [None]:
# Mean absolute percentage error: the average of |actual - predicted| / actual
# taken row by row, times 100. The previous expression divided the single
# scalar MAE by every actual value, which is not the MAPE.
# NOTE: y_test and y_pred are log prices here, so this is a percentage error
# on the log scale, not on the original price.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

In [None]:
# Report the percentage error computed in the previous cell.
print('So the Mean Absolute Percentage Error is: {0} %'.format(mape))

In [None]:
# Element-wise MAPE formula, kept for reference only (not executed):
#   np.mean(np.abs((y_test - y_pred) / y_test)) * 100

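> ***Our model is off by about 1.5%. Note that this error is measured on the log-transformed price, not on the original price scale.***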

In [None]:
# Actual vs. predicted values on the held-out rows. x and y are passed by
# keyword: seaborn >= 0.12 no longer accepts them positionally (the first
# positional argument is `data`).
actual_values = y_test
predictions = y_pred
sns.scatterplot(x=actual_values, y=predictions)

**For a deeper analysis we can go on with the following.**

In [None]:
# Predictions of the fitted model for the training and the held-out split.
y_train_pred, y_test_pred = (reg.predict(features) for features in (X_train, X_test))

**Let's plot the residuals**

In [None]:
# Plot residuals (computed here as predicted minus actual) against the
# predicted values for both splits.
plt.scatter(y_train_pred, y_train_pred - y_train, c = "blue", marker = "s", label = "Training data")
plt.scatter(y_test_pred, y_test_pred - y_test, c = "lightgreen", marker = "s", label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")

# Draw the zero line across the real range of predictions instead of
# hard-coded limits, so it stays correct if the target scale changes.
pred_lo = min(y_train_pred.min(), y_test_pred.min())
pred_hi = max(y_train_pred.max(), y_test_pred.max())
plt.hlines(y = 0, xmin = pred_lo, xmax = pred_hi, color = "red")
plt.show()

**Now let's plot the predictions**

In [None]:
# Plot predicted against real values for both splits.
plt.scatter(y_train_pred, y_train, c = "blue", marker = "s", label = "Training data")
plt.scatter(y_test_pred, y_test, c = "lightgreen", marker = "s", label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")

# Identity line (perfect prediction), spanning the observed range of both
# predictions and real values rather than hard-coded limits.
line_lo = min(y_train_pred.min(), y_test_pred.min(), y_train.min(), y_test.min())
line_hi = max(y_train_pred.max(), y_test_pred.max(), y_train.max(), y_test.max())
plt.plot([line_lo, line_hi], [line_lo, line_hi], c = "red")
plt.show()

**So finally we have achieved! Peace :)**