In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Attribute names for the CSV, which is read without a header row:
cols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV','BIAS_COL']
# Loading the data and attaching the attribute names to its columns:
house = pd.read_csv("../input/hou_all.csv", header=None, sep=',')
house.columns = list(cols)
# Pair-wise scatter-plot of all the attributes (rows with missing values are dropped first):
sns.set(style='whitegrid', context='notebook')
sns.pairplot(house[cols].dropna())
plt.show()
print("I had to make it smaller in here!")

In [None]:
# Heatmap of all pair-wise correlation coefficients (every attribute except BIAS_COL):
cols2 = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# rowvar=False makes each column of the data one variable, so no transpose is needed:
corr_matrix = np.corrcoef(house[cols2].values, rowvar=False)
sns.set(font_scale=0.8)
heatmap = sns.heatmap(
	corr_matrix,
	annot=True, fmt='.2f', annot_kws={'size':15},
	cbar=True, square=True,
	xticklabels=cols2, yticklabels=cols2)
# The figure that owns the heatmap axes is resized, saved to disk, then shown:
fig = heatmap.get_figure()
fig.set_size_inches(12, 12)
fig.savefig('test2png.png', dpi=100)
fig.set_size_inches(12, 12, forward=True)
plt.show()

In [None]:
'''
From the pair-wise scatter-plots we see that there's a linear relationship between RM and MEDV.
From the heat-map we see that RM and MEDV are fairly correlated as well.
So we choose RM as the single predictor of MEDV for the regression below.
'''
# Design matrix: the RM feature plus BIAS_COL
# (presumably a constant column that provides the intercept term — TODO confirm against the CSV):
cols_x = ['RM', 'BIAS_COL']
X = np.array(house[cols_x], dtype='float')
# Regression target:
Y = np.array(house['MEDV'], dtype='float')
# Number of samples, taken from the data rather than hard-coded as 506 so that
# index sampling later on can never reach past the end of a differently sized file:
N = X.shape[0]
# Make a function to fit:
def fit(X,Y):
	"""Return the least-squares weights w that solve (X^T X) w = X^T Y.

	X: design matrix with one row per sample.
	Y: target values, one per row of X.
	The system is handed to np.linalg.solve, so X^T X has to be invertible.
	"""
	gram = np.dot(X.T, X)
	moment = np.dot(X.T, Y)
	return np.linalg.solve(gram, moment)
# Make a function to get the R^2:
def get_r2(actual,hat):
	"""Return the coefficient of determination R^2 = 1 - SS_res / SS_tot.

	actual: observed values (array supporting .mean() and .dot()).
	hat: predicted values, same length as actual.
	"""
	residuals = actual - hat
	centered = actual - actual.mean()
	# Sum of squared residuals and total sum of squares around the mean:
	ss_res = residuals.dot(residuals)
	ss_tot = centered.dot(centered)
	return 1 - ss_res/ss_tot
# Making the training set: draw 350 distinct row indexes.
# replace=False is required because choice() samples with replacement by default,
# which would put duplicate rows into the training set. The locally seeded
# generator keeps the split (and the R^2 values computed from it) reproducible
# without touching numpy's global random state:
train_indexes = np.random.RandomState(42).choice(N, 350, replace=False)
Xtrain = X[train_indexes]
Ytrain = Y[train_indexes]
# Making the test set from every row index that was not drawn for training:
test_indexes = np.setdiff1d(np.arange(N), train_indexes)
Xtest = X[test_indexes]
Ytest = Y[test_indexes]
# Fitting the weights on the training set only:
w = fit(Xtrain,Ytrain)
Ytrain_hat = Xtrain.dot(w)
train_r2 = get_r2(Ytrain,Ytrain_hat)
# Evaluating the already-fitted weights on the held-out test set (no refitting):
Ytest_hat = Xtest.dot(w)
test_r2 = get_r2(Ytest,Ytest_hat)

In [None]:
# Residual plot (prediction minus target) for the training and test sets:
residual_ax = plt.gca()
residual_ax.scatter(Ytrain_hat, Ytrain_hat-Ytrain, label='Training data')
residual_ax.scatter(Ytest_hat, Ytest_hat-Ytest, color='lightgreen', label='Test data')
residual_ax.set_xlabel('Predicted values')
residual_ax.set_ylabel('Residuals')
residual_ax.legend(loc='upper left')
# Zero-residual reference line:
residual_ax.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
plt.show()
# Reporting the accuracy (R^2) on both sets:
print("R sqaured of the training set is: ",train_r2)
print("R sqaured of the test set is: ",test_r2)

In [None]:
# Doing an ordinary (closed-form) simple linear regression of MEDV on RM:
X_O = np.array(house['RM'], dtype='float')
denominator = X_O.dot(X_O) - X_O.mean() * X_O.sum()
# Slope a and intercept b of the least-squares line Yhat = a*X_O + b:
a = ( X_O.dot(Y) - Y.mean()*X_O.sum() ) / denominator
b = ( Y.mean() * X_O.dot(X_O) - X_O.mean() * X_O.dot(Y) ) / denominator
Yhat = a*X_O + b
# R^2 must compare the targets with the model's predictions:
r2 = get_r2(Y,Yhat)
# Data points with the fitted line drawn over them:
plt.scatter(X_O, Y)
plt.plot(X_O, Yhat,color='r')
plt.xlabel('RM')
plt.ylabel('MEDV')
plt.show()
# Estimating the coefficients of the regression model:
print("Slope: ",a)
print("Intercept: ",b)
print("R squared: ",r2)

In [None]:
# Now using all the variables to improve the prediction accuracy:
cols_x_all = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT','BIAS_COL']
X_all = np.array(house[cols_x_all], dtype='float')
Y = np.array(house['MEDV'], dtype='float')
# Number of samples, taken from the data instead of being hard-coded:
N = X_all.shape[0]
# Making the training set: draw 350 distinct row indexes.
# replace=False is required because choice() samples with replacement by default,
# which would put duplicate rows into the training set. The locally seeded
# generator keeps the split (and the R^2 values computed from it) reproducible
# without touching numpy's global random state:
train_indexes = np.random.RandomState(42).choice(N, 350, replace=False)
Xtrain = X_all[train_indexes]
Ytrain = Y[train_indexes]
# Making the test set from every row index that was not drawn for training:
test_indexes = np.setdiff1d(np.arange(N), train_indexes)
Xtest = X_all[test_indexes]
Ytest = Y[test_indexes]
# Fitting the weights on the training set only:
w = fit(Xtrain,Ytrain)
Ytrain_hat = Xtrain.dot(w)
train_r2 = get_r2(Ytrain,Ytrain_hat)
# Evaluating the already-fitted weights on the held-out test set (no refitting):
Ytest_hat = Xtest.dot(w)
test_r2 = get_r2(Ytest,Ytest_hat)
# Displaying the training and test sets with their accuracies:
plt.scatter(Ytrain_hat,Ytrain_hat-Ytrain,label='Training data')
plt.scatter(Ytest_hat,Ytest_hat-Ytest,color='lightgreen',label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
plt.show()
print("R sqaured of the training set is: ",train_r2)
print("R sqaured of the test set is: ",test_r2)