B-spline Regression

Importing Libraries


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from patsy import build_design_matrices
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


Importing dataset

In [None]:
# Read the housing CSV from the Kaggle input directory and report its (rows, columns) shape.
dataset = pd.read_csv(filepath_or_buffer="../input/boston-housing-dataset/HousingData.csv")
print(dataset.shape)

In [None]:
# Print the frame's text representation for a first look at the raw columns and values.
print(dataset)

In [None]:
# Per-column count of missing entries (isna is the pandas alias of isnull).
print(dataset.isna().sum())

In [None]:
# Rename the target column MEDV to PRICE, then preview the first rows.
dataset = dataset.rename(columns={"MEDV": "PRICE"})
print(dataset.head())

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
dataset.info()

Null Values Heatmap

In [None]:
# Heatmap of the missing-value mask of the raw data (row tick labels and colour bar hidden).
sns.heatmap(data=dataset.isna(), cbar=False, yticklabels=False)

In [None]:
# Features: the first 13 columns. Take an explicit copy so that later writes to X
# (the mean imputation) act on an independent frame rather than on a slice of
# `dataset`, which would raise pandas' SettingWithCopyWarning.
X = dataset.iloc[:, 0:13].copy()
# Target: the last column of the frame.
y = dataset.iloc[:, -1]

Replacing missing values with mean

In [None]:
from sklearn.impute import SimpleImputer

# Replace every NaN in X with the mean of its column.
# NOTE(review): the means are computed on all rows, i.e. before the train/test split
# further down; fit the imputer on the training rows only if an imputed column is
# ever used as a predictor.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Build a fresh DataFrame from the imputed array instead of assigning into
# X.iloc[...]: X may be a slice of `dataset`, and writing into a slice triggers
# SettingWithCopyWarning. Column labels and the row index are carried over; every
# column comes back as float.
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

All missing values have now been replaced with the mean of their column, as the heatmap below confirms.

In [None]:
# Heatmap of the missing-value mask of the feature frame after imputation.
sns.heatmap(data=X.isna(), cbar=False, yticklabels=False)

In [None]:
# Per-column count of missing entries remaining in the feature frame.
print(X.isna().sum())

Analysis of the data

In [None]:
# Histogram of the target (house price) using 30 bins.
sns.set(rc={"figure.figsize": (11.7, 8.27)})
plt.hist(x=y, bins=30)
plt.xlabel(xlabel="Price of house x($1000)")
plt.ylabel(ylabel="Number of Houses")
plt.show()

In [None]:
# Scatter of LSTAT against house price.
sns.set(rc={'figure.figsize':(11.7,8.27)})
plt.scatter(X['LSTAT'],y,marker='o')
plt.xlabel("LSTAT")
plt.ylabel("Prices of house in thousands")
# Render the figure. The original ended with an argument-less plt.plot(), which
# draws nothing and only leaves an empty list as the cell output.
plt.show()

In [None]:
# Scatter of RM against house price.
sns.set(rc={'figure.figsize':(11.7,8.27)})
plt.scatter(X.RM,y,marker='o')
plt.xlabel('RM')
plt.ylabel("Prices of house in thousands")
# Render the figure. The original ended with an argument-less plt.plot(), which
# draws nothing and only leaves an empty list as the cell output.
plt.show()

In [None]:
# Histogram of the AGE column using 30 bins.
sns.set(rc={'figure.figsize':(11.7,8.27)})
plt.hist(X.AGE,bins= 30)
plt.xlabel("AGE")
plt.ylabel("Number of Houses")
# Render the figure. The original ended with an argument-less plt.plot(), which
# draws nothing and only leaves an empty list as the cell output.
plt.show()

In [None]:
# Histogram of the TAX column using 30 bins.
sns.set(rc={'figure.figsize':(11.7,8.27)})
plt.hist(X.TAX,bins= 30)
plt.title("Full-value property-tax rate per $10,000")
plt.xlabel("TAX")
plt.ylabel("Number of Houses")
# Render the figure. The original ended with an argument-less plt.plot(), which
# draws nothing and only leaves an empty list as the cell output.
plt.show()

Splitting training and test data

In [None]:
# Single predictor: the RM column. Hold out 20% of the rows for testing;
# the fixed random_state keeps the split reproducible.
data_X = X["RM"]
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
from patsy import dmatrix
import statsmodels.formula.api as smf

Fitting a cubic (degree 3) B-spline with 5 degrees of freedom, i.e. 2 interior knots that patsy places at equally spaced quantiles of the training data

In [None]:
# Design matrix for the training RM values: a degree-3 B-spline basis with df = 5,
# returned as a DataFrame.
X_spline = dmatrix(
    'bs(x,df = 5, degree = 3, include_intercept = False)',
    data={'x': X_train},
    return_type='dataframe',
)
# Fit a GLM of the training prices on the spline basis; no family is given, so
# statsmodels' default is used.
spline_fit = sm.GLM(endog=y_train, exog=X_spline).fit()

Getting Training set error

In [None]:
# Training predictions: reuse X_spline, the exact design matrix the model was
# fitted on, instead of re-deriving the basis from a second formula (whose
# variable was misleadingly called `test`). This guarantees the predictions use
# the fitted basis and avoids recomputing it.
y_pred_train = spline_fit.predict(X_spline)
# Root-mean-squared error on the training rows.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
print(rmse_train)

In [None]:
from sklearn.metrics import r2_score

# R^2 on the training rows; as the cell's last expression it is displayed as the output.
r2_score(y_true=y_train, y_pred=y_pred_train)

Getting Test set error

In [None]:
# Evaluate the test rows with the spline basis learned from the TRAINING data.
# Calling dmatrix('bs(..., df = 5)') on X_test would pick new knots and bounds from
# the test quantiles, producing basis columns that do not correspond to the fitted
# coefficients. build_design_matrices replays the stored design (knots included).
# patsy refuses points outside the outermost knots, so test values are clipped to
# the training range (i.e. out-of-range rooms get the boundary prediction).
test_design = build_design_matrices(
    [X_spline.design_info],
    {'x': X_test.clip(lower=X_train.min(), upper=X_train.max())},
    return_type='dataframe',
)[0]
y_pred = spline_fit.predict(test_design)
# Root-mean-squared error on the held-out rows.
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse_test)

In [None]:
from sklearn.metrics import r2_score

# R^2 on the held-out rows; as the cell's last expression it is displayed as the output.
r2_score(y_true=y_test, y_pred=y_pred)

Comparison of Real And Predicted prices of Training set.

In [None]:
# Side-by-side table of RM, actual price and predicted price for the training rows;
# the three objects are joined column-wise and then given readable column labels.
comparison = pd.concat(
    [X_train, y_train, y_pred_train],
    axis=1,
).set_axis(
    ['Average no. of Rooms', 'Real Prices($1000)', 'Predicted Prices($1000)'],
    axis=1,
)
print(comparison)

Comparison of Real and Predicted Prices of Test Set.

In [None]:
# Side-by-side table of RM, actual price and predicted price for the test rows;
# the three objects are joined column-wise and then given readable column labels.
comparison = pd.concat(
    [X_test, y_test, y_pred],
    axis=1,
).set_axis(
    ['Average no. of Rooms', 'Real Prices($1000)', 'Predicted Prices($1000)'],
    axis=1,
)
print(comparison)

Creating linspaces to make predictions and plot graph.

In [None]:
# 50 evenly spaced RM values spanning the training range, used to draw the fitted curve.
range_pred = np.linspace(np.min(X_train), np.max(X_train), 50)
# Evaluate the grid with the basis stored from the training fit. Re-running
# dmatrix('bs(..., df = 5)') on the grid would place the knots at the grid's own
# quantiles, so the plotted curve would not be the fitted model.
grid_design = build_design_matrices(
    [X_spline.design_info],
    {'x': range_pred},
    return_type='dataframe',
)[0]
prediction = spline_fit.predict(grid_design)

Plotting the fitted curve over the data to see how well the model fits.

In [None]:
# All observations (RM vs. price) with the fitted spline curve drawn on top.
sns.set(rc={'figure.figsize':(11.7,8.27)})
plt.scatter(X.RM,y,marker='o')
# The label describes the spline as configured in the fit (degree 3, df = 5);
# df is the number of basis columns, not the number of knots.
plt.plot(range_pred, prediction, color='r', label='Cubic B-spline fit (df = 5)')
plt.xlabel('RM')
plt.ylabel("Prices of house in thousands")
# Without a legend the curve's label is never displayed.
plt.legend()
# Render the figure (the original ended with an argument-less plt.plot(), which draws nothing).
plt.show()