In [48]:
#importing necessary dependencies (step 1)
import pandas as pd
import numpy as np
from sklearn import linear_model #sklearn is a library of ml and linear is a model
from sklearn.model_selection import train_test_split #(t_t_s) is a function to split the data
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
# Step 2: load the Boston housing data set.
# `load_boston` is a function that returns a dictionary-like Bunch exposing
# `.data`, `.target` and `.feature_names`. It was deprecated in scikit-learn 1.0
# and removed in 1.2, where the import itself raises ImportError. In that case
# the same three fields are rebuilt from the original StatLib file
# (NOTE: the fallback needs network access).
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    # Each record spans two physical lines in the file: the even rows carry the
    # first 11 features, the odd rows carry the last 2 features and the target.
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array([
            "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
            "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
        ]),
    )
print(boston)  # for convenience the data is converted to pandas DataFrames in the next step

In [13]:
# Step 3: wrap the raw arrays in DataFrames.
# df_x -> independent variables (x), one column per entry of boston.feature_names.
# df_y -> dependent variable (y): boston.target, the house price to predict.
df_x = pd.DataFrame(data=boston.data, columns=boston.feature_names)
df_y = pd.DataFrame(data=boston.target)


In [None]:

#df_x.head()
#df_x.shape
#df_x.columns
#df_y.head()
#df_x.dtypes
#df_x.nunique()
# Check for missing values
#df_x.isnull().sum()
# See rows with missing values
#df_x[df_x.isnull().any(axis=1)]

In [None]:
# Per-column summary statistics (count, mean, min, max, ...), a quick way to
# spot unusual ranges and candidate outliers before box-plotting.
summary_stats = df_x.describe()
summary_stats

In [None]:
# Pairwise (Pearson) correlation between the feature columns; the cell
# displays the shape of the resulting square matrix.
corr = df_x.corr(method="pearson")
corr.shape

In [None]:
# Heatmap of the feature-correlation matrix, each cell annotated with its
# value rounded to one decimal place.
plt.figure(figsize=(20, 20))
heatmap_opts = {
    "annot": True,
    "annot_kws": {"size": 15},
    "fmt": ".1f",
    "square": True,
    "cbar": True,
    "cmap": "Greens",
}
sns.heatmap(corr, **heatmap_opts)

In [36]:
# Step 4: create the (not yet fitted) linear regression estimator.
# Linear regression is chosen because the target being predicted is numeric.
regm = linear_model.LinearRegression()


In [37]:
# Step 5: split the data, holding out 30% of the rows for testing and keeping
# 70% for training. random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.3,
    random_state=4,
)


In [None]:
# Train the model on the training split.
regm.fit(X=x_train, y=y_train)

In [None]:
# Fitted coefficients of the linear regression model.
print(regm.coef_)

In [None]:
# Fitted y-intercept of the regression model (shown as the cell's output).
regm.intercept_


In [46]:
# Predictions on the training data itself; the evaluation and residual
# cells below compare these against y_train.
y_pred = regm.predict(X=x_train)

In [None]:
# Model evaluation on the training data.
# R^2 and MSE are each computed once and reused for the derived metrics.
n_obs = len(y_train)
n_feat = x_train.shape[1]
r2_train = metrics.r2_score(y_train, y_pred)
mse_train = metrics.mean_squared_error(y_train, y_pred)

print('R^2:', r2_train)
# Adjusted R^2 penalises R^2 for the number of predictors used.
print('Adjusted R^2:', 1 - (1 - r2_train) * (n_obs - 1) / (n_obs - n_feat - 1))
print('MAE:', metrics.mean_absolute_error(y_train, y_pred))
print('MSE:', mse_train)
print('RMSE:', np.sqrt(mse_train))

In [None]:
# Scatter of actual training prices against the model's predictions for them.
ax = plt.gca()
ax.scatter(y_train, y_pred)
ax.set_xlabel("Prices")
ax.set_ylabel("Predicted prices")
ax.set_title("Prices vs Predicted prices")
plt.show()

In [None]:
# Residual check: plot each training residual (actual - predicted) against
# its predicted value.
train_residuals = y_train - y_pred
plt.scatter(y_pred, train_residuals)
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Predicted vs residuals")
plt.show()

In [None]:
# Check whether the training residuals look approximately normal.
# Flatten to 1-D so seaborn draws a single series rather than treating the
# one-column DataFrame as wide-form data.
residuals = np.ravel(y_train - y_pred)
if hasattr(sns, "histplot"):
    # sns.distplot is deprecated since seaborn 0.11; this is the documented
    # replacement that reproduces its density-normalised histogram plus KDE.
    sns.histplot(residuals, kde=True, stat="density", kde_kws=dict(cut=3))
else:
    # Older seaborn releases only provide distplot.
    sns.distplot(residuals)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
# The histogram is normalised to a density, not raw counts.
plt.ylabel("Density")
plt.show()

In [None]:
# Predict prices for the held-out test rows and print them.
y_test_pred = regm.predict(X=x_test)
print(y_test_pred)

In [None]:
# Model evaluation on the held-out test data.
# acc_linreg keeps the test R^2; MSE is computed once and reused for RMSE.
n_test = len(y_test)
n_test_feat = x_test.shape[1]
acc_linreg = metrics.r2_score(y_test, y_test_pred)
mse_test = metrics.mean_squared_error(y_test, y_test_pred)

print('R^2:', acc_linreg)
# Adjusted R^2 penalises R^2 for the number of predictors used.
print('Adjusted R^2:', 1 - (1 - acc_linreg) * (n_test - 1) / (n_test - n_test_feat - 1))
print('MAE:', metrics.mean_absolute_error(y_test, y_test_pred))
print('MSE:', mse_test)
print('RMSE:', np.sqrt(mse_test))

In [None]:
#printing the actual values
#print(y_test)  #for 3rd row machine predicted y_pred[2]=14.45 and actual one was y_test[2]=13.6 a little close

In [None]:
# checking the model performance using the mean squared error of the test predictions
#print(np.mean((y_test.values - y_test_pred)**2))

In [None]:
# another way to compute the test mean squared error, using sklearn
#from sklearn.metrics import mean_squared_error
#print(mean_squared_error(y_test, y_test_pred))