### House Prices Model Fitting
 I performed EDA on the same data set, for which the link is as follows:   
 https://www.kaggle.com/tanyachawla412/eda-of-house-prices   
 I then tried to fit regression models by tweaking the parameters and applying different regularisation techniques.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn import metrics
from scipy import stats

In [None]:
import os

# Print the full path of every file found anywhere below the Kaggle input folder.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the house-price CSV into a DataFrame; the multiple linear regression
# models in this notebook are fitted on this data.
dataset = pd.read_csv('../input/house-prices/House_price.csv')

In [None]:
# Preview the first rows of the loaded data.
dataset.head()

In [None]:
# Remove the 'Address' column before modelling.
# NOTE(review): presumably free text that cannot be used as a numeric predictor -- confirm.
dataset = dataset.drop(columns='Address')

In [None]:
# Outlier filter: keep only rows in which every column lies strictly within
# 3 standard deviations of that column's mean (|z-score| < 3).
# NOTE(review): assumes every remaining column is numeric -- confirm.
z = np.abs(stats.zscore(dataset))
rows_within_limit = (z < 3).all(axis=1)
dataset = dataset[rows_within_limit]
dataset.head()

In [None]:
# Predictors are every column except the last; the target is the last column.
predictor_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X, y = predictor_frame.values, target_series.values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [None]:
# Fit a multiple linear regression (ordinary least squares) on the training rows.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
regressor = LinearRegression().fit(X_train, y_train)
regressor

In [None]:
# Predict on the held-out rows and print predictions (left column) next to
# the true target values (right column).
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)  # global setting: also affects later NumPy printing
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate([predicted_column, actual_column], axis=1))

In [None]:
# Show the fitted slope coefficients, then the intercept, one per line.
print(regressor.coef_, regressor.intercept_, sep="\n")

In [None]:
# Test-set evaluation: R^2, MAE, MSE and RMSE (the square root of the MSE).
r2_value = r2_score(y_test, y_pred)
mae_value = metrics.mean_absolute_error(y_test, y_pred)
mse_value = metrics.mean_squared_error(y_test, y_pred)
rmse_value = np.sqrt(mse_value)

print("R-Square Value", r2_value)
print("mean_absolute_error :", mae_value)
print("mean_squared_error : ", mse_value)
print("root_mean_squared_error : ", rmse_value)

In [None]:
# For a scikit-learn regressor, score() returns the coefficient of determination
# (R^2), so the value printed here is the training-set R^2 rather than a
# classification accuracy.
print("Train Accuracy:",regressor.score(X_train, y_train))

In [None]:
# Turn predictions and true values into single-column arrays and take their
# difference. Note that y_pred and y_test stay rebound to the 2-D versions.
y_pred = y_pred.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
residual = y_test - y_pred

# Histogram of the residuals, to inspect how they are distributed.
plt.hist(residual, bins=50)
plt.xlabel('Residual Value')
plt.ylabel('Frequency')
plt.title('Frequency distribution of Residual Values')
plt.show()

In [None]:
# Residuals-vs-predictions plot, used to check the homoscedasticity assumption
# of linear regression (residual spread should not depend on the prediction).
plt.scatter(y_pred, residual)
plt.title('Predicted value vs Residual Value')
# scatter(x, y) draws the predictions on the x-axis and the residuals on the
# y-axis, so the axis labels have to follow that order.
plt.xlabel('Predicted Value')
plt.ylabel('Residual Value')
plt.show()

In [None]:
# The EDA showed a correlation coefficient of 0.46 between the number of
# bedrooms and the number of rooms, so we try fitting the multiple linear
# regression again without 'Number of Bedrooms'.
dataset1 = dataset.drop(columns='Number of Bedrooms')

In [None]:
# Preview the first rows of the frame without 'Number of Bedrooms'.
dataset1.head()

In [None]:
# Rebuild predictors (all but the last column) and target (last column)
# from the reduced frame.
reduced_predictors = dataset1.iloc[:, :-1]
reduced_target = dataset1.iloc[:, -1]
X, y = reduced_predictors.values, reduced_target.values

In [None]:
# Split the current X and y: 30% of the rows go to the test set, with a fixed
# seed (random_state=0) so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [None]:
# Second linear model, fitted on the current training split.
# fit() returns the estimator, so the trailing expression displays the same object.
regressor1 = LinearRegression().fit(X_train, y_train)
regressor1

In [None]:
# Predict on the test rows and print predictions (left column) beside the
# true values (right column).
y_pred = regressor1.predict(X_test)
np.set_printoptions(precision=2)  # global setting: also affects later NumPy printing
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate([predicted_column, actual_column], axis=1))

In [None]:
# Show the fitted coefficients first, then the intercept.
for fitted_parameter in (regressor1.coef_, regressor1.intercept_):
    print(fitted_parameter)

In [None]:
# Test-set evaluation of the current predictions: R^2, MAE, MSE and RMSE.
r2_value = r2_score(y_test, y_pred)
mae_value = metrics.mean_absolute_error(y_test, y_pred)
mse_value = metrics.mean_squared_error(y_test, y_pred)
rmse_value = np.sqrt(mse_value)

print("R-Square Value", r2_value)
print("mean_absolute_error :", mae_value)
print("mean_squared_error : ", mse_value)
print("root_mean_squared_error : ", rmse_value)

In [None]:
# For a scikit-learn regressor, score() returns R^2, so this prints the
# training-set R^2 of regressor1 rather than a classification accuracy.
print("Train Accuracy:",regressor1.score(X_train, y_train))

In [None]:
# Turn predictions and true values into single-column arrays and take their
# difference. Note that y_pred and y_test stay rebound to the 2-D versions.
y_pred = y_pred.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
residual = y_test - y_pred

# Histogram of the residuals, to inspect how they are distributed.
plt.hist(residual, bins=50)
plt.xlabel('Residual Value')
plt.ylabel('Frequency')
plt.title('Frequency distribution of Residual Values')
plt.show()

In [None]:
# Residuals-vs-predictions plot, used to check the homoscedasticity assumption
# of linear regression (residual spread should not depend on the prediction).
plt.scatter(y_pred, residual)
plt.title('Predicted value vs Residual Value')
# scatter(x, y) draws the predictions on the x-axis and the residuals on the
# y-axis, so the axis labels have to follow that order.
plt.xlabel('Predicted Value')
plt.ylabel('Residual Value')
plt.show()

In [None]:
# Fitting the data with regularisation techniques: go back to the frame that
# still contains all predictors (every column but the last); the last column
# is the target.
all_predictors = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X, y = all_predictors.values, target_series.values

In [None]:
# Split the current X and y: 30% of the rows go to the test set, with a fixed
# seed (random_state=0) so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [None]:
from sklearn.linear_model import Ridge
# Ridge (L2-penalised) linear regression with penalty strength alpha=0.01.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this constructor call fails on current releases -- confirm
# the installed version. The usual replacement is to scale the features in a
# Pipeline (e.g. with StandardScaler) ahead of the estimator; that changes the
# scale of coef_ and the effective alpha, so results will not match exactly.
ridgereg = Ridge(alpha=0.01, normalize=True)
ridgereg.fit(X_train, y_train)

In [None]:
# Predict with the ridge model and print predictions (left column) beside the
# true values (right column).
y_pred = ridgereg.predict(X_test)
np.set_printoptions(precision=2)  # global setting: also affects later NumPy printing
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate([predicted_column, actual_column], axis=1))

In [None]:
# Report the fitted ridge coefficients and intercept.
ridge_coefficients = ridgereg.coef_
ridge_intercept = ridgereg.intercept_
print("Coeffients:", ridge_coefficients)
print("Intercept:", ridge_intercept)

In [None]:
# Test-set evaluation of the ridge predictions: R^2, MAE, MSE and RMSE.
r2_value = r2_score(y_test, y_pred)
mae_value = metrics.mean_absolute_error(y_test, y_pred)
mse_value = metrics.mean_squared_error(y_test, y_pred)
rmse_value = np.sqrt(mse_value)

print("R-Square Value : ", r2_value)
print("mean_absolute_error :", mae_value)
print("mean_squared_error : ", mse_value)
print("root_mean_squared_error : ", rmse_value)

In [None]:
# For a scikit-learn regressor, score() returns R^2, so this prints the
# training-set R^2 of the ridge model rather than a classification accuracy.
print("Train Accuracy:",ridgereg.score(X_train, y_train))

In [None]:
# Turn predictions and true values into single-column arrays and take their
# difference. Note that y_pred and y_test stay rebound to the 2-D versions.
y_pred = y_pred.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
residual = y_test - y_pred

# Histogram of the residuals, to inspect how they are distributed.
plt.hist(residual, bins=50)
plt.xlabel('Residual Value')
plt.ylabel('Frequency')
plt.title('Frequency distribution of Residual Values')
plt.show()

In [None]:
# Residuals-vs-predictions plot, used to check the homoscedasticity assumption
# of linear regression (residual spread should not depend on the prediction).
plt.scatter(y_pred, residual)
plt.title('Predicted value vs Residual Value')
# scatter(x, y) draws the predictions on the x-axis and the residuals on the
# y-axis, so the axis labels have to follow that order.
plt.xlabel('Predicted Value')
plt.ylabel('Residual Value')
plt.show()

In [None]:
from sklearn.linear_model import Lasso
# Lasso (L1-penalised) linear regression with penalty strength alpha=0.1.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this constructor call fails on current releases -- confirm
# the installed version. The usual replacement is to scale the features in a
# Pipeline (e.g. with StandardScaler) ahead of the estimator; that changes the
# scale of coef_ and the effective alpha, so results will not match exactly.
lassoreg = Lasso(alpha=0.1, normalize=True)
lassoreg.fit(X_train, y_train)

In [None]:
# Predict with the lasso model and print predictions (left column) beside the
# true values (right column).
y_pred = lassoreg.predict(X_test)
np.set_printoptions(precision=2)  # global setting: also affects later NumPy printing
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate([predicted_column, actual_column], axis=1))

In [None]:
# Report the fitted lasso coefficients and intercept.
lasso_coefficients = lassoreg.coef_
lasso_intercept = lassoreg.intercept_
print("Coeffients:", lasso_coefficients)
print("Intercept:", lasso_intercept)

In [None]:
# Test-set evaluation of the lasso predictions: R^2, MAE, MSE and RMSE.
r2_value = r2_score(y_test, y_pred)
mae_value = metrics.mean_absolute_error(y_test, y_pred)
mse_value = metrics.mean_squared_error(y_test, y_pred)
rmse_value = np.sqrt(mse_value)

print("R-Square Value : ", r2_value)
print("mean_absolute_error :", mae_value)
print("mean_squared_error : ", mse_value)
print("root_mean_squared_error : ", rmse_value)

In [None]:
# For a scikit-learn regressor, score() returns R^2, so this prints the
# training-set R^2 of the lasso model rather than a classification accuracy.
print("Train Accuracy:",lassoreg.score(X_train, y_train))

In [None]:
# Turn predictions and true values into single-column arrays and take their
# difference. Note that y_pred and y_test stay rebound to the 2-D versions.
y_pred = y_pred.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
residual = y_test - y_pred

# Histogram of the residuals, to inspect how they are distributed.
plt.hist(residual, bins=50)
plt.xlabel('Residual Value')
plt.ylabel('Frequency')
plt.title('Frequency distribution of Residual Values')
plt.show()

In [None]:
# Residuals-vs-predictions plot, used to check the homoscedasticity assumption
# of linear regression (residual spread should not depend on the prediction).
plt.scatter(y_pred, residual)
plt.title('Predicted value vs Residual Value')
# scatter(x, y) draws the predictions on the x-axis and the residuals on the
# y-axis, so the axis labels have to follow that order.
plt.xlabel('Predicted Value')
plt.ylabel('Residual Value')
plt.show()