### Importing and Understanding Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the housing dataset (newhousing.csv) from the Kaggle input directory
housing_csv_path = '/kaggle/input/housing/newhousing.csv'
housing = pd.read_csv(housing_csv_path)

In [None]:
# Preview the first five rows to check that the file parsed as expected
housing.head()

In [None]:
# (rows, columns) of the loaded frame
housing.shape

In [None]:
# What type of values are stored in the columns?
# info() also reports the non-null count per column, which exposes missing data.
housing.info()

## Splitting Data into Training and Testing Sets

In [None]:
# List the available column names before choosing the features and the target
housing.columns

In [None]:
# Feature matrix X: the 15 predictor columns, listed explicitly
feature_columns = [
    'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
    'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
    'parking', 'prefarea', 'semi-furnished', 'unfurnished',
    'areaperbedroom', 'bbratio',
]
X = housing[feature_columns]

# Response vector y: the 'price' column
y = housing['price']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of the raw target.
# sns.distplot is deprecated (seaborn >= 0.11) and scheduled for removal; a
# histogram on the density scale with a KDE overlay is the documented
# replacement and draws the same kind of figure.
sns.histplot(y, kde=True, stat="density", kde_kws=dict(cut=3))
plt.title('Distribution of price (before transformation)')
plt.show()

In [None]:
# Skewness of the target as currently bound to y (a pandas Series at this point)
y.skew()

In [None]:
from scipy import stats

# Box-Cox transform the target to reduce its skew.
# - The input is read from housing['price'] rather than from the current y, so
#   the cell is idempotent: re-running it no longer transforms values that were
#   already transformed by a previous run.
# - The fitted lambda is kept (it used to be discarded) so predictions can be
#   mapped back to the price scale with
#   scipy.special.inv_boxcox(predictions, price_boxcox_lambda).
# NOTE(review): lambda is estimated from all rows, i.e. before the train/test
# split, so the test targets influence the transform.
y, price_boxcox_lambda = stats.boxcox(housing['price'])

In [None]:
# Distribution of the target after the Box-Cox transform.
# sns.distplot is deprecated (seaborn >= 0.11) and scheduled for removal; a
# histogram on the density scale with a KDE overlay is the documented replacement.
sns.histplot(y, kde=True, stat="density", kde_kws=dict(cut=3))
plt.title('Distribution of price (after Box-Cox transformation)')
plt.show()

In [None]:
# Importing matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Let's see the correlation matrix 
plt.figure(figsize = (16,10))     # Size of the figure
sns.heatmap(X.corr(),annot = True)
plt.show()

In [None]:
#creating correlation matrix for the given data
corrmat = np.corrcoef(X.transpose())

In [None]:
#Make a diagonal matrix with diagonal entry of Matrix corrmat
p=np.diagflat(corrmat.diagonal())

In [None]:
# subtract diagonal entries making all diagonals 0
corrmat_diag_zero = corrmat - p
print("max corr:",corrmat_diag_zero.max(), ", min corr: ", corrmat_diag_zero.min(),)


In [None]:
# Retrieve the (i,j) index for which matrix has maximum value
ij_max = np.unravel_index(corrmat_diag_zero.argmax(), corrmat_diag_zero.shape)
print("ij_max",ij_max)
print("Maximum correlation :",corrmat_diag_zero[ij_max])

In [None]:
# Retrieve the (i,j) index for which matrix has absolute minimum value
ij_min = np.unravel_index(np.absolute(corrmat).argmin(), corrmat.shape)
print("ij_min",ij_min)
print("Minimum correlation :",corrmat_diag_zero[ij_min])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer

# 70/30 train/test split. random_state seeds the random number generator used
# for the split (any integer works), which makes the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, train_size=0.7, random_state=42)


In [None]:
X_train.shape
# The split only partitions rows, so all 15 feature columns are still present

In [None]:
# Fit the power transform on the training features only, then apply that same
# fitted transform to the test features (the test split is never used for fitting)
scaler = PowerTransformer()
Xtrain=scaler.fit_transform(X_train) 
Xtest=scaler.transform(X_test) 

In [None]:
# Shape of the transformed training features
Xtrain.shape

In [None]:
# Wrap the transformed array in a DataFrame so columns can be addressed by name
xtrain_df = pd.DataFrame(Xtrain,columns=X_train.columns)
# Not the last expression of the cell, so this value is computed but not displayed
Xtrain[:,0].max()
# Maximum of the transformed 'area' column (this is the cell's displayed output)
xtrain_df['area'].max()

In [None]:
# Number of training targets
y_train.shape

In [None]:
# Same wrapping for the transformed test features, reusing the training column names
xtest_df = pd.DataFrame(Xtest,columns=X_train.columns)
xtest_df['guestroom'].min()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import  linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Baseline model: linear regression on all transformed features.
# fit() returns the estimator itself, so construction and training are chained.
regr = linear_model.LinearRegression().fit(Xtrain, y_train)

# Predictions for the held-out test features
y_pred = regr.predict(Xtest)

In [None]:
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))


In [None]:
# Explained variance score: 1 is perfect prediction
print('R2 score: %.2f' % r2_score(y_test, y_pred))

In [None]:
from sklearn.metrics import explained_variance_score
explained_variance_score(y_test, y_pred) 

In [None]:
# Import the PCA estimator
from sklearn.decomposition import PCA
# No n_components is given here; random_state is fixed at 42
pca = PCA( random_state=42)

In [None]:
# Fit the PCA on the transformed training features only
pca.fit(Xtrain)

In [None]:
# Weight of each original feature in the first two principal components
pc1_loadings, pc2_loadings = pca.components_[0], pca.components_[1]
components = pd.DataFrame(
    {'PC1': pc1_loadings, 'PC2': pc2_loadings, 'Feature': X.columns}
)
components

In [None]:
colnames = list(X_train.columns)

# One 'PCk' column per fitted component and one row per original feature.
# The columns are generated from pca.components_ instead of being written out
# as PC1..PC15 by hand, so the cell keeps working when pca holds a different
# number of components (a hard-coded components_[14] raises IndexError for any
# PCA fitted with fewer than 15 components).
n_pcs = pca.components_.shape[0]
pcs_df = pd.DataFrame(
    pca.components_.T,  # transpose: rows become features, columns become components
    columns=['PC%d' % (k + 1) for k in range(n_pcs)],
)
pcs_df['Feature'] = colnames

In [None]:
# Show the component/feature table built above
pcs_df

In [None]:
# Variance explained by each component
pca.explained_variance_

In [None]:
# Share of the total variance per component, as percentages rounded to one decimal
print("pca.explained_variance_ratio_: ",pca.explained_variance_ratio_.round(3)*100)

In [None]:
# Running total of the explained-variance shares (fractions between 0 and 1)
print (pca.explained_variance_ratio_.cumsum())

In [None]:
# Same running total expressed in percent, with each share rounded to 4 decimals first
np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)

In [None]:
#Making the screeplot - plotting the cumulative variance against the number of components
%matplotlib inline
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.vlines(x=10, ymax=1, ymin=0, colors="r", linestyles="--")
plt.hlines(y=.946, xmax=15, xmin=0, colors="g", linestyles="--")
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [None]:
# Dot product of the first two component vectors, rounded to 5 decimals
first_pc, second_pc = pca.components_[0], pca.components_[1]
product = first_pc @ second_pc
product.round(5)

In [None]:
%matplotlib inline
fig = plt.figure(figsize = (20,12))
plt.scatter(components.PC1, components.PC2)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
for i, txt in enumerate(components.Feature):
    plt.annotate(txt, (components.PC1[i],components.PC2[i]))
plt.tight_layout()
plt.show()

In [None]:
# Project the transformed training features onto the fitted principal components
pca_train = pca.transform(Xtrain)
pca_train.shape

In [None]:
# NOTE(review): this displays the whole projected array; a slice such as
# pca_train[:5] would be easier to read
pca_train

In [None]:
#creating correlation matrix for the principal components
corrmat = np.corrcoef(pca_train.transpose())
corrmat

In [None]:
#plotting the correlation matrix
%matplotlib inline
plt.figure(figsize = (20,10))
sns.heatmap(corrmat,annot = True)
plt.show()

In [None]:
# 1s -> 0s in diagonals
corrmat_nodiag = corrmat - np.diagflat(corrmat.diagonal())
print("max corr:",corrmat_nodiag.max(), ", min corr: ", corrmat_nodiag.min(),)
# we see that correlations are indeed very close to 0

In [None]:
# Project the test features with the PCA that was fitted on the training data.
# NOTE(review): the previous comment said "13 components", but no component
# limit was set when this pca was created, so this is not a 13-component
# projection; check the second value of the shape below.
pca_test = pca.transform(Xtest)
pca_test.shape

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import  linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Linear regression trained on the principal-component scores of the training set.
# fit() returns the estimator itself, so construction and training are chained.
regrpca = linear_model.LinearRegression().fit(pca_train, y_train)

# Predictions from the principal-component scores of the test set
y_pca_pred = regrpca.predict(pca_test)


In [None]:
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pca_pred))


In [None]:
# Explained variance score: 1 is perfect prediction
print('R2 score: %.2f' % r2_score(y_test, y_pca_pred))

In [None]:
# Refit PCA keeping only 10 components.
# NOTE: this rebinds pca; cells above that index pca.components_ beyond the
# tenth row will fail if they are re-run after this cell.
pca = PCA(n_components=10, random_state=42)

# Fit on the training features, then project both splits with that fit
Xtrain_reduced = pca.fit_transform(Xtrain)
Xtest_reduced = pca.transform(Xtest)

# Linear regression on the 10 retained components (the variable name says 6,
# but n_components above is 10)
regrpca6 = linear_model.LinearRegression().fit(Xtrain_reduced, y_train)

# NOTE: this overwrites y_pred, which previously held the baseline model's predictions
y_pred = regrpca6.predict(Xtest_reduced)



In [None]:
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))


In [None]:
# Explained variance score: 1 is perfect prediction
print('R2 score: %.2f' % r2_score(y_test, y_pred))