### Importing and Understanding Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the housing dataset (newhousing.csv) from the Kaggle input directory
housing_csv_path = '/kaggle/input/housing/newhousing.csv'
housing = pd.read_csv(housing_csv_path)

In [None]:
# Preview the first five rows of the loaded frame
housing.head(n=5)

In [None]:
# Dimensions of the loaded frame as (number of rows, number of columns)
housing.shape

In [None]:
# Print the DataFrame summary to see which type of values each column stores
housing.info()

## Splitting Data into Training and Testing Sets

In [None]:
# List the column names available before choosing the features and the response
housing.columns

In [None]:
# Columns used as predictors; every column listed here must exist in `housing`
feature_columns = [
    'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
    'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
    'parking', 'prefarea', 'semi-furnished', 'unfurnished',
    'areaperbedroom', 'bbratio',
]

# Feature matrix
X = housing[feature_columns]

# Response variable: the house price
y = housing['price']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Distribution of the response variable.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" draws the equivalent density histogram with a KDE overlay.
ax = sns.histplot(y, kde=True, stat="density")
ax.set_title("Distribution of the response variable")
plt.show()

In [None]:
# Skewness of the response variable, checked before any transformation is applied to it
y.skew()

In [None]:
from scipy import stats
# Box-Cox transform the price target.
# The input is read from housing['price'] instead of from y so that re-running
# this cell does not apply the transform a second time to already-transformed
# values. The fitted lambda is kept so predictions can later be mapped back to
# price units (e.g. with scipy.special.inv_boxcox).
y, price_boxcox_lambda = stats.boxcox(housing['price'])

In [None]:
# Distribution of the response variable as it currently stands in `y`.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" draws the equivalent density histogram with a KDE overlay.
ax = sns.histplot(y, kde=True, stat="density")
ax.set_title("Distribution of the response variable")
plt.show()

In [None]:
# Importing matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Heatmap of the pairwise correlations between the feature columns,
# with each coefficient written inside its cell
feature_corr = X.corr()
plt.figure(figsize=(16, 10))     # Size of the figure
sns.heatmap(feature_corr, annot=True)
plt.show()

In [None]:
# Correlation matrix of the features as a NumPy array.
# np.corrcoef treats each row as one variable, hence the transpose.
corrmat = np.corrcoef(X.T)
corrmat

In [None]:
# Square matrix that holds only the diagonal entries of corrmat (zeros elsewhere)
corrmat_diagonal = corrmat.diagonal()
p = np.diag(corrmat_diagonal)
p

In [None]:
# Remove the diagonal (each feature's correlation with itself) so that only
# correlations between different features are considered
corrmat_diag_zero = corrmat - p
largest_corr = corrmat_diag_zero.max()
smallest_corr = corrmat_diag_zero.min()
print("max corr:", largest_corr, ", min corr: ", smallest_corr)


In [None]:
import matplotlib.pyplot as plt
from sklearn import  linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import explained_variance_score
#Importing the PCA module
from sklearn.decomposition import PCA

In [None]:
# 70/30 train/test split; the fixed random_state seed makes the split repeatable
from sklearn.model_selection import train_test_split
split_parts = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=42
)
X_train, X_test, y_train, y_test = split_parts

# Fit the power transform on the training features only, then apply the
# already-fitted transform to the test features
from sklearn.preprocessing import PowerTransformer
scaler = PowerTransformer()
Xtrain = scaler.fit_transform(X_train)
Xtest = scaler.transform(X_test)



In [None]:
# Fit a PCA with all components on the transformed training features
# so the variance explained by each component can be inspected
pca = PCA(random_state=42).fit(Xtrain)
pca

In [None]:
# Running total of the explained-variance ratio across the principal components
var_cumu = pca.explained_variance_ratio_.cumsum()
var_cumu

In [None]:
# Scree plot: cumulative explained variance against the number of components.
# var_cumu[k] is the variance explained by the first k + 1 components, so the
# curve is drawn against a 1-based component count. That way the red marker at
# x=10 corresponds to 10 components, matching the n_components=10 used when the
# PCA is refitted, rather than to array index 10 (i.e. 11 components).
component_counts = np.arange(1, len(var_cumu) + 1)

fig = plt.figure(figsize=[12,8])
plt.plot(component_counts, var_cumu)
plt.vlines(x=10, ymax=1, ymin=0, colors="r", linestyles="--")
# Draw the 0.94 reference level across every available component instead of
# stopping at a hard-coded x position
plt.hlines(y=0.94, xmax=component_counts[-1], xmin=component_counts[0], colors="g", linestyles="--")
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative variance explained")
plt.show()

In [None]:
# Keep only the first 10 principal components
pca = PCA(n_components=10, random_state=42)

# Learn the projection from the training features and apply it to both splits
Xtrain_reduced = pca.fit_transform(Xtrain)
Xtest_reduced = pca.transform(Xtest)

# Ordinary least-squares regression on the principal components of the training set
regrpca = linear_model.LinearRegression().fit(Xtrain_reduced, y_train)

# Predict the held-out targets from the principal components of the test set
y_pred = regrpca.predict(Xtest_reduced)

# Error is measured on whatever scale y_train / y_test are in
test_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % test_mse)



In [None]:
# R2 (coefficient of determination) on the test split: 1 is perfect prediction
test_r2 = r2_score(y_test, y_pred)
print('R2 score: %.2f' % test_r2)