### **Importing libraries**

In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
%matplotlib inline

### **Get the data**

In [None]:
data_frame = pd.read_csv('../input/housesalesprediction/kc_house_data.csv', sep=',', quotechar='"')

In [None]:
print(data_frame.shape)

In [None]:
data_frame.head()

In [None]:
data_frame.info()

### **Get the training features as X and the target label as Y**

In [None]:
X = data_frame.drop(columns = ['id','price','date'], axis= 1)
Y = data_frame['price']/10000
print(X.shape)
print(Y.shape)

In [None]:
X.head()

In [None]:
Y.head()

### **Split training and test data**

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Shapes of the four split pieces, one per line:
# train features, train target, test features, test target.
print(
    X_train.shape,
    Y_train.shape,
    X_test.shape,
    Y_test.shape,
    sep='\n',
)

### **Relation between Features and Price**

In [None]:
# Plot each feature against the training target, one point per training row.
# A scatter plot is used instead of plt.bar: bar() draws a separate rectangle
# (default width 0.8 data units) for every row, so thousands of bars overlap
# and only the tallest one at each position stays visible.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
for ax, feature in zip(axes, ['bedrooms', 'bathrooms', 'sqft_living']):
    ax.scatter(X_train[feature], Y_train, s=4, alpha=0.3)
    ax.set_xlabel(feature)
axes[0].set_ylabel('price / 10000')
plt.show()

In [None]:
# Plot each feature against the training target, one point per training row.
# A scatter plot is used instead of plt.bar: bar() draws a separate rectangle
# (default width 0.8 data units) for every row, so thousands of bars overlap
# and only the tallest one at each position stays visible.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
for ax, feature in zip(axes, ['sqft_lot', 'floors', 'waterfront']):
    ax.scatter(X_train[feature], Y_train, s=4, alpha=0.3)
    ax.set_xlabel(feature)
axes[0].set_ylabel('price / 10000')
plt.show()


In [None]:
# Plot each feature against the training target, one point per training row.
# A scatter plot is used instead of plt.bar: bar() draws a separate rectangle
# (default width 0.8 data units) for every row, so thousands of bars overlap
# and only the tallest one at each position stays visible.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
for ax, feature in zip(axes, ['view', 'condition', 'grade']):
    ax.scatter(X_train[feature], Y_train, s=4, alpha=0.3)
    ax.set_xlabel(feature)
axes[0].set_ylabel('price / 10000')
plt.show()

In [None]:
# Plot each feature against the training target, one point per training row.
# A scatter plot is used instead of plt.bar: bar() draws a separate rectangle
# (default width 0.8 data units) for every row, so thousands of bars overlap
# and only the tallest one at each position stays visible.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
for ax, feature in zip(axes, ['sqft_above', 'sqft_basement', 'yr_built']):
    ax.scatter(X_train[feature], Y_train, s=4, alpha=0.3)
    ax.set_xlabel(feature)
axes[0].set_ylabel('price / 10000')
plt.show()

In [None]:
# Plot each feature against the training target, one point per training row.
# A scatter plot is used instead of plt.bar: bar() draws a separate rectangle
# (default width 0.8 data units) for every row, which is wider than the whole
# data range of a narrow-range feature such as 'lat' and hides the relationship.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
for ax, feature in zip(axes, ['yr_renovated', 'zipcode', 'lat']):
    ax.scatter(X_train[feature], Y_train, s=4, alpha=0.3)
    ax.set_xlabel(feature)
axes[0].set_ylabel('price / 10000')
plt.show()

In [None]:
# Plot each feature against the training target, one point per training row.
# A scatter plot is used instead of plt.bar: bar() draws a separate rectangle
# (default width 0.8 data units) for every row, which is wider than the whole
# data range of a narrow-range feature such as 'long' and hides the relationship.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
for ax, feature in zip(axes, ['long', 'sqft_living15', 'sqft_lot15']):
    ax.scatter(X_train[feature], Y_train, s=4, alpha=0.3)
    ax.set_xlabel(feature)
axes[0].set_ylabel('price / 10000')
plt.show()

### **Build Linear regression Model**

In [None]:
# Ordinary least-squares model on all features.
# The `normalize=True` argument is intentionally not passed: it was deprecated
# in scikit-learn 1.0 and removed in 1.2, where it raises a TypeError. The
# predictions of an unregularised least-squares fit do not depend on feature
# scaling, so dropping it does not change the evaluation below.
linearReg_model = LinearRegression(copy_X=True)

linearReg_model.fit(X_train, Y_train)

# Predictions for the held-out rows, used by the error metrics further down.
Y_pred_linearReg_model = linearReg_model.predict(X_test)

### **Evaluate the Linear Regression Model**

In [None]:
# Coefficient of determination (R^2) of the fitted model on each split.
score_LinearRegTrain = linearReg_model.score(X=X_train, y=Y_train)
print('R squared for training data =', score_LinearRegTrain)

score_LinearRegTest = linearReg_model.score(X=X_test, y=Y_test)
print('R squared for test data =', score_LinearRegTest)

In [None]:
# Mean squared error of the test-set predictions. The target was divided by
# 10000 when Y was built, so this value is in (price / 10000) squared.
Mean_square = mean_squared_error(Y_test, Y_pred_linearReg_model)
# Label spelling fixed (was 'Mean Squaed error').
print('Mean Squared error =', Mean_square)

In [None]:
# Mean absolute error of the test-set predictions (same units as Y).
Mean_absolute = mean_absolute_error(
    y_true=Y_test,
    y_pred=Y_pred_linearReg_model,
)
print('Mean absolute error =', Mean_absolute)

### **Select important features Using SelectPercentile**

In [None]:
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif

In [None]:
from sklearn.feature_selection import f_regression

# Keep the top 70% of features ranked by a univariate F-test.
# The target is a continuous price, so the regression F-test (f_regression) is
# the matching score function; f_classif is an ANOVA test for class labels and
# would treat every distinct price value as its own class.
feature_selection_Percentile = SelectPercentile(score_func=f_regression, percentile=70)

# Fit the selector on the training rows only so the held-out rows cannot
# influence which columns are kept, then apply the learned mask to every row.
# The later train_test_split of X_Percentile uses the same test_size and
# random_state as the original split, so it reproduces the same row partition.
feature_selection_Percentile.fit(X_train, Y_train)
X_Percentile = feature_selection_Percentile.transform(X)

In [None]:
X_Percentile.shape

In [None]:
print(feature_selection_Percentile.get_support())

In [None]:
X_train_Percentile,X_test_Percentile,Y_train_Percentile,Y_test_Percentile = train_test_split(X_Percentile,Y,test_size = 0.3,random_state=0)

In [None]:
# Ordinary least-squares model on the percentile-selected features.
# The `normalize=True` argument is intentionally not passed: it was deprecated
# in scikit-learn 1.0 and removed in 1.2, where it raises a TypeError. The
# predictions of an unregularised least-squares fit do not depend on feature
# scaling, so dropping it does not change the scores computed afterwards.
linearReg_model_Percentile = LinearRegression(copy_X=True)

linearReg_model_Percentile.fit(X_train_Percentile, Y_train_Percentile)

# Predictions for the held-out rows of the reduced feature matrix.
Y_pred_Percentile = linearReg_model_Percentile.predict(X_test_Percentile)

In [None]:
# R^2 of the percentile-selected model on each split.
score_Train_Percentile = linearReg_model_Percentile.score(
    X=X_train_Percentile, y=Y_train_Percentile
)
print('R squared for training data =', score_Train_Percentile)

score_Test_Percentile = linearReg_model_Percentile.score(
    X=X_test_Percentile, y=Y_test_Percentile
)
print('R squared for test data =', score_Test_Percentile)

### **Select important features Using SelectFromModel**

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
# Select features using the linear model as the importance estimator.
feature_selection_FromModel = SelectFromModel(estimator=linearReg_model)

# Fit the selector on the training rows only so the held-out rows cannot
# influence which columns are kept, then apply the learned mask to every row.
# The later train_test_split of X_FromModel uses the same test_size and
# random_state as the original split, so it reproduces the same row partition.
# NOTE(review): the features are not standardised, so coefficient magnitudes
# presumably reflect each column's units as much as its importance -- consider
# scaling before selection; confirm with the notebook owner.
feature_selection_FromModel.fit(X_train, Y_train)
X_FromModel = feature_selection_FromModel.transform(X)

In [None]:
X_FromModel.shape

In [None]:
print(feature_selection_FromModel.get_support())

In [None]:
X_train_FModel, X_test_FModel, Y_train_FModel, Y_test_FModel = train_test_split(X_FromModel,Y,test_size = 0.3,random_state=0)

In [None]:
# Ordinary least-squares model on the features kept by SelectFromModel.
# The `normalize=True` argument is intentionally not passed: it was deprecated
# in scikit-learn 1.0 and removed in 1.2, where it raises a TypeError. The
# predictions of an unregularised least-squares fit do not depend on feature
# scaling, so dropping it does not change the scores computed afterwards.
linearReg_model_FModel = LinearRegression(copy_X=True)

linearReg_model_FModel.fit(X_train_FModel, Y_train_FModel)

# Predictions for the held-out rows of the reduced feature matrix.
Y_pred_FModel = linearReg_model_FModel.predict(X_test_FModel)

In [None]:
# R^2 of the SelectFromModel-based model on each split.
score_Train_FModel = linearReg_model_FModel.score(
    X=X_train_FModel, y=Y_train_FModel
)
print('R squared for training data =', score_Train_FModel)

score_Test_FModel = linearReg_model_FModel.score(
    X=X_test_FModel, y=Y_test_FModel
)
print('R squared for test data =', score_Test_FModel)