# Predict the price of houses using Linear Regression

### Here we are trying to predict the house prices (output) based on different features. 

## 1. Import the required libraries

In [None]:
import numpy as np   # linear algebra library
import pandas as pd  # DataFrame / tabular data handling
import matplotlib.pyplot as plt  #to plot graphs
import seaborn as sns  #to plot graphs
from sklearn.linear_model import LinearRegression   #for linear regression model
sns.set()  # apply seaborn's default theme to subsequent plots

import warnings
# NOTE(review): this silences every warning category from here on, including
# deprecation notices raised by pandas / seaborn / scikit-learn.
warnings.filterwarnings('ignore')

## 2. Read the input data

In [None]:
# Load the housing dataset into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../input/housing-dataset/Housing.csv')
# Preview the first five rows.
data.head(n=5)

## 3. Understand your data

In [None]:
# Print a summary of the DataFrame: columns, non-null counts and dtypes.
data.info()

In [None]:
# include='all' summarizes every column regardless of dtype; statistics that
# do not apply to a column's dtype are shown as NaN.
data.describe(include ='all')

## 4. Check for NULL values

In [None]:
# Count missing values per column (the original author observed none).
data.isna().sum()

## 5. Data Preparation

### a) YES/NO categories

Some columns hold the categorical values 'yes' or 'no'. We need to convert them to numbers: 'yes' becomes 1 and 'no' becomes 0.

In [None]:
# Names of the columns that are encoded below by mapping 'yes' -> 1 and 'no' -> 0.
categorical =  ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
def binary_map(x):
    """Encode a yes/no Series as 1/0.

    Args:
        x: pandas Series of yes/no labels. Matching ignores letter case and
            surrounding whitespace, so 'YES', 'Yes' and ' no ' are accepted.

    Returns:
        Series with the same index as ``x`` holding 1 for yes, 0 for no and
        NaN for any value that is neither.
    """
    # Normalize before mapping so that differently-cased or padded labels do
    # not silently become NaN.
    normalized = x.astype(str).str.strip().str.lower()
    return normalized.map({'yes': 1, 'no': 0})

# Encode each yes/no column (column by column) and write the result back.
data[categorical] = data[categorical].apply(binary_map, axis=0)



In [None]:
# Preview the first rows to check the yes/no encoding.
data.head()

### b) Dummy variable

Dummy variables: the last column (`furnishingstatus`) has three categories: furnished, semi-furnished and unfurnished. We need to convert this column to numbers as well.

In [None]:
# One-hot encode 'furnishingstatus' into a separate table with one indicator
# column per category.
table = pd.get_dummies(data['furnishingstatus'])
table.head()

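Two indicator columns are enough to encode three categories, so we drop the first one (furnished) to avoid redundancy. In the remaining (semi-furnished, unfurnished) columns, furnished is encoded as 0 0, semi-furnished as 1 0 and unfurnished as 0 1.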

In [None]:
# Re-create the dummy table, dropping the first category: a row with zeros in
# all remaining columns then stands for the dropped category.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return booleans), consistent with the 0/1
# encoding used for the yes/no columns.
table = pd.get_dummies(data['furnishingstatus'], drop_first=True, dtype=int)
table.head()

In [None]:
# Append the dummy columns to the right-hand side of the main DataFrame.
data = pd.concat(objs=[data, table], axis='columns')
data.head(n=5)

In [None]:
# The original categorical column is no longer needed; remove it in place.
data.drop(columns=['furnishingstatus'], inplace=True)
data.head(n=5)

## 6. See the plots on a graph 

In [None]:
# Grid of pairwise plots for the columns of the prepared dataset.
sns.pairplot(data=data)
plt.show()

## 7. Split data into Training and Testing data

In [None]:
# List the column names of the prepared dataset.
data.columns

In [None]:
from sklearn.model_selection import train_test_split
# Seeds NumPy's global random generator. The split below is already made
# reproducible by its own random_state argument.
np.random.seed(0)
# 70% of the rows go to training and 30% to testing.
df_train, df_test = train_test_split(data, train_size = 0.7, test_size = 0.3, random_state = 100)

In [None]:
# Preview the first rows of the training split.
df_train.head()


## 8. Scaling Training Data: MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescales each column to the [0, 1] range (MinMaxScaler's default feature_range).
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Numeric columns to rescale; the 0/1 yes/no columns and the dummy columns
# are left out.
var_to_scale = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking','price']

In [None]:
# Fit the scaler on the training rows and overwrite those columns with the
# scaled values.
df_train[var_to_scale] = scaler.fit_transform(df_train[var_to_scale])


In [None]:
# Preview the training rows after scaling.
df_train.head()

In [None]:
# Summary statistics of the training split after scaling.
df_train.describe()

## 9. Train the model

In [None]:
# pop() removes the 'price' column from df_train and returns it as the target.
y_train = df_train.pop('price') 
# x_train is the same object as df_train (not a copy), now without 'price'.
x_train = df_train

In [None]:
# Preview the first training target values.
y_train.head()

In [None]:
# Create a linear regression model and fit it on the training features/target.
lm = LinearRegression()
lm.fit(X=x_train, y=y_train)

In [None]:
# Fitted coefficients of the linear model, one per feature column of x_train.
lm.coef_

In [None]:
# R^2 (coefficient of determination) on the training data:
#   1 -> the model explains all of the variability in the target
#   0 -> the model explains none of the variability
# In general the score can also be negative, when a model does worse than
# always predicting the mean of the target.
lm.score(x_train,y_train)

## 10. Scaling Test Data: MinMaxScaler

In [None]:
# Same column list that was used to scale the training data.
var_to_scale = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking','price']

In [None]:
# Reuse the scaler that was fitted on the training data: transform() applies
# the training min/max to the test rows. fit_transform() would re-fit the
# scaler on the test set, so train and test would be scaled differently and
# the model would be evaluated on inconsistently scaled data.
df_test[var_to_scale] = scaler.transform(df_test[var_to_scale])

## 11. Run model using Test data

In [None]:
# pop() removes the 'price' column from df_test and returns it as the target.
y_test = df_test.pop('price')
# x_test is the same object as df_test (not a copy), now without 'price'.
x_test = df_test

In [None]:
# Run the fitted model on the test features to get the predicted prices.
predictions = lm.predict(X=x_test)

## 12. Check the R-squared value

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions against the test targets.
r2_score(y_true=y_test, y_pred=predictions)

## 13. Compare the actual and predicted values

In [None]:

# A pandas Series has no .flatten() method (it raises AttributeError), so the
# target is converted to a NumPy column vector here; the next step flattens it
# back to one dimension.
# NOTE: this bare expression is not the last line of the cell, so nothing is
# displayed for it.
y_test.shape
y_test_matrix = y_test.values.reshape(-1,1)


In [None]:
# Put the actual and predicted values side by side; flatten() reduces each
# array to a single dimension so it can be used as a column.
dframe = pd.DataFrame(
    data={
        'actual': y_test_matrix.flatten(),
        'Predicted': predictions.flatten(),
    }
)

In [None]:
# Show the first 15 actual/predicted pairs.
dframe.head(15)

## 14. Plot Graph

In [None]:
# Scatter plot of the actual test targets against the model's predictions.
fig = plt.figure()
plt.scatter(x=y_test, y=predictions)
plt.title(label='Actual versus Prediction ')
plt.xlabel(xlabel='Actual', fontsize=20)
plt.ylabel(ylabel='Predicted', fontsize=20)

In [None]:
# Same comparison with a regression plot (optional).
# x and y are passed by keyword: they are keyword-only in seaborn >= 0.12,
# where passing them positionally raises a TypeError.
sns.regplot(x=y_test, y=predictions)
plt.title('Actual versus Prediction ')
plt.xlabel('Actual', fontsize=20)
plt.ylabel('Predicted', fontsize=20)