# House price Analysis using Linear regression

**Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**Understand the data**

In [None]:
# Read the house-sales CSV from the Kaggle input directory into a DataFrame.
# NOTE(review): the absolute path only resolves inside a Kaggle kernel with this
# dataset attached — adjust it when running elsewhere.
df = pd.read_csv('/kaggle/input/housesalesprediction/kc_house_data.csv')

In [None]:
# Preview only the first rows: rendering the whole frame bloats the saved
# notebook, and the summary cells that follow describe the full data anyway.
df.head()

In [None]:
# Print a per-column summary: dtype, non-null count, and overall memory usage.
df.info()

In [None]:
# List the column labels available in the raw dataset.
df.columns

In [None]:
# Count the missing entries in each column (a column of zeros means no gaps).
df.isna().sum()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# Inspect the target column on its own.
df.loc[:, 'price']

### Correlation

**Now we check the correlation of all features with price**

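A correlation close to zero means a feature has little linear relationship with the house price; a negative correlation means the price tends to fall as that feature increases.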

In [None]:
# Correlate every numeric feature with the target price.
# Non-numeric columns (presumably the raw `date` strings — verify with df.info())
# are excluded explicitly: pandas >= 2.0 no longer drops them silently and
# raises instead of computing the correlation.
df.select_dtypes(include="number").corrwith(df["price"])

**Explore the correlation between features with the help of a heatmap**

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# Only numeric columns are passed to .corr(): pandas >= 2.0 raises on
# non-numeric data instead of silently skipping it.
fig, ax = plt.subplots(figsize=(15, 11))
sns.heatmap(
    df.select_dtypes(include="number").corr(),
    annot=True,
    cmap="RdYlGn",
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)
ax.set_title("Feature correlation heatmap");

## Data Visualization

In [None]:
# Distribution of the target price.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is
# the documented replacement and draws the same histogram-plus-curve figure.
price_ax = sns.histplot(df["price"], kde=True, stat="density")
price_ax.set_title("Distribution of house prices");

In [None]:
# Scatter every candidate feature against price on a 7x3 grid of panels.
# The features are listed once and plotted in a loop instead of one copy-pasted
# call per panel; the order matches the row-major panel layout.
scatter_features = [
    "bedrooms", "bathrooms", "sqft_living",
    "sqft_lot", "floors", "waterfront",
    "view", "condition", "grade",
    "sqft_above", "sqft_basement", "yr_built",
    "yr_renovated", "zipcode", "lat",
    "long", "sqft_living15", "sqft_lot15",
    "id", "date",
]

fig, ax = plt.subplots(figsize=(27, 30), ncols=3, nrows=7)
panels = ax.ravel()  # flatten the 7x3 grid in row-major order

for panel, feature in zip(panels, scatter_features):
    sns.scatterplot(x=feature, y="price", data=df, ax=panel)

# The grid has more panels than features; hide the unused ones so the figure
# does not end with an empty set of axes.
for panel in panels[len(scatter_features):]:
    panel.set_visible(False)

plt.show()


## Data Cleaning 

**Now we clean our data by removing unnecessary columns on the basis of:**
* Remove all the columns which have negative values in the correlation heatmap
* Remove all the columns which do not affect the price, as checked in the data visualization

In [None]:
# Drop the columns judged uninformative for predicting price.
# The frame is reassigned instead of mutated in place, and errors="ignore" makes
# the cell safe to re-run: a second execution no longer fails with KeyError
# because the columns are already gone.
columns_to_drop = [
    'id', 'date', 'zipcode', 'condition', 'long', 'sqft_lot15',
    'yr_built', 'sqft_lot', 'view', 'waterfront', 'yr_renovated',
]
df = df.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# Preview the first rows of the frame to confirm which columns remain.
df.head()

In [None]:
# List the column labels currently in the frame.
df.columns

## Machine Learning Algorithms

Now we apply the Linear Regression algorithm to predict the house price

## Training and Testing the data

Let's now begin to train our model! We first need to split our data into an X array that contains the features to train on, and a y array with the target variable, in this case the price column.

In [None]:
# Feature matrix: the predictors kept for the regression.
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living',
    'floors', 'grade', 'sqft_above',
    'sqft_basement', 'lat', 'sqft_living15',
]
x = df.loc[:, feature_columns]

# Target: kept as a single-column DataFrame (not a Series).
y = df.loc[:, ['price']]

Now let's split the data into a training set and a testing set. We will train our model on the training set and then use the test set to evaluate the model.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing.
# random_state pins the shuffle so the split, and every metric computed from it,
# is identical on each Restart & Run All.
X_train, x_test, Y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=42
)

## Creating the Training Model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split; fit() returns the estimator
# itself, so construction and training can be chained.
lm = LinearRegression().fit(X_train, Y_train)
lm

## Model Evaluation

In [None]:
# Show the intercept (constant term) learned by the fitted linear model.
print(lm.intercept_)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, cross_val_predict

# Accumulators for model names and their scores.
model = []
score = []

model_pred = []
score_pred = []

# 10-fold cross-validation on the training split: per-fold scores, plus the
# out-of-fold prediction for every training row.
cv_LinReg = cross_val_score(lm, X_train, Y_train, cv=10)
cv_LinReg_pred = cross_val_predict(lm, X_train, Y_train, cv=10)

model.append("Linear Regression")
score.append(r2_score(Y_train, cv_LinReg_pred))

# Visualisation: compare the density of the true training prices with the
# density of the cross-validated predictions.
# sns.distplot(hist=False) is deprecated; kdeplot is its documented replacement.
# Both inputs are flattened to 1-D, since kdeplot draws one curve per 1-D sample.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(np.ravel(Y_train), label='Y-Train', color='r', ax=ax)
sns.kdeplot(np.ravel(cv_LinReg_pred), label='Pred-CV-Value', color='black', ax=ax)
ax.set_xlabel("price")
ax.set_title("CV-prediction with LinearRegression")
ax.legend();  # without an explicit legend the curve labels are never shown