# Problem statement
__Prediction Question:__ <br>
How accurately can I predict the price of a house, given the values of all variables? 

In [None]:
# import libraries

import numpy as np
import pandas as pd
# Bind pyplot (not the top-level matplotlib package) to `plt`; with
# `import matplotlib as plt`, calls such as plt.figure() or plt.show() fail.
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data

# Location of the raw CSV inside the Kaggle input directory.
csv_path = '/kaggle/input/others/House_Price.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Check the shape of the dataset

# df.shape is a (rows, columns) tuple; unpack it once and report both parts.
n_rows, n_cols = df.shape
print(df.shape)
print("Number of rows: ",n_rows)
print("Number of columns: ",n_cols)

In [None]:
# Column names, dtypes and non-null counts; used to spot columns with missing values.
df.info()

In [None]:
 # Summary statistics of the raw data.
 # NOTE(review): this cell starts with a stray leading space; IPython tolerates it,
 # but it would presumably be an IndentationError in a plain .py export — confirm and remove.
 df.describe()

In [None]:
# Skewness of every numeric column.
# numeric_only=True is needed on pandas >= 2.0, where skew() raises on the string
# columns (airport, waterbody, bus_ter) instead of silently dropping them.
df.skew(numeric_only=True)

##### Univariate Analysis On the Continuous variables:

In [None]:
# Names of all columns with a numeric dtype, treated here as the continuous variables.
numeric_frame = df.select_dtypes(include=[np.number])
df_continuous = numeric_frame.columns.tolist()
print("Continuous variables: \n",df_continuous)

In [None]:
# Scatter of crime_rate against price, with both marginal distributions.
sns.jointplot(data=df, x='crime_rate', y='price')

As "crime_rate" increases the price drops. 

In [None]:
# Scatter of resid_area against price, with both marginal distributions.
sns.jointplot(data=df, x='resid_area', y='price')

In [None]:
# Scatter of air_qual against price, with both marginal distributions.
sns.jointplot(data=df, x='air_qual', y='price')

In [None]:
# Scatter of room_num against price, with both marginal distributions.
sns.jointplot(data=df, x='room_num', y='price')

In [None]:
# Scatter of teachers against price, with both marginal distributions.
sns.jointplot(data=df, x='teachers', y='price')

In [None]:
# Scatter of n_hot_rooms against price, with both marginal distributions.
sns.jointplot(data=df, x='n_hot_rooms', y='price')

In [None]:
# Scatter of rainfall against price, with both marginal distributions.
sns.jointplot(data=df, x='rainfall', y='price')

- As "crime_rate" increases, "price" decreases, which suggests a negative correlation between the two.<br>
- As the "room_num" increases the "price" increases too.<br>
- In "n_hot_rooms" variable, most of the data lies between 1 & 20 but there are 2 outliers.<br>
- "rainfall" variable has most of the data between 20 & 60 but there is one outlier below 10 which might be a sampling error or calculation error. 

##### Univariate Analysis On the Categorical variables:

In [None]:
# Names of all columns stored with the generic object dtype (strings here),
# treated as the categorical variables.
# The np.object alias was removed in NumPy 1.24; the "object" dtype string
# selects the same columns on every NumPy version.
df_categorical = [col for col in df.select_dtypes(include=["object"])]
print("Categorical variables: \n",df_categorical)

In [None]:
# Number of rows per airport category.
sns.countplot(data=df, x='airport')

In [None]:
# Number of rows per waterbody category.
sns.countplot(data=df, x='waterbody')

In [None]:
# Number of rows per bus_ter category.
sns.countplot(data=df, x='bus_ter')

__Outlier Treatment__

In [None]:
# 99th percentile of n_hot_rooms; q is passed as a list, so the result is a one-element array.
np.percentile(df['n_hot_rooms'],[99])

In [None]:
# 99th percentile of n_hot_rooms as a scalar; reference value for capping high outliers.
upper_limit = np.percentile(df['n_hot_rooms'], 99)
upper_limit

In [None]:
# Rows whose n_hot_rooms value lies above the 99th percentile.
df.loc[df['n_hot_rooms'] > upper_limit]

In [None]:
# n=3

# Cap n_hot_rooms at 3x the 99th percentile: values above the cap are replaced
# by the cap, everything else (including NaN) is left unchanged.
# The column is rebuilt with Series.clip and assigned back in one step; the
# earlier chained form df[col][mask] = value raises SettingWithCopyWarning and,
# under pandas Copy-on-Write, writes to a temporary and leaves df untouched.
df['n_hot_rooms'] = df['n_hot_rooms'].clip(upper=3*upper_limit)

In [None]:
# 1st percentile of rainfall as a scalar; reference value for flooring low outliers.
lower_limit = np.percentile(df['rainfall'], 1)
lower_limit

In [None]:
# Rows whose rainfall value lies below the 1st percentile.
df.loc[df['rainfall'] < lower_limit]

In [None]:
# Floor rainfall at 0.3x the 1st percentile: values below the floor are replaced
# by the floor, everything else is left unchanged.
# Series.clip plus column assignment replaces the chained form
# df.rainfall[mask] = value, which raises SettingWithCopyWarning and has no effect
# under pandas Copy-on-Write. Reassigning the whole column also sidesteps dtype
# conflicts if rainfall is stored as integers (the floor is a float).
df['rainfall'] = df['rainfall'].clip(lower=0.3*lower_limit)

In [None]:
# crime_rate against price again, after the outlier treatment above.
sns.jointplot(data=df, x='crime_rate', y='price')

In [None]:
# Summary statistics after capping n_hot_rooms and flooring rainfall.
df.describe()

In [None]:
# Non-null counts per column, to see which columns still need imputation.
df.info()

 __Missing Value Treatment__

In [None]:
# Replace missing n_hos_beds values with the mean of the observed values.
n_hos_beds_mean = df["n_hos_beds"].mean()
df["n_hos_beds"] = df["n_hos_beds"].fillna(n_hos_beds_mean)

In [None]:
# Re-check non-null counts after imputing n_hos_beds.
df.info()

__Bivariate Analysis__

In [None]:
# crime_rate against price before the log transform, for comparison.
sns.jointplot(data=df, x='crime_rate', y='price')

In [None]:
# Replace crime_rate with log(1 + crime_rate).
# The column is overwritten in place, so re-running this cell applies the
# transform a second time.
crime_rate_plus_one = df['crime_rate'] + 1
df['crime_rate'] = np.log(crime_rate_plus_one)

In [None]:
# crime_rate against price after the log transform.
sns.jointplot(data=df, x='crime_rate', y='price')

In [None]:
# Collapse the four distance columns into their row-wise arithmetic mean.
dist_total = df['dist1'] + df['dist2'] + df['dist3'] + df['dist4']
df['avg_dist'] = dist_total / 4

In [None]:
# Summary statistics including the new avg_dist column.
df.describe()

In [None]:
# Remove the individual distance columns (now summarised by avg_dist) and bus_ter.
redundant_cols = ['dist1','dist2','dist3','dist4','bus_ter']
df = df.drop(columns=redundant_cols)

In [None]:
# create dummy variables

# One-hot encode every remaining string column.
# dtype=int keeps the indicators numeric: since pandas 2.0 get_dummies returns
# bool columns by default, and a frame mixing float and bool columns turns into
# an object array when statsmodels converts it, which makes OLS raise.
df = pd.get_dummies(df, dtype=int)

In [None]:
# Inspect the frame after dummy encoding.
df.head()

In [None]:
# Drop one dummy per categorical variable so the remaining indicators are not
# perfectly collinear with the intercept.
# errors='ignore' keeps the cell working when a reference column is absent.
# NOTE(review): pandas >= 2.0 read_csv presumably parses the literal string
# "None" as missing, in which case get_dummies never creates waterbody_None and
# those rows already act as the all-zero baseline — confirm against the data.
df = df.drop(columns=['airport_NO','waterbody_None'], errors='ignore')

In [None]:
# Show the first row to confirm which columns remain.
df.head(1)

In [None]:
# Pairwise correlation matrix of the remaining columns.
df.corr()

In [None]:
# Remove the parks column.
# NOTE(review): presumably dropped because of a high correlation seen in the
# matrix above — confirm.
df = df.drop(columns='parks')

In [None]:
# Show the first row to confirm the final set of columns.
df.head(1)

__Linear Regression model__

In [None]:
import statsmodels.api as sn

In [None]:
# Design matrix for the single-feature model: room_num plus a constant column
# added by add_constant (`sn` is statsmodels.api).
X = sn.add_constant(df['room_num'])

In [None]:
# Ordinary least squares of price on room_num (with intercept), via statsmodels.
simple_ols = sn.OLS(df['price'], X)
lm = simple_ols.fit()

In [None]:
# Regression table for the single-feature statsmodels fit.
lm.summary()

In [None]:
# another method

from sklearn.linear_model import LinearRegression

In [None]:
# Target vector for the scikit-learn single-feature model.
y = df['price']

In [None]:
# Feature matrix with room_num as its only column (double brackets keep it 2-D).
# This rebinds X, which previously held the statsmodels design matrix.
X = df[['room_num']]

In [None]:
# scikit-learn linear regression estimator with default settings.
lm2 = LinearRegression()

In [None]:
# Fit price on room_num using the full dataset.
lm2.fit(X,y)

In [None]:
# Fitted intercept and coefficient array of the single-feature model.
print('intercept: ',lm2.intercept_,'\ncoefficient: ',lm2.coef_)

In [None]:
# In-sample predictions: the model is applied to the same rows it was fitted on.
lm2.predict(X)

Predicted house prices.

In [None]:
# room_num against price with a fitted regression line (kind='reg').
sns.jointplot(data=df, x='room_num', y='price', kind='reg')

In [None]:
# Feature matrix for the multiple regression: every column except the target.
X_multi = df.drop(columns='price')

In [None]:
# Target vector for the multiple regression.
y_multi = df['price']

In [None]:
# Preview the first target values.
y_multi.head()

In [None]:
# Full feature matrix with a constant column added for the statsmodels fit.
X_multi_cons = sn.add_constant(X_multi)

In [None]:
# Preview the design matrix, including the added constant column.
X_multi_cons.head()

In [None]:
# Ordinary least squares of price on all features (with intercept), via statsmodels.
multi_ols = sn.OLS(y_multi, X_multi_cons)
lm_multi = multi_ols.fit()

In [None]:
# Regression table for the multi-feature statsmodels fit.
lm_multi.summary()

In [None]:
# scikit-learn estimator for the multi-feature model.
lm3 = LinearRegression()

In [None]:
# Fit on the full dataset (no hold-out split yet).
lm3.fit(X_multi,y_multi)

In [None]:
# Fitted intercept followed by one coefficient per column of X_multi.
print(lm3.intercept_,lm3.coef_)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_multi,
    y_multi,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Shapes of the four pieces produced by the split.
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

In [None]:
# Estimator for the train/test evaluation.
lm_a = LinearRegression()

In [None]:
# Fit on the training rows only.
lm_a.fit(X_train,y_train)

In [None]:
# Predictions for the held-out test rows (despite the y_test_ prefix, these are
# model outputs, not labels).
y_test_a = lm_a.predict(X_test)

In [None]:
# Predictions for the training rows, used to compare train and test fit.
y_train_a = lm_a.predict(X_train)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R² on the held-out test set (true values first, predictions second).
r2_score(y_test,y_test_a)

In [None]:
# R² on the training set, for comparison with the test score.
r2_score(y_train,y_train_a)

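With the base model we get an R² score of about 54%, i.e. the model explains roughly 54% of the variance in price (R² measures explained variance, not classification accuracy).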