In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df= pd.read_csv('../input/dataset/House_Price.csv')
df.head()

In [None]:
df.shape

In [None]:
df['waterbody'].value_counts()

In [None]:
df.describe() # EDD

In [None]:
sns.jointplot(x = 'n_hot_rooms', y ='price', data =df) # variable n_hot_rooms looks like outliers

In [None]:
sns.jointplot(x ='rainfall', y ='price', data= df) # outliers in rainfall

In [None]:
# Categorical variables- airport & 
sns.countplot(x = 'airport', data =df)

In [None]:
sns.countplot(x = 'waterbody', data =df)

# Observations
1. Missing values in `n_hos_beds`.
2. Skewness or outliers in `crime_rate`.
3. Outliers in `n_hot_rooms` and `rainfall`.
4. `bus_ter` has only 'yes' values.


#Outliers treatment
#------- Outliers detection---------------------------------------------
# IQR rule on the two columns whose plots showed outliers: a row is flagged when
# either value lies more than 1.5 * IQR below Q1 or above Q3.
outlier_cols = ['n_hot_rooms', 'rainfall']
Q1 = df[outlier_cols].quantile(0.25)
Q3 = df[outlier_cols].quantile(0.75)

IQR = Q3 - Q1
print(IQR)

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(lower_bound, upper_bound)

#Step3: Remove the outliers using the IQR score
# Compare only the columns the bounds were computed for; comparing the whole frame
# would also drag the unrelated (including non-numeric) columns into the test.
is_outlier = ((df[outlier_cols] < lower_bound) | (df[outlier_cols] > upper_bound)).any(axis=1)

# Store the filtered rows under a new name so `df` stays intact: the two prints
# below can then report a real before/after, and later cells still see every row.
df_out = df[~is_outlier]

print("The no. of rows before outlier filtering was: ", df.shape)
print("The no. of rows after outlier filtering is: ", df_out.shape)

In [None]:
# Cap extreme n_hot_rooms values: anything above 3x the 99th percentile is set to that limit.
uv = np.percentile(df.n_hot_rooms, [99])[0]  # upper value: 99th percentile ([0] takes the single result)
# Assign through .loc on the frame itself. The `df.col[mask] = value` form is chained
# assignment: it triggers SettingWithCopyWarning and does not modify `df` at all
# when pandas Copy-on-Write is active.
df.loc[df.n_hot_rooms > 3 * uv, 'n_hot_rooms'] = 3 * uv

# Floor extreme rainfall values: anything below 0.3x the 1st percentile is raised to that limit.
lv = np.percentile(df.rainfall, [1])[0]  # lower value: 1st percentile
df.loc[df.rainfall < 0.3 * lv, 'rainfall'] = 0.3 * lv

In [None]:
df.describe()

In [None]:
# Missing value imputation with mean- variable 
df.info()

In [None]:
df.isna().sum()

In [None]:
# missing value replace with mean for n_hos_beds variable

df.n_hos_beds = df.n_hos_beds.fillna(df.n_hos_beds.mean()) 

#df = df.fillna(df.mean()) in case we want to impute for all variables

In [None]:
df.isna().sum() # Missing value replaced

In [None]:
#Transforming crime_rate variable
sns.jointplot(x ='crime_rate', y ='price', data = df)  #curve looks like logarithmic

In [None]:
#Using log function to transform
df.crime_rate = np.log(1+ df.crime_rate)
sns.jointplot(x ='crime_rate', y ='price', data = df) # here the relationship is somehow linear after transformation

In [None]:
# These four variable gives the same information so we will create one var by taking avg  dist1, dist2, dist3, dist4 

df['avg_dist'] = (df.dist1 + df.dist2 + df.dist3 + df.dist4)/4
df.head()

In [None]:
#Removing the varibles dist1, dist2, dist3, dist4, bus_ter

df = df.drop(['dist1','dist2','dist3', 'dist4','bus_ter'], axis = 1)
df.head()


In [None]:
# Create dummy (indicator) variables for the categorical columns.
# dtype=int keeps the indicators numeric. Recent pandas releases default to bool, and a
# frame that mixes bool and float columns converts to an object array, which the
# statsmodels OLS fit later in this notebook rejects.
df = pd.get_dummies(df, dtype=int)

In [None]:
df.head()

In [None]:
df = df.drop(['airport_NO','waterbody_None'], axis =1)

In [None]:
df.head()

# CORRELATION MATRIX

In [None]:
import seaborn as sns
#df.corr()
data_cor = df.corr()
plt.figure(figsize =(15,6))
g = sns.heatmap(data_cor, annot = True, cmap ="RdYlGn")

In [None]:
df= df.drop(['parks'], axis =1) # Parks and air_qual are highly correlated > 0.8 removed one to avoid multicollinearity

In [None]:
df.head()

# SIMPLE LINEAR REGRESSION - OLS METHOD

In [None]:
import statsmodels.api as sn
# Regress price on room_num alone. add_constant supplies the intercept column,
# which statsmodels' OLS does not add on its own.
X = sn.add_constant(df['room_num'])
lm = sn.OLS(endog=df['price'], exog=X).fit()
lm.summary()

In [None]:
#Machine learning Method

from sklearn.linear_model import LinearRegression
y =df['price']
X = df[['room_num']]

In [None]:
lm2 = LinearRegression()

In [None]:
lm2.fit(X,y)

In [None]:
print(lm2.intercept_, lm2.coef_)

In [None]:
lm2.predict(X)

In [None]:
sns.jointplot(x = df['room_num'], y = df['price'], data = df, kind = 'reg')  #help(sns.jointplot)

# Multiple Linear Regression

In [None]:
df.head()

In [None]:
#Defining X and Y

X_multi = df.drop('price', axis =1) # axis= 1 for dropping column, axis = 0 for row
y_multi = df['price']
X_multi.head()

In [None]:
X_multi_cons = sn.add_constant(X_multi)

In [None]:
X_multi_cons.head()

In [None]:
lm_multi = sn.OLS(y_multi, X_multi_cons).fit()
lm_multi.summary()

In [None]:
lm3 = LinearRegression()
lm3.fit(X_multi, y_multi)
print(lm3.intercept_, lm3.coef_)

# SPLITTING TRAIN & TEST DATA

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_multi, y_multi, test_size = 0.2, random_state = 0)
print(X_train.shape, X_test.shape)


In [None]:
# Call a model
lm_a = LinearRegression()

# fit the model
lm_a.fit(X_train, y_train)

# predict the model

y_test_a = lm_a.predict(X_test)
#print(y_test_a)

y_train_a = lm_a.predict(X_train)
#print(y_train_a)

In [None]:
#Performance metrics #--- Overfitting model

from sklearn.metrics import r2_score
# R squared on the held-out rows first, then on the training rows.
print("R squared value for test :", r2_score(y_true=y_test, y_pred=y_test_a))
print("R squared value for train:", r2_score(y_true=y_train, y_pred=y_train_a))

# RIDGE & LASSO REGRESSION

In [None]:
# transform with standard scaler
from sklearn import preprocessing
# Fit the scaler on the training rows only, then apply that same scaling to both
# partitions so nothing from the test rows influences the transformation.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from sklearn.linear_model import Ridge

In [None]:
lm_r = Ridge(alpha = 0.5)
lm_r.fit(X_train_s, y_train)
lm_r.predict(X_test_s)

In [None]:
print("R squared after Ridge:", r2_score(y_test, lm_r.predict(X_test_s)))

In [None]:
# HYPER PARAMETER TUNING

from sklearn.model_selection import validation_curve

# Candidate alpha values: 100 points evenly spaced on a log scale from 10**-2 to 10**8,
# used to search for the best regularisation strength.
param_range = np.logspace(start=-2, stop=8, num=100)
param_range

In [None]:
# Run `validation_curve?` in a scratch cell to see the full list of options.
# Cross-validate Ridge once per candidate alpha and collect R squared on the training
# and validation folds. param_name / param_range are keyword-only in current
# scikit-learn, so passing them positionally raises TypeError. cv=5 is stated
# explicitly because the cells below average over 5 results per alpha.
train_scores, test_scores = validation_curve(
    Ridge(),
    X_train_s,
    y_train,
    param_name="alpha",
    param_range=param_range,
    cv=5,
    scoring="r2",
)

In [None]:
print(train_scores)
print(test_scores)

In [None]:
train_mean = np.mean(train_scores, axis=1)  #taking mean of 5 results above of each value
train_mean # 100 r squared value of 100 alpha or lambda

In [None]:
test_mean = np.mean(test_scores, axis =1)
test_mean

In [None]:
# taking out the highest r squared value
print('highest R squared-train',max(train_mean))
print('highest R squared-test',max(test_mean))

In [None]:
#Plot the r square values v/s lambda(alpha)

sns.jointplot(x = np.log(param_range), y = test_mean)


In [None]:
# locating where optimum value of r squared

np.where(test_mean == max(test_mean))

# maximum R-squared value lies in 31 index of alpha

In [None]:
param_range[31]

In [None]:
# Since we get best lambda or alpha value, let's build the model with this

lm_r_best = Ridge(alpha = param_range[31])
lm_r_best.fit(X_train_s, y_train)

In [None]:
print('R squared of test', r2_score(y_true=y_test, y_pred=lm_r_best.predict(X_test_s)))
print('R squared of train', r2_score(y_true=y_train, y_pred=lm_r_best.predict(X_train_s)))

# No improvement in R squared after tuning, presumably because the dataset is very small.

In [None]:
from sklearn.linear_model import Lasso
lm_l = Lasso(alpha = 0.5)
lm_l.fit(X_train_s, y_train)
lm_l.predict(X_test_s)

In [None]:
print("R squared after Lasso:", r2_score(y_test, lm_l.predict(X_test_s)))
# continue the same process like we did in Ridge but considering R squared is too less, not going through 