In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))


In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Load the Hyderabad listings from the Kaggle input mount
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/housing-prices-in-metropolitan-areas-of-india/Hyderabad.csv'
)

In [None]:
# First five rows (head() defaults to n=5)
data.head()

In [None]:
# Descriptive analysis of the raw data
data.describe()

In [None]:
# Column dtypes and non-null counts
data.info()

<p>The dataset has the target variable <b>Price</b>; all other columns are independent variables.<br>
All independent variables are boolean in nature except Area, No. of Bedrooms and Location. Null values are represented as 9 in the boolean columns, which is why the non-null counts above show no blank or missing data.</p>

In [None]:
# Area vs. price before any cleaning; the title records what the plot shows
plt.scatter(data['Area'], data['Price'], alpha=0.5)
plt.xlabel('Area')
plt.ylabel('Rupees of Order 10 Crores')
plt.title('outlier points at area > 6000 & price at  16Cr')

In [None]:
# Box plot of the raw prices to make the outliers visible
sb.boxplot(data=data, x='Price')

In [None]:
# Distribution of house areas in 12 equal-width bins
plt.hist(x=data['Area'], bins=12)
plt.title('Most of House areas in dataset are under 2000')

In [None]:
# Number of distinct locations (the author recorded 243 for this dataset).
# dropna=False keeps the count identical to len(unique()), which counts NaN as a value.
data['Location'].nunique(dropna=False)

In [None]:
# Ten locations with the most listings in the dataset (count of Price rows per location)
(
    data.groupby('Location')['Price']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Concatenate every listing's location into one string and draw a word cloud of it
Location_text = " ".join(data['Location'])
wordcloud = WordCloud(
    width=720,
    height=360,
    collocations=False,
).generate(text=Location_text)

plt.figure(figsize=(30, 18))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#Location_text

<h2>Data Cleaning</h2>
Handling outliers and null values (nulls are encoded as 9 in this data set)

In [None]:
#from sklearn.impute import SimpleImputer
#imp=SimpleImputer(missing_values=9,strategy=median)

In [None]:
# Replace the sentinel value 9 with NaN in the boolean flag columns.
# `data[5:]` selects ROWS 5 onwards, so it skipped the first five rows and touched every
# column of the rest; select the COLUMNS from position 5 onwards instead.
# Assumes the 0/1 flag columns start at position 5 - TODO confirm against data.columns.
flag_cols = data.columns[5:]
data[flag_cols] = data[flag_cols].replace(to_replace=9, value=np.nan)
data.isnull().sum()

In [None]:
# Row count before removing null values
len(data.index)

In [None]:
# Keep only complete rows. The original `data[5:]` is a ROW slice and silently threw away
# the first five records; instead, treat a row as incomplete when it has a NaN anywhere
# or still carries the sentinel 9 in a flag column.
# Assumes the 0/1 flag columns start at position 5 - TODO confirm against data.columns.
flag_cols = data.columns[5:]
incomplete = data.isnull().any(axis=1) | data[flag_cols].eq(9).any(axis=1)
# .copy() so that later column assignments work on an independent frame
data = data.loc[~incomplete].copy()
data.isnull().sum()

In [None]:
# Row count after removing null values
len(data.index)

In [None]:
#To impute data rather than removing nulls
"""
from sklearn.impute import SimpleImputer
imp=SimpleImputer(missing_values=9,strategy='most_frequent')
cols=data.columns[5:]
data[cols]=imp.fit_transform(data[cols])
"""

In [None]:
# Summary statistics again; per the author, the data is clean at this point
data.describe()

In [None]:
#data.to_csv('Hyderabad-vcr.csv')

Imputing Outliers: Price values outside the 1.5 × IQR fences are replaced by the nearest fence value

In [None]:
# Fences for Price: 1.5 * inter-quartile range beyond the first and third quartiles
q1, q3 = (data['Price'].quantile(q) for q in (0.25, 0.75))
irp = q3 - q1
low, upr = q1 - 1.5 * irp, q3 + 1.5 * irp
# The author noted that `low` came out negative and considered data['Price'].min() instead
low, upr

In [None]:
def imp(val, lower=None, upper=None):
    """Cap a single value to the range [lower, upper].

    Parameters
    ----------
    val : number
        Value to cap.
    lower, upper : number, optional
        Bounds to cap against. When omitted they fall back to the notebook-level
        `low` and `upr`, so existing calls such as `Series.apply(imp)` behave as before.

    Returns
    -------
    `upper` if val is above it, `lower` if val is below it, otherwise `val` unchanged
    (NaN fails both comparisons and is returned as is).
    """
    lower = low if lower is None else lower
    upper = upr if upper is None else upper
    # Upper bound is checked first, matching the original order of the tests
    if val > upper:
        return upper
    if val < lower:
        return lower
    return val
# Cap every price with imp(), element by element
data['Price'] = data['Price'].map(imp)

In [None]:
# Area vs. price after capping the price outliers
plt.scatter(data['Area'], data['Price'], alpha=0.5)

In [None]:
# Box plot of the capped prices
sb.boxplot(data=data, x='Price')

In [None]:
# Correlation heatmap of the numeric columns.
plt.subplots(figsize=(25, 20))
colormap = sb.diverging_palette(220, 10, as_cmap=True)
# `Location` is still a string column here. DataFrame.corr() raises on non-numeric columns
# from pandas 2.0 onwards (older versions dropped them silently), so select the numeric
# columns explicitly; this works on every pandas version.
sb.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap=colormap)

## Transforming location variable 

In [None]:
# Location has many unique values, so it is binned into a few groups instead of being
# used directly. First step: mean Price per location, sorted from cheapest to dearest.
Location_table = (
    data.groupby('Location')[['Price']]
    .mean()
    .sort_values(by='Price')
)

In [None]:
# Cheapest locations by mean price
Location_table.head(5)

In [None]:
# Split the range of mean prices into 10 equal-width bands labelled G0 (cheapest) .. G9
Location_table['Loc'] = pd.cut(
    Location_table['Price'],
    bins=10,
    labels=[f'G{band}' for band in range(10)],
    include_lowest=True,
)

In [None]:
# Band assigned to the first few locations
Location_table['Loc'].head(5)

In [None]:
# Keep only the band label, then attach it to every listing: the listing's Location
# column is matched against the index of Location_table (which is the location name).
Location_table = Location_table.drop(columns="Price")
data = data.merge(
    Location_table,
    how='left',
    left_on='Location',
    right_index=True,
).drop(columns='Location')  # the raw location is replaced by its band

In [None]:
# Listings with the Loc band in place of Location
data.head(5)

In [None]:
# One-hot encode the location band; the first level is dropped to serve as the baseline
data = pd.get_dummies(data=data, columns=['Loc'], drop_first=True)
data.head()

In [None]:
#data.to_csv('Hyderabad-vtcr.csv')

## Implementing Linear Regression

In [None]:
from sklearn.preprocessing import StandardScaler as ss

# Target stays on its original scale; only the predictors are standardised
Y = data['Price']
scalar = ss()
X = data.drop(columns=['Price'])
# fit_transform returns a plain array, so wrap it back into a DataFrame with the column names
X = pd.DataFrame(data=scalar.fit_transform(X), columns=X.columns)
X.head()

In [None]:
# Checking multicollinearity: list every ordered pair of distinct features whose
# correlation is stronger than 0.5 in either direction.
k = X.corr()
# abs() must wrap the correlation, not the threshold: `k.loc[i,j] > abs(0.5)` ignored
# strong negative correlations.
z = [[str(i), str(j)]
     for i in k.columns
     for j in k.columns
     if i != j and abs(k.loc[i, j]) > 0.5]
z, len(z)

In [None]:
# Calculating the variance inflation factor (VIF) of every feature
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

VIF = pd.Series(
    [vif(X.values, position) for position, column in enumerate(X.columns)],
    index=X.columns,
)
VIF

In [None]:
# Columns with VIF > 5 have to go, but every removal changes the remaining VIFs,
# so the procedure is: remove one column, recompute, repeat.
def MC_rem(data):
    """Drop the column with the highest VIF if that VIF exceeds 5.

    The frame passed in is modified in place and also returned. A message naming the
    removed column (or saying that nothing was removed) is printed.
    """
    vif_scores = pd.Series(
        [vif(data.values, position) for position in range(data.shape[1])],
        index=data.columns,
    )
    if not (vif_scores.max() > 5):
        print('no multicollinearity')
        return data
    # First column that attains the maximum VIF
    worst_column = vif_scores[vif_scores == vif_scores.max()].index[0]
    data.drop(columns=[worst_column], inplace=True)
    print(worst_column, 'has been removed from "X_copy"')
    return data

In [None]:
X_copy = X.copy()
# Prune until a pass removes nothing. A fixed `range(5)` stops after five removals even
# when columns with VIF > 5 remain. MC_rem drops at most one column per call, so an
# unchanged column count means it found no VIF above the threshold.
while X_copy.shape[1] > 1:
    columns_before = X_copy.shape[1]
    X_copy = MC_rem(X_copy)
    if X_copy.shape[1] == columns_before:
        break
X = X_copy
# VIFs after removing collinearity
VIF = pd.Series([vif(X_copy.values, i) for i in range(X_copy.shape[1])], index=X_copy.columns)
VIF

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed so the partition is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=101,
)

tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

In [None]:
from sklearn.linear_model import LinearRegression
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. The features were already standardised with StandardScaler
# when X was built, so it is simply dropped here.
lr = LinearRegression()
lr.fit(X_train, Y_train)

In [None]:
# Predict the held-out prices and display the model's score on the test split
predictions = lr.predict(X_test)
lr.score(X=X_test, y=Y_test)

In [None]:
# Residuals are computed as prediction minus actual value
residuals = predictions - Y_test
residual_table = pd.DataFrame({'residual': residuals,
                               'prediction': predictions})
residual_table = residual_table.sort_values(by='prediction')
# z/k hold the horizontal reference line residual = 0 for the plot. The original built one
# list element per integer up to the largest prediction; a straight line only needs its
# two end points, which cover the same span 0 .. int(max) - 1.
max_prediction = int(residual_table['prediction'].max())
z = [0, max_prediction - 1] if max_prediction > 0 else []
k = [0] * len(z)

In [None]:
# Residuals against predicted values
plt.figure(dpi=130, figsize=(17, 7))
plt.scatter(residual_table['prediction'], residual_table['residual'], color='red', s=25)
# z/k describe the horizontal line residual = 0 (k is all zeros); it is a reference line,
# not a fitted regression line, so the legend label says so.
plt.plot(z, k, color='green', linewidth=3, label='Zero residual line')
plt.ylim(-800000, 800000)
plt.xlabel('Fitted points(ordered by predictions)')
plt.ylabel('Residuals')
plt.title('residual plot')
plt.legend()
plt.show()

In [None]:
# Histogram of the residuals (200 bins)
plt.figure(figsize=(10, 7), dpi=100)
plt.hist(x=residual_table['residual'], bins=200, color='red')
plt.title('Distribution of residuals')
plt.xlabel('Residuals')
plt.ylabel('frequency')
plt.show()

In [None]:
# One row per feature with its fitted coefficient, sorted from lowest to highest
coefftab = pd.DataFrame(
    {'column': X_train.columns, 'coeff': lr.coef_}
).sort_values(by='coeff')

In [None]:
# Horizontal bar chart of the coefficients, one bar per feature
x = coefftab['column']
y = coefftab['coeff']
plt.figure(dpi=120, figsize=(8, 6))
plt.barh(y=x, width=y)
plt.title('Normalised Coefficient Plot')
plt.xlabel('Coefficients')
plt.ylabel('variables')
plt.show()