In [None]:
import pandas as pd                                        #For reading dataframes
import numpy as np                                         #For linear regression
import seaborn as sns                                      #For regression plot and heatmap
from sklearn.model_selection import train_test_split       
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

# EXPLORATORY DATA ANALYSIS

In [None]:
# Load the MagicBricks Delhi listings from the Kaggle input directory.
csv_path = '../input/delhi-house-price-prediction/MagicBricks.csv'
df = pd.read_csv(csv_path)
# Preview the first five rows for an initial look at the dataset.
df.head(5)

In [None]:
print(df.dtypes)                                         # Inspect the data type of each column
# Observation from the output: Bathroom and Parking are stored as floats.
# Bathroom is cast to int in a later cell; Parking is eventually dropped.

In [None]:
df.shape                                                # (rows, columns) of the frame loaded from the CSV

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Visualise where the missing values sit: one cell per (row, column), highlighted when null.
sns.heatmap(df.isna())
# Per_Sqft has many missing values; the affected rows are dropped in a later cell.
# Bathroom has only a couple of missing values, which are replaced with the column mode.

In [None]:
# Most frequent value(s) of every column; used to choose the fill values for missing entries.
df.mode()

In [None]:
# Frequency of each property Type, most common first.
type_counts = df['Type'].value_counts()
type_counts.to_frame()
# Builder_Floor occurs most often, so missing Type values are replaced with Builder_Floor.

In [None]:
# Frequency of each Furnishing level, most common first.
furnishing_counts = df['Furnishing'].value_counts()
furnishing_counts.to_frame()
# Semi-Furnished is the most frequent level.

In [None]:
# Pairwise Pearson correlations between the numeric columns, annotated with their values.
# The numeric columns are selected explicitly: the frame still holds text columns
# (e.g. Type, Furnishing), and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently skipping it as older versions did.
numeric_columns = df.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True)

In [None]:
# Scatter of Price against Parking with a fitted regression line.
sns.regplot(data=df, x="Parking", y="Price")
# Parking does not look like a good predictor of Price: the points lie far from the fitted line.

In [None]:
# Replace the missing values of Bathroom, Parking, Type and Furnishing with their modes
# (the values found in the exploration cells above).
#
# One DataFrame-level fillna is used instead of `df[col].fillna(..., inplace=True)`:
# the chained in-place form mutates an intermediate Series, which emits a
# FutureWarning in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
mode_fill_values = {
    'Bathroom': 2.0,
    'Parking': 1.0,
    'Type': "Builder_Floor",
    'Furnishing': "Semi-Furnished",
}
df = df.fillna(value=mode_fill_values)

# Fail fast if any of the imputed columns still contains a null.
assert not df[list(mode_fill_values)].isna().any().any(), "imputed columns still contain nulls"

# Visual check: the four imputed columns are now complete.
# Per_Sqft is not imputed here; its missing rows are dropped in a later cell.
sns.heatmap(df.isnull())

In [None]:
# Pearson correlation (and its p-value) between Parking and Price.
pearson_coef, p_value = stats.pearsonr(
    x=df['Parking'],
    y=df['Price'],
)
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
# Interpretation: there is no linear correlation, and with a p-value above 0.1
# there is no evidence that the correlation is significant.

In [None]:
# Parking showed no useful relationship with Price, so the column is removed.
# NOTE: in-place drop - re-running this cell without re-running the cells above raises KeyError.
df.drop(columns=['Parking'], inplace=True)
df.head(5)

In [None]:
# Remove the rows (not the column) in which Per_Sqft is missing.
df.dropna(subset=['Per_Sqft'], inplace=True)
df.head(5)

In [None]:
# (rows, columns) after dropping the Parking column and the rows with missing Per_Sqft.
df.shape

In [None]:
# Scatter of Price against Per_Sqft with a fitted regression line.
sns.regplot(data=df, x="Per_Sqft", y="Price")

In [None]:
# Pearson correlation (and its p-value) between Per_Sqft and Price.
pearson_coef, p_value = stats.pearsonr(
    x=df['Per_Sqft'],
    y=df['Price'],
)
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
# Interpretation: the p-value is below 0.01, so the correlation between Price and
# Per_Sqft is significant. Dropping the rows with missing Per_Sqft is therefore
# preferred over filling them with the mean.

In [None]:
# Bathroom counts are whole numbers; cast the column from float to integer
# now that its missing values have been filled.
df['Bathroom'] = df['Bathroom'].astype("int")
print(df.dtypes)

In [None]:
df.head()                       # Bathroom now shows integer values (Parking was dropped in an earlier cell)

In [None]:
# Final visual check for missing values after cleaning.
sns.heatmap(df.isna())
# The data is now clean: every missing value has been either removed or replaced.

# NUMERICAL FEATURES

In [None]:
# Distribution of Area in 10 equal-width bins.
df.Area.hist(bins=10)
# Most listings have an Area between 0 and 5000.

In [None]:
# Scatter of Price against Area with a fitted regression line.
# The predictor goes on x and the target (Price) on y, matching the Parking and
# Per_Sqft plots; the earlier version had the two axes swapped, which fits
# Area as a function of Price instead of the other way round.
sns.regplot(data=df, x="Area", y="Price")
# Original observation: Area is a good predictor because the points closely follow
# the fitted line. Re-check this reading against the corrected orientation.

In [None]:
# Pearson correlation (and its p-value) between Area and Price.
pearson_coef, p_value = stats.pearsonr(
    x=df['Area'],
    y=df['Price'],
)
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
# Interpretation: the p-value is very small, so Area is statistically significant,
# although the linear correlation isn't extremely strong.

In [None]:
# The five listings with the largest Area, to inspect potential outliers.
df.sort_values('Area', ascending=False).head(5)
# Judging by the regression plot, the listing with Area 14220.0 looks like an outlier,
# so it is removed in the next cell for better accuracy.

In [None]:
# Keep only the listings whose Area is strictly below the 14220 outlier value.
area_below_outlier = df['Area'] < 14220
df = df[area_below_outlier]
df.shape

In [None]:
# Distribution of BHK in 10 equal-width bins.
df.BHK.hist(bins=10)

In [None]:
# Scatter of Price against BHK with a fitted regression line.
# The predictor goes on x and the target (Price) on y, matching the Parking and
# Per_Sqft plots; the earlier version had the two axes swapped.
sns.regplot(data=df, x="BHK", y="Price")

In [None]:
# Pearson correlation (and its p-value) between BHK and Price.
pearson_coef, p_value = stats.pearsonr(
    x=df['BHK'],
    y=df['Price'],
)
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
# Interpretation: BHK is statistically significant for predicting Price.

In [None]:
# Distribution of Bathroom counts in 10 equal-width bins.
df.Bathroom.hist(bins=10)

In [None]:
# Scatter of Price against Bathroom with a fitted regression line.
# The predictor goes on x and the target (Price) on y, matching the Parking and
# Per_Sqft plots; the earlier version had the two axes swapped.
sns.regplot(data=df, x="Bathroom", y="Price")

In [None]:
# Pearson correlation (and its p-value) between Bathroom and Price.
pearson_coef, p_value = stats.pearsonr(
    x=df['Bathroom'],
    y=df['Price'],
)
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
# Interpretation: Bathroom is statistically significant for predicting Price,
# given its low p-value and high correlation coefficient.

# DISCRETE FEATURES

In [None]:
# Frequency of each Status value, most common first.
status_counts = df['Status'].value_counts()
status_counts.to_frame()

In [None]:
# Count the distinct Locality values: the first element of the shape is the
# number of unique localities.
locality_counts = df['Locality'].value_counts().to_frame()
locality_counts.shape
# Locality is removed because it has too many distinct values; one-hot encoding it
# would add one column per locality.

In [None]:
# Remove the high-cardinality Locality column before encoding.
# NOTE: in-place drop - re-running this cell without re-running the cells above raises KeyError.
df.drop(columns=['Locality'], inplace=True)
df.head(5)

In [None]:
# Frequency of each Transaction value, most common first.
transaction_counts = df['Transaction'].value_counts()
transaction_counts.to_frame()

In [None]:
# One-hot encode the remaining categorical (text) columns; numeric columns pass through unchanged.
df = pd.get_dummies(data=df)
df.head(5)

# MODEL DEVELOPMENT

In [None]:
# Create a Linear Regression estimator and display its repr.
# NOTE(review): `lm` is not referenced by any later cell visible here - the model that is
# actually fitted and scored below is a separate `linear` object. Confirm whether `lm` is still needed.
lm = LinearRegression()
lm

In [None]:
# Predictor columns: the numeric features plus the one-hot encoded categoricals.
# NOTE(review): Per_Sqft is presumably derived from Price and Area, which would leak
# the target into the features - confirm against the dataset description.
# NOTE(review): several dummy columns of the same category are listed together; if
# these are all the levels, the groups are perfectly collinear - confirm.
feature_columns = [
    'Area', 'BHK', 'Bathroom', 'Per_Sqft',
    'Furnishing_Furnished', 'Furnishing_Semi-Furnished', 'Furnishing_Unfurnished',
    'Status_Almost_ready', 'Status_Ready_to_move',
    'Transaction_New_Property', 'Transaction_Resale',
    'Type_Apartment', 'Type_Builder_Floor',
]
x = df[feature_columns]
# Target column.
y = df['Price']

In [None]:
# Hold out 30% of the rows as a test set; random_state pins the shuffle so the split is reproducible.
# train_test_split is already imported in the notebook's first cell, so the
# duplicate mid-notebook import has been removed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

In [None]:
# Fit an ordinary least-squares model on the training split.
linear = LinearRegression()
fitted_model = linear.fit(x_train, y_train)   # fit() returns the estimator itself
print(fitted_model)

# R^2 on the training data, then on the held-out test data.
train_score = linear.score(x_train, y_train)
test_score = linear.score(x_test, y_test)
print(train_score)
print(test_score)