# Data Science - Regression Project: Home Price Prediction in Banglore

Dataset from Kaggle : https://www.kaggle.com/amitabhajoy/bengaluru-house-price-data

In [15]:
import pandas as pd                               # Import the pandas library under the alias pd for data manipulation and analysis
import numpy as np                                # Import the NumPy library under the alias np for numerical computing and array operations
from matplotlib import pyplot as plt             # Import the pyplot module from the Matplotlib library for plotting, using the alias plt
%matplotlib inline                               # Enable inline plotting in Jupyter Notebooks, meaning plots will be displayed directly below the code cell
import matplotlib                                # Import the matplotlib library
matplotlib.rcParams["figure.figsize"] = (20,10)  # Set the default figure size for Matplotlib plots to (20, 10) inches

UsageError: unrecognized arguments: # Enable inline plotting in Jupyter Notebooks, meaning plots will be displayed directly below the code cell


### <span style="color:red;">Data Load: Loading banglore home prices dataset into a dataframe</span>

In [16]:
df1 = pd.read_csv("D:\Real_Estate_Prediction\Bengaluru_House_Data.csv")  # Read the CSV file "Bengaluru_House_Data.csv" into a DataFrame named df1
df1.head()                                                               # Display the first few rows of the DataFrame df1

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [17]:
df1.shape  # Retrieve the dimensions (number of rows and columns) of the DataFrame df1

(13320, 9)

In [18]:
df1.columns  # Retrieve the column labels of the DataFrame df1

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

#### Drop features that are not required to build our model

In [19]:
df2 = df1.drop(['area_type','society','balcony','availability'],axis='columns')  # Drop the specified columns ('area_type','society','balcony','availability') from the DataFrame df1 and assign the result to df2
df2.head()                                                                       # Display the first few rows of the DataFrame df2

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [20]:
df2.shape

(13320, 5)

### <span style="color:red;">Data Cleaning: Handle NA values</span>

In [21]:
df2.isnull().sum()  # Calculate the number of missing values (NaN) for each column in the DataFrame df2

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [22]:
df3 = df2.dropna()  # Drop rows containing any missing values (NaN) from the DataFrame df2 and assign the result to df3
df3.isnull().sum()  # Calculate the number of missing values (NaN) for each column in the DataFrame df3

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [23]:
df3.shape  # Retrieve the dimensions (number of rows and columns) of the DataFrame df3

(13246, 5)

### <span style="color:red;">Feature Engineering</span>

#### Adding a new feature(integer) for bhk (Bedrooms Hall Kitchen) extracted from size

In [24]:
df3['size'].unique()  # Retrieve the unique values in the 'size' column of the DataFrame df3

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [25]:
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))  
# Create a new column 'bhk' in the DataFrame df3, extracting the number of bedrooms (BHK) from the 'size' column by splitting the string at the first space and converting the result to an integer

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))


In [26]:
df3.head()     # New Column bhk created

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [27]:
df3.bhk.unique()         # Retrieve the unique values in the 'bhk' column of the DataFrame df3

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [28]:
df3[df3.bhk > 20]  # Retrieve rows from the DataFrame df3 where the value in the 'bhk' column is greater than 20

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [29]:
df3.total_sqft.unique()  # Retrieve the unique values in the 'total_sqft' column of the DataFrame df3

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

Gives a range 1133-1384. So average is taken

In [30]:
def is_float(x):  # Define a function named is_float that takes a single argument x
    try:  # Begin a try block to attempt the following code
        float(x)  # Try to convert the input x to a floating-point number
    except:  # If an exception (error) occurs during the try block, execute the following code
        return False  # Return False if an exception occurs, indicating that the input x cannot be converted to a float
    return True  # If no exception occurs, return True, indicating that the input x can be converted to a float

In [31]:
df3[~df3['total_sqft'].apply(is_float)].head(8)  
# Filter the DataFrame df3 to retrieve rows where the values in the 'total_sqft' column cannot be converted to floats
# The .apply(is_float) method applies the is_float function to each value in the 'total_sqft' column, returning True if the value can be converted to a float and False otherwise
# The ~ operator negates the boolean mask, so ~df3['total_sqft'].apply(is_float) returns True for rows where the 'total_sqft' column contains values that cannot be converted to floats
# Finally, .head(10) is used to display the first 10 rows of the filtered DataFrame

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9


Above shows that total_sqft can be a range (e.g. 2100-2850). For such case we can just take average of min and max value in the range. There are other cases such as 34.46Sq. Meter which one can convert to square ft using unit conversion. 

In [32]:
def convert_sqft_to_num(x):
    tokens = x.split('-')  # Split the string at '-' to handle ranges
    if len(tokens) == 2:   # If the string contains a range
        return (float(tokens[0]) + float(tokens[1])) / 2  # Return the average of the min and max values
    try:
        return float(x)  # Try to convert the string to a float
    except:
        return None  # Return None if the conversion fails

In [33]:
convert_sqft_to_num('2110')  # Example usage of the convert_sqft_to_num function with a string input '2110'

2110.0

In [34]:
convert_sqft_to_num('2100 - 2850')  # Example usage of the convert_sqft_to_num function with a string input '2100 - 2850'

2475.0

In [41]:
convert_sqft_to_num('34.46Sq. Meter')  # Example usage of the convert_sqft_to_num function with a string input '34.46Sq. Meter'

In [37]:
df4 = df3.copy()  # Create a copy of the DataFrame df3 and assign it to df4
df4.total_sqft = df4.total_sqft.apply(convert_sqft_to_num)  # Apply the convert_sqft_to_num function to the 'total_sqft' column of df4 to convert values to numerical format
df4 = df4[df4.total_sqft.notnull()]  # Filter the DataFrame df4 to remove rows where 'total_sqft' values are null (NaN)
df4.head(3)  # Display the first 2 rows of the DataFrame df4

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3


In [38]:
df4.loc[30]  # Retrieve the row with index label 30 from the DataFrame df4

location      Yelahanka
size              4 BHK
total_sqft       2475.0
bath                4.0
price             186.0
bhk                   4
Name: 30, dtype: object

### <span style="color:red;">Feature Engineering</span>

In [43]:
df5 = df4.copy()  # Create a copy of the DataFrame df4 and assign it to df5
df5['price_per_sqft'] = df5['price'] * 100000 / df5['total_sqft']  # Calculate the price per square foot by dividing the price by the total square footage and multiplying by 100,000
df5.head()  # Display the first few rows of the DataFrame df5

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [47]:
df5.location.unique()  # Retrieve the unique values in the 'location' column of the DataFrame df5

array(['Electronic City Phase II', 'Chikka Tirupathi', 'Uttarahalli', ...,
       '12th cross srinivas nagar banshankari 3rd stage',
       'Havanur extension', 'Abshot Layout'], dtype=object)

In [49]:
len(df5.location.unique())  # Calculate the number of unique values in the 'location' column of the DataFrame df5

1298

##### Curse of Dimensionality

In [56]:
df5.location = df5.location.apply(lambda x: x.strip())  # Remove leading and trailing whitespace from each value in the 'location' column
location_stats = df5.groupby('location')['location'].agg('count').sort_values(ascending=False)  # Count the frequency of each unique value in the 'location' column, sorting in descending order
location_stats

location
Whitefield               533
Sarjapur  Road           392
Electronic City          304
Kanakpura Road           264
Thanisandra              235
                        ... 
1 Giri Nagar               1
Kanakapura Road,           1
Kanakapura main  Road      1
Kannur                     1
whitefiled                 1
Name: location, Length: 1287, dtype: int64

In [61]:
location_stats.values.sum()  # Calculate the total count of properties across all locations

13200

In [62]:
len(location_stats[location_stats > 10])  # Calculate the number of locations with more than 10 properties

240

In [63]:
len(location_stats)  # Calculate the total number of unique locations

1287

In [64]:
len(location_stats[location_stats <= 10])  # Calculate the number of locations with 10 or fewer properties

1047

### <span style="color:red;">Dimensionality Reduction</span>

Any location having less than 10 data points should be tagged as "other" location. This way number of categories can be reduced by huge amount. Later on when we do one hot encoding, it will help us with having fewer dummy columns

In [65]:
location_stats_less_than_10 = location_stats[location_stats <= 10]  # Filter locations with 10 or fewer properties
location_stats_less_than_10  # Display the filtered location statistics

location
Sadashiva Nagar          10
Naganathapura            10
Basapura                 10
Nagadevanahalli          10
Kalkere                  10
                         ..
1 Giri Nagar              1
Kanakapura Road,          1
Kanakapura main  Road     1
Kannur                    1
whitefiled                1
Name: location, Length: 1047, dtype: int64

In [66]:
len(df5.location.unique())  # Calculate the total number of unique locations in the DataFrame df5

1287

In [67]:
df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)  
# Replace locations with 10 or fewer properties with 'other' in the 'location' column of DataFrame df5
len(df5.location.unique())  
# Calculate the total number of unique locations in the 'location' column of DataFrame df5 after transformation

241

In [68]:
df5.head(10) #locations converted to other

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0
5,Whitefield,2 BHK,1170.0,2.0,38.0,2,3247.863248
6,Old Airport Road,4 BHK,2732.0,4.0,204.0,4,7467.057101
7,Rajaji Nagar,4 BHK,3300.0,4.0,600.0,4,18181.818182
8,Marathahalli,3 BHK,1310.0,3.0,63.25,3,4828.244275
9,other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804


Current working directory: C:\Users\pugal
