## 1. Importing Libraries

In [1]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

pd.set_option('display.max_columns',None)

## 2. Reading the Data

In [3]:
# Root folder of the project. The path below is specific to one machine, so it
# can be overridden by setting the HOUSE_PRICE_PROJECT_DIR environment variable
# before starting the kernel; the original location is kept as the fallback.
PROJECT_DIR = Path(
    os.environ.get(
        "HOUSE_PRICE_PROJECT_DIR",
        r"F:\Rishabh\House-Price-Prediction-MLOps-Project",
    )
)
# Name of the data folder, resolved relative to PROJECT_DIR.
DATA_DIR = "data"

In [7]:
def read_data(name):
    """Load ``<name>.csv`` from the project's data folder as a DataFrame.

    The file is looked up at ``PROJECT_DIR / DATA_DIR / "<name>.csv"``.
    """
    csv_path = PROJECT_DIR / DATA_DIR / f"{name}.csv"
    return pd.read_csv(csv_path)

In [9]:
houses = read_data("houses")

In [11]:
houses

Unnamed: 0,Index,Title,Description,Amount(in rupees),Price (in rupees),location,Carpet Area,Status,Floor,Transaction,Furnishing,facing,overlooking,Society,Bathroom,Balcony,Car Parking,Ownership,Super Area,Dimensions,Plot Area
0,0,1 BHK Ready to Occupy Flat for sale in Srushti...,"Bhiwandi, Thane has an attractive 1 BHK Flat f...",42 Lac,6000.0,thane,500 sqft,Ready to Move,10 out of 11,Resale,Unfurnished,,,Srushti Siddhi Mangal Murti Complex,1,2,,,,,
1,1,2 BHK Ready to Occupy Flat for sale in Dosti V...,One can find this stunning 2 BHK flat for sale...,98 Lac,13799.0,thane,473 sqft,Ready to Move,3 out of 22,Resale,Semi-Furnished,East,Garden/Park,Dosti Vihar,2,,1 Open,Freehold,,,
2,2,2 BHK Ready to Occupy Flat for sale in Sunrise...,Up for immediate sale is a 2 BHK apartment in ...,1.40 Cr,17500.0,thane,779 sqft,Ready to Move,10 out of 29,Resale,Unfurnished,East,Garden/Park,Sunrise by Kalpataru,2,,1 Covered,Freehold,,,
3,3,1 BHK Ready to Occupy Flat for sale Kasheli,This beautiful 1 BHK Flat is available for sal...,25 Lac,,thane,530 sqft,Ready to Move,1 out of 3,Resale,Unfurnished,,,,1,1,,,,,
4,4,2 BHK Ready to Occupy Flat for sale in TenX Ha...,"This lovely 2 BHK Flat in Pokhran Road, Thane ...",1.60 Cr,18824.0,thane,635 sqft,Ready to Move,20 out of 42,Resale,Unfurnished,West,"Garden/Park, Main Road",TenX Habitat Raymond Realty,2,,1 Covered,Co-operative Society,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187526,187526,3 BHK Ready to Occupy Flat for sale in Bollywo...,This magnificent 3 BHK Flat is available for s...,63 Lac,3225.0,zirakpur,,Ready to Move,2 out of 4,New Property,Semi-Furnished,East,Garden/Park,Bollywood Esencia,3,3,1 Covered,Freehold,1953 sqft,,
187527,187527,3 BHK Ready to Occupy Flat for sale in Sushma ...,Have a look at this immaculate 3 BHK flat for ...,55 Lac,3274.0,zirakpur,,Ready to Move,4 out of 6,Resale,Unfurnished,North - East,"Garden/Park, Main Road",Sushma Urban Views,3,,1 Covered,,1680 sqft,,
187528,187528,3 BHK Ready to Occupy Flat for sale in Bollywo...,"Gazipur, Zirakpur has an appealing 3 BHK flat ...",76 Lac,4343.0,zirakpur,1250 sqft,Ready to Move,1 out of 3,Resale,Furnished,East,"Garden/Park, Main Road",Bollywood Esencia,3,2,"1 Covered,",Freehold,,,
187529,187529,2 BHK Ready to Occupy Flat for sale in Friends...,Up for immediate sale is a 2 BHK apartment in ...,30 Lac,4231.0,zirakpur,,Ready to Move,2 out of 2,Resale,Semi-Furnished,,Main Road,Friends Enclave,2,,,,709 sqft,,


## 3. Preliminary Analysis

In [13]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187531 entries, 0 to 187530
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Index              187531 non-null  int64  
 1   Title              187531 non-null  object 
 2   Description        184508 non-null  object 
 3   Amount(in rupees)  187531 non-null  object 
 4   Price (in rupees)  169866 non-null  float64
 5   location           187531 non-null  object 
 6   Carpet Area        106858 non-null  object 
 7   Status             186916 non-null  object 
 8   Floor              180454 non-null  object 
 9   Transaction        187448 non-null  object 
 10  Furnishing         184634 non-null  object 
 11  facing             117298 non-null  object 
 12  overlooking        106095 non-null  object 
 13  Society            77853 non-null   object 
 14  Bathroom           186703 non-null  object 
 15  Balcony            138596 non-null  object 
 16  Ca

### 3.1 Checking for data types

In [15]:
houses.dtypes

Index                  int64
Title                 object
Description           object
Amount(in rupees)     object
Price (in rupees)    float64
location              object
Carpet Area           object
Status                object
Floor                 object
Transaction           object
Furnishing            object
facing                object
overlooking           object
Society               object
Bathroom              object
Balcony               object
Car Parking           object
Ownership             object
Super Area            object
Dimensions           float64
Plot Area            float64
dtype: object

**Observations**  
- data types of some columns are not appropriate
- there are missing values in many columns

### 3.2 Checking duplicates

In [17]:
(
    houses
    .duplicated()
    .sum()
)

0

- there are no duplicate rows in this dataset

## 4. Detailed Analysis

### 4.1 `Title`

In [19]:
houses.Title

0         1 BHK Ready to Occupy Flat for sale in Srushti...
1         2 BHK Ready to Occupy Flat for sale in Dosti V...
2         2 BHK Ready to Occupy Flat for sale in Sunrise...
3               1 BHK Ready to Occupy Flat for sale Kasheli
4         2 BHK Ready to Occupy Flat for sale in TenX Ha...
                                ...                        
187526    3 BHK Ready to Occupy Flat for sale in Bollywo...
187527    3 BHK Ready to Occupy Flat for sale in Sushma ...
187528    3 BHK Ready to Occupy Flat for sale in Bollywo...
187529    2 BHK Ready to Occupy Flat for sale in Friends...
187530    3 BHK Ready to Occupy Flat for sale in Affinit...
Name: Title, Length: 187531, dtype: object

In [21]:
(
    houses
    .Title
    .isnull()
    .sum()
)

0

In [23]:
(
    houses
    .Title
    .loc[lambda ser: ~ser.str.contains("BHK")]
)

115                 Studio Apartment for sale Dombivli East
149                  Studio Apartment for sale Vartak Nagar
150        Studio Apartment for sale in Lok Nagari Phase...
278                      Studio Apartment for sale Asangaon
293                          Apartment for sale Kisan Nagar
                                ...                        
186480     Studio Apartment for sale in Royale High Stre...
187052                    Studio Apartment for sale Gazipur
187182     Studio Apartment for sale in Royale High Stre...
187377                    Studio Apartment for sale Dhakoli
187466                   Studio Apartment for sale VIP Road
Name: Title, Length: 921, dtype: object

In [12]:
(
    houses
    .Title
    .loc[lambda ser: ~ser.str.contains("Flat")]
)

115                 Studio Apartment for sale Dombivli East
149                  Studio Apartment for sale Vartak Nagar
150        Studio Apartment for sale in Lok Nagari Phase...
278                      Studio Apartment for sale Asangaon
293                          Apartment for sale Kisan Nagar
                                ...                        
186480     Studio Apartment for sale in Royale High Stre...
187052                    Studio Apartment for sale Gazipur
187182     Studio Apartment for sale in Royale High Stre...
187377                    Studio Apartment for sale Dhakoli
187466                   Studio Apartment for sale VIP Road
Name: Title, Length: 921, dtype: object

### 4.2 `Description`

In [13]:
houses.Description

0         Bhiwandi, Thane has an attractive 1 BHK Flat f...
1         One can find this stunning 2 BHK flat for sale...
2         Up for immediate sale is a 2 BHK apartment in ...
3         This beautiful 1 BHK Flat is available for sal...
4         This lovely 2 BHK Flat in Pokhran Road, Thane ...
                                ...                        
187526    This magnificent 3 BHK Flat is available for s...
187527    Have a look at this immaculate 3 BHK flat for ...
187528    Gazipur, Zirakpur has an appealing 3 BHK flat ...
187529    Up for immediate sale is a 2 BHK apartment in ...
187530    This exquisite 3 BHK Flat is offered for sale ...
Name: Description, Length: 187531, dtype: object

- this column should be dropped

### 4.3 `Amount(in rupees)`

In [14]:
houses["Amount(in rupees)"] 

0          42 Lac 
1          98 Lac 
2         1.40 Cr 
3          25 Lac 
4         1.60 Cr 
            ...   
187526     63 Lac 
187527     55 Lac 
187528     76 Lac 
187529     30 Lac 
187530    1.18 Cr 
Name: Amount(in rupees), Length: 187531, dtype: object

In [15]:
# Amounts that mention neither "Lac" nor "Cr"
(
    houses["Amount(in rupees)"]
    .loc[lambda amount: ~amount.str.contains("Lac|Cr")]
)

24        Call for Price
37        Call for Price
83        Call for Price
131       Call for Price
177       Call for Price
               ...      
187404    Call for Price
187442    Call for Price
187503    Call for Price
187506    Call for Price
187513    Call for Price
Name: Amount(in rupees), Length: 9684, dtype: object

- There are 9684 rows with no listed price (`Call for Price`), so these rows have to be dropped

### 4.4 `Carpet Area`

In [16]:
houses["Carpet Area"]

0          500 sqft
1          473 sqft
2          779 sqft
3          530 sqft
4          635 sqft
            ...    
187526          NaN
187527          NaN
187528    1250 sqft
187529          NaN
187530          NaN
Name: Carpet Area, Length: 187531, dtype: object

In [17]:
# Non-null carpet areas whose unit is none of sqft / sqm / sqyrd
(
    houses["Carpet Area"]
    .dropna()
    .loc[lambda area: ~area.str.contains("sqft|sqm|sqyrd")]
)

93644      1500 acre
113787        2 acre
147912      3 ground
152383        3 cent
162057    1607 bigha
173555      14 marla
173633       1 kanal
173685      14 marla
173757       1 kanal
173796      10 marla
Name: Carpet Area, dtype: object

- there are so many different units in this column
- all the units should be the same

### 4.5 `Status`

In [18]:
houses.Status

0         Ready to Move
1         Ready to Move
2         Ready to Move
3         Ready to Move
4         Ready to Move
              ...      
187526    Ready to Move
187527    Ready to Move
187528    Ready to Move
187529    Ready to Move
187530    Ready to Move
Name: Status, Length: 187531, dtype: object

In [19]:
(
    houses
    .Status
    .value_counts()
)

Status
Ready to Move    186916
Name: count, dtype: int64

- this column has only one category (`Ready to Move`), so it carries no information and has to be dropped

### 4.6 `Floor`

In [20]:
houses.Floor

0         10 out of 11
1          3 out of 22
2         10 out of 29
3           1 out of 3
4         20 out of 42
              ...     
187526      2 out of 4
187527      4 out of 6
187528      1 out of 3
187529      2 out of 2
187530     5 out of 13
Name: Floor, Length: 187531, dtype: object

In [21]:
# Non-null floor values that are not in the "<floor> out of <total>" format
(
    houses["Floor"]
    .dropna()
    .loc[lambda floor: ~floor.str.contains("out of")]
)

1295                   2
4584                   2
4890              Ground
16292                  4
40127                  1
40225                  4
41375                  3
51034             Ground
71650                  4
85054                  3
91040             Ground
92171                  2
115431                 2
118002                 2
144943                 3
148741                 6
150784                 2
151091                 6
152995                 2
155281                13
156322                 3
159037                 2
159038                 1
159039                 2
159040                 2
159041                 1
159042                 2
159047                 1
159048                 1
159049                 2
160226                 3
160888                12
161281                 2
162016                20
166332                 5
166428            Ground
168325                 3
169890                 1
173153                 4
174757                 3


- there are a lot of mixed values in this column

### 4.7 `Transaction`

In [22]:
houses.Transaction

0               Resale
1               Resale
2               Resale
3               Resale
4               Resale
              ...     
187526    New Property
187527          Resale
187528          Resale
187529          Resale
187530          Resale
Name: Transaction, Length: 187531, dtype: object

In [23]:
(
    houses
    .Transaction
    .value_counts()
)

Transaction
Resale          144172
New Property     42565
Other              709
Rent/Lease           2
Name: count, dtype: int64

### 4.8 `Furnishing`

In [24]:
houses.Furnishing

0            Unfurnished
1         Semi-Furnished
2            Unfurnished
3            Unfurnished
4            Unfurnished
               ...      
187526    Semi-Furnished
187527       Unfurnished
187528         Furnished
187529    Semi-Furnished
187530    Semi-Furnished
Name: Furnishing, Length: 187531, dtype: object

In [25]:
(
    houses
    .Furnishing
    .value_counts()
)

Furnishing
Semi-Furnished    88318
Unfurnished       76154
Furnished         20162
Name: count, dtype: int64

### 4.9 `facing`

In [26]:
houses.facing

0                  NaN
1                 East
2                 East
3                  NaN
4                 West
              ...     
187526            East
187527    North - East
187528            East
187529             NaN
187530    North - East
Name: facing, Length: 187531, dtype: object

In [27]:
(
    houses
    .facing
    .value_counts()
)

facing
East            54741
North - East    24220
North           16533
West             8574
South            4694
North - West     3843
South - East     2622
South -West      2071
Name: count, dtype: int64

### 4.10 `overlooking`

In [28]:
houses.overlooking

0                            NaN
1                    Garden/Park
2                    Garden/Park
3                            NaN
4         Garden/Park, Main Road
                   ...          
187526               Garden/Park
187527    Garden/Park, Main Road
187528    Garden/Park, Main Road
187529                 Main Road
187530         Garden/Park, Pool
Name: overlooking, Length: 187531, dtype: object

In [29]:
(
    houses
    .overlooking
    .value_counts()
)

overlooking
Main Road                                      32193
Garden/Park, Main Road                         27238
Garden/Park                                    23077
Garden/Park, Pool, Main Road                   12413
Pool, Garden/Park, Main Road                    3615
Garden/Park, Pool                               2880
Main Road, Garden/Park, Pool                    1359
Pool, Main Road                                 1136
Pool                                            1012
Main Road, Garden/Park                           666
Pool, Garden/Park                                435
Garden/Park, Main Road, Pool                      39
Main Road, Pool                                   11
Main Road, Pool, Garden/Park                       8
Pool, Main Road, Garden/Park                       6
Main Road, Not Available                           4
Garden/Park, Pool, Main Road, Not Available        1
Garden/Park, Not Available                         1
Pool, Main Road, Not Available    

### 4.11 `Society`

In [30]:
houses.Society

0         Srushti Siddhi Mangal Murti Complex
1                                 Dosti Vihar
2                        Sunrise by Kalpataru
3                                         NaN
4                 TenX Habitat Raymond Realty
                         ...                 
187526                      Bollywood Esencia
187527                     Sushma Urban Views
187528                      Bollywood Esencia
187529                        Friends Enclave
187530                        Affinity Greens
Name: Society, Length: 187531, dtype: object

In [31]:
(
    houses
    .Society
    .value_counts()
)

Society
Hamdam Apartment           1648
Malibu Town                1158
Shree Vardhman Victoria    1154
DLF Skycourt               1153
Nebula Tower                982
                           ... 
Kumar Princeville             1
Nyati Equatorial              1
Sentosa Serene                1
Shubhankar Durvaa             1
Dreams Estate                 1
Name: count, Length: 10376, dtype: int64

- this column should be dropped

### 4.12 `Bathroom`

In [32]:
houses.Bathroom

0         1
1         2
2         2
3         1
4         2
         ..
187526    3
187527    3
187528    3
187529    2
187530    4
Name: Bathroom, Length: 187531, dtype: object

In [33]:
(
    houses
    .Bathroom
    .value_counts()
)

Bathroom
2       93007
3       55781
1       18654
4       15600
5        3343
6         209
7          35
> 10       35
10         14
8          14
9          11
Name: count, dtype: int64

### 4.13 `Balcony`

In [34]:
houses.Balcony

0           2
1         NaN
2         NaN
3           1
4         NaN
         ... 
187526      3
187527    NaN
187528      2
187529    NaN
187530      4
Name: Balcony, Length: 187531, dtype: object

In [35]:
(
    houses
    .Balcony
    .value_counts()
)

Balcony
2       51809
1       49219
3       27111
4        9420
5         841
6         132
> 10       22
7          14
10         13
8          13
9           2
Name: count, dtype: int64

### 4.14 `Car Parking`

In [36]:
houses["Car Parking"]

0                NaN
1             1 Open
2          1 Covered
3                NaN
4          1 Covered
             ...    
187526     1 Covered
187527     1 Covered
187528    1 Covered,
187529           NaN
187530     1 Covered
Name: Car Parking, Length: 187531, dtype: object

In [37]:
(
    houses["Car Parking"]
    .value_counts()
)

Car Parking
1 Covered      38754
1 Covered,     16991
2 Covered      10691
1 Open          7873
2 Covered,      3978
               ...  
206 Covered        1
205 Covered        1
11 Covered,        1
403 Covered        1
702 Open           1
Name: count, Length: 229, dtype: int64

In [38]:
# Non-null parking values that mention neither "Open" nor "Covered"
(
    houses["Car Parking"]
    .dropna()
    .loc[lambda parking: ~parking.str.contains("Open|Covered")]
)

Series([], Name: Car Parking, dtype: object)

### 4.15 `Ownership`

In [39]:
houses.Ownership

0                          NaN
1                     Freehold
2                     Freehold
3                          NaN
4         Co-operative Society
                  ...         
187526                Freehold
187527                     NaN
187528                Freehold
187529                     NaN
187530                Freehold
Name: Ownership, Length: 187531, dtype: object

In [40]:
(
    houses
    .Ownership
    .value_counts()
)

Ownership
Freehold                112229
Leasehold                 5285
Co-operative Society      3431
Power Of Attorney         1069
Name: count, dtype: int64

### 4.16 `Super Area`

In [41]:
houses["Super Area"]

0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
            ...    
187526    1953 sqft
187527    1680 sqft
187528          NaN
187529     709 sqft
187530    1915 sqft
Name: Super Area, Length: 187531, dtype: object

In [42]:
# Non-null super areas whose unit is none of sqft / sqyrd / sqm
(
    houses["Super Area"]
    .dropna()
    .loc[lambda area: ~area.str.contains("sqft|sqyrd|sqm")]
)

15415         1810 marla
150826           8 marla
150891           4 marla
150926           5 marla
151023           4 kanal
151052           8 marla
151347           8 marla
154245        700 ground
170988           8 marla
171139           5 marla
171914       1970 biswa2
173411      485 aankadam
173752           2 kanal
175044          998 acre
176805        585 ground
177459        1080 kanal
181327       360 hectare
182603        2800 kanal
183765            2 cent
185086    530,040 ground
Name: Super Area, dtype: object

- there are too many units in this column
- all the units should be the same

## 5. Cleaning Operations

In [25]:
def clean_data(data):
    """Clean the raw listings frame and derive the modelling features.

    Steps, in order:
      1. strip whitespace from every object column and normalise column names
         (lower case, spaces -> underscores, anything from "(" onwards removed);
      2. drop columns that are not used and rows priced "Call for Price";
      3. parse amount (in crores), areas (in sqft), bathroom / balcony counts,
         BHK count, floor number / total floors, overlooking flags and parking;
      4. set balcony to 0 where the floor number is 0;
      5. keep only rows inside the area / price / amount ranges and with
         plausible bathroom / balcony counts;
      6. drop the consumed raw columns, duplicate rows and rows missing
         transaction, num_bhk or bathroom.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw listings with the original column names
        (e.g. "Amount(in rupees)", "Carpet Area", "Car Parking").

    Returns
    -------
    pandas.DataFrame
        Cleaned frame; the surviving rows keep their original index labels.
    """

    # Multipliers that convert one unit of each area measure into square feet.
    conversion_factors = {
        "sqft": 1,
        "sqyrd": 9,
        "sqm": 10.7639,
        "marla": 272.25,
        "kanal": 5445,
        "ground": 2400,
        "biswa2": 1350,
        "aankadam": 75,
        "acre": 43560,
        "hectare": 107639,
        "cent": 435.6,
        "bigha": 27225
    }

    # Matches "<value> <unit>". str.extract always yields both columns, so an
    # all-null or empty input no longer trips a column-count mismatch.
    value_unit_pattern = r"^\s*(?P<value>\S+)\s+(?P<unit>\S+)\s*$"

    def convert_to_crores(ser):
        """Turn "<number> Lac" / "<number> Cr" strings into floats in crores."""
        parts = ser.str.extract(value_unit_pattern)
        amount = pd.to_numeric(parts["value"]).astype(float)
        # 1 Lac = 0.01 Cr; any other unit is taken as already being in crores.
        return amount.mask(parts["unit"].eq("Lac"), amount.mul(0.01))

    def remove_area_units_and_standardize(ser):
        """Turn "<number> <unit>" strings into square feet (NaN if unit unknown)."""
        parts = (
            ser
            .str.replace(",", "", regex=False)   # "530,040 ground" -> "530040 ground"
            .str.extract(value_unit_pattern)
        )
        return pd.to_numeric(parts["value"]) * parts["unit"].map(conversion_factors)

    def overlooks(df, feature):
        """1.0 / 0.0 flag for `feature` in `overlooking`; NaN where it is missing."""
        return (
            df.overlooking
            .str.contains(feature, na=False, regex=False)
            .astype(float)
            .where(df.overlooking.notna())
        )

    return (
        data
        .assign(**{
            col: data[col].str.strip()
            for col in data.select_dtypes(include = "O").columns
        })
        .rename(columns = lambda col: col.lower().replace(" ","_").split("_(")[0].split("(")[0])
        .drop(columns = ["index","description","status","society","dimensions","plot_area"])
        .query('amount != "Call for Price"')
        # assign() evaluates these in order, so later entries see the cleaned
        # floor / car_parking values produced by earlier ones.
        .assign(
            bathroom = lambda df: pd.to_numeric(df.bathroom.str.replace("> ", "", regex=False)),
            balcony = lambda df: pd.to_numeric(df.balcony.str.replace("> ", "", regex=False)),
            amount = lambda df: df.amount.pipe(convert_to_crores),
            carpet_area = lambda df: df.carpet_area.pipe(remove_area_units_and_standardize),
            super_area = lambda df: df.super_area.pipe(remove_area_units_and_standardize),
            floor = lambda df: df.floor.str.replace("200 out of 200", "2 out of 2", regex=False),
            car_parking = lambda df: df.car_parking.str.replace(",", "", regex=False),
            # Leading number of "<n> BHK ..." titles; NaN when the title has no "BHK".
            num_bhk = lambda df: pd.to_numeric(
                df.title
                .str.split("BHK")
                .str[0]
                .str.replace(">", "", regex=False)
                .str.strip()
                .where(df.title.str.contains("BHK", na=False))
            ),
            is_studio = lambda df: df.title.str.contains("Studio", na=False).astype(int),
            # Always numeric: a bare "Ground" / "2" (no "out of") is parsed the
            # same way as the part before "out of". Previously such rows kept
            # string values, which made the column mixed-type and hid them from
            # the floor_num == 0 rule below.
            floor_num = lambda df: pd.to_numeric(
                df.floor
                .str.split("out of")
                .str[0]
                .str.replace("Ground", "0", regex=False)
                .str.replace("Lower Basement", "0", regex=False)
                .str.replace("Upper Basement", "0", regex=False)
                .str.strip()
            ),
            # Part after "out of"; NaN when the value has no "out of".
            num_floors = lambda df: pd.to_numeric(
                df.floor
                .str.split("out of")
                .str[1]
                .str.strip()
            ),
            overlooking_garden = lambda df: overlooks(df, "Garden"),
            overlooking_mainroad = lambda df: overlooks(df, "Main Road"),
            overlooking_pool = lambda df: overlooks(df, "Pool"),
            parking_spots = lambda df: pd.to_numeric(
                df.car_parking.str.extract(r"(\d+)")[0],
                errors = 'coerce'
            ),
            # Second token of e.g. "1 Covered"; NaN when parking is missing.
            parking_cover = lambda df: df.car_parking.str.split(" ").str[1]
        )
        .assign(
            # Listings on floor 0 are recorded with no balcony.
            balcony = lambda df: df.balcony.mask(df.floor_num.eq(0), 0)
        )
        .loc[lambda df: df.carpet_area.between(90,10000) | df.super_area.between(100,10000)]
        .loc[lambda df: df.price.between(200,10000)]
        .loc[lambda df: df.amount.between(0.1,100)]
        # Keep rows with unknown BHK, or whose bathroom AND balcony counts are
        # each either missing or below num_bhk + 2.
        .loc[lambda df: (
            df.num_bhk.isnull()
            | (
                (df.bathroom.isnull() | df.bathroom.lt(df.num_bhk + 2))
                & (df.balcony.isnull() | df.balcony.lt(df.num_bhk + 2))
            )
        )]
        .drop(columns = ["title","floor","overlooking","car_parking","price"])
        .drop_duplicates()
        # NOTE(review): num_bhk is NaN for titles without "BHK", so this also
        # removes every studio-only listing and leaves is_studio effectively
        # constant -- confirm whether that is intended.
        .dropna(subset = ["transaction","num_bhk","bathroom"])
    )

In [27]:
houses_cleaned = clean_data(houses)

In [29]:
houses_cleaned

Unnamed: 0,amount,location,carpet_area,transaction,furnishing,facing,bathroom,balcony,ownership,super_area,num_bhk,is_studio,floor_num,num_floors,overlooking_garden,overlooking_mainroad,overlooking_pool,parking_spots,parking_cover
0,0.420,thane,500.0,Resale,Unfurnished,,1.0,2.0,,,1.0,0,10.0,11.0,,,,,
5,0.450,thane,,Resale,Unfurnished,East,1.0,1.0,Co-operative Society,680.0,1.0,0,2.0,7.0,1.0,1.0,0.0,,
6,0.165,thane,550.0,Resale,Unfurnished,,1.0,,,,1.0,0,4.0,5.0,,,,,
8,0.600,thane,,Resale,Furnished,,1.0,0.0,Co-operative Society,600.0,1.0,0,0.0,2.0,,,,,
15,0.900,thane,675.0,New Property,Unfurnished,North - East,2.0,1.0,Freehold,,2.0,0,10.0,16.0,1.0,1.0,0.0,1.0,Covered
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187526,0.630,zirakpur,,New Property,Semi-Furnished,East,3.0,3.0,Freehold,1953.0,3.0,0,2.0,4.0,1.0,0.0,0.0,1.0,Covered
187527,0.550,zirakpur,,Resale,Unfurnished,North - East,3.0,,,1680.0,3.0,0,4.0,6.0,1.0,1.0,0.0,1.0,Covered
187528,0.760,zirakpur,1250.0,Resale,Furnished,East,3.0,2.0,Freehold,,3.0,0,1.0,3.0,1.0,1.0,0.0,1.0,Covered
187529,0.300,zirakpur,,Resale,Semi-Furnished,,2.0,,,709.0,2.0,0,2.0,2.0,0.0,1.0,0.0,,


## 6. Splitting the Data

In [35]:
# Separate the target from the features.
y = houses_cleaned["amount"].copy()
X = houses_cleaned.drop(columns = "amount")

# Hold out 10% as the test set, then 10% of the remainder as the validation set.
X_, X_test, y_, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size = 0.1, random_state = 42)

# One shape per line: features (train, test, val), then targets (train, test, val).
print(
    *(part.shape for part in (X_train, X_test, X_val, y_train, y_test, y_val)),
    sep = "\n"
)

(42129, 18)
(5202, 18)
(4681, 18)
(42129,)
(5202,)
(4681,)


## 7. Export The Dataset

In [37]:
def export_data(X,y,name):
    """Write features and target to ``<DATA_DIR>/<name>/<name>.csv`` and preview it.

    ``y`` is joined onto ``X`` by index, the combined frame is saved without
    the index column, and the first rows of the saved file are returned.
    """
    target_dir = PROJECT_DIR / DATA_DIR / name
    csv_path = target_dir / f"{name}.csv"

    # Create the split's folder (and any missing parents) on first export.
    target_dir.mkdir(parents=True, exist_ok=True)

    combined = X.join(y)
    combined.to_csv(csv_path, index = False)

    # Read the file back so the preview shows what was actually written.
    return pd.read_csv(csv_path).head()

In [39]:
export_data(X_train,y_train,"train")

Unnamed: 0,location,carpet_area,transaction,furnishing,facing,bathroom,balcony,ownership,super_area,num_bhk,is_studio,floor_num,num_floors,overlooking_garden,overlooking_mainroad,overlooking_pool,parking_spots,parking_cover,amount
0,bhiwadi,700.0,Resale,Semi-Furnished,North,2.0,2.0,Leasehold,,2.0,0,5.0,15.0,1.0,1.0,0.0,,,0.11
1,sonipat,1000.0,Resale,Unfurnished,North,2.0,2.0,Freehold,,3.0,0,2.0,3.0,1.0,1.0,1.0,1.0,Open,0.255
2,greater-noida,675.0,Resale,Unfurnished,East,2.0,3.0,Leasehold,,2.0,0,8.0,25.0,1.0,1.0,1.0,,,0.65
3,chennai,,Resale,Semi-Furnished,East,3.0,2.0,Freehold,1585.0,3.0,0,1.0,2.0,,,,,,0.87
4,chandigarh,1200.0,New Property,Semi-Furnished,East,3.0,3.0,Freehold,,3.0,0,6.0,12.0,1.0,1.0,0.0,1.0,Covered,1.37


In [41]:
export_data(X_test,y_test,"test")

Unnamed: 0,location,carpet_area,transaction,furnishing,facing,bathroom,balcony,ownership,super_area,num_bhk,is_studio,floor_num,num_floors,overlooking_garden,overlooking_mainroad,overlooking_pool,parking_spots,parking_cover,amount
0,bangalore,1000.0,Resale,Furnished,South -West,3.0,2.0,Freehold,,3.0,0,10.0,10.0,1.0,1.0,1.0,2.0,Covered,0.78
1,vadodara,,Resale,Unfurnished,,2.0,1.0,Freehold,1550.0,3.0,0,3.0,12.0,1.0,0.0,0.0,,,0.35
2,pune,,New Property,Unfurnished,,1.0,,,560.0,1.0,0,2.0,7.0,,,,,,0.44
3,kolkata,735.0,Resale,Unfurnished,South - East,2.0,1.0,,,2.0,0,3.0,3.0,1.0,0.0,0.0,,,0.56
4,visakhapatnam,1470.0,Resale,Unfurnished,North,3.0,2.0,Freehold,,3.0,0,4.0,5.0,1.0,0.0,0.0,,,0.72


In [43]:
export_data(X_val,y_val,"val")

Unnamed: 0,location,carpet_area,transaction,furnishing,facing,bathroom,balcony,ownership,super_area,num_bhk,is_studio,floor_num,num_floors,overlooking_garden,overlooking_mainroad,overlooking_pool,parking_spots,parking_cover,amount
0,new-delhi,750.0,Resale,Furnished,East,2.0,,Freehold,,2.0,0,1.0,4.0,0.0,1.0,0.0,1.0,Covered,0.37
1,greater-noida,,New Property,Unfurnished,,3.0,,,1880.0,3.0,0,,,,,,,,1.71
2,pune,,Resale,Unfurnished,,1.0,,,800.0,2.0,0,3.0,3.0,,,,,,0.32
3,jamshedpur,,Resale,Semi-Furnished,,2.0,,,1460.0,3.0,0,3.0,4.0,,,,,,0.85
4,zirakpur,,New Property,Semi-Furnished,North - East,3.0,2.0,Freehold,1550.0,3.0,0,3.0,4.0,1.0,1.0,1.0,,,0.56
