## 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
pd.set_option('display.max_columns', None)

## 2. Reading the Data

In [None]:
DATA_DIR = 'data/raw'

def get_data(name):
    data_path = Path(DATA_DIR) / name
    return pd.read_csv(data_path)

In [42]:
noida = get_data('noidaRawExtractedPropertyDetails.csv')
greaterNoida = get_data('greaternoidaRawExtractedPropertyDetails.csv')
delhi = get_data('delhiRawExtractedPropertyDetails.csv')
gurgaon = get_data('gurgaonRawExtractedPropertyDetails.csv')

All the datasets have the same number of columns, and they are from different regions. So, to create a master DataFrame, I need to add a new column that contains the names of the regions and then concatenate all the datasets into one

In [44]:
noida['region'] = 'noida'
greaterNoida['region'] = 'greaterNoida'
delhi['region'] = 'delhi'
gurgaon['region'] = 'gurgaon'

In [45]:
df = pd.concat([noida, greaterNoida, delhi, gurgaon], ignore_index=True)


In [51]:
df.head()

Unnamed: 0,localityName,landMarks,locality,price,nameOfSociety,projectName,carpetArea,coveredArea,coveredAreaUnit,carpetAreaSqft,possessionStatus,floorNumber,totalFloorNumber,longitude,latitude,transactionType,facing,ownershipType,furnished,bedrooms,bathrooms,numberOfBalconied,propertyType,additionalRooms,ageofcons,isVerified,listingTypeDesc,propertyAmenities,facilitiesDesc,propertyId,url,region
0,"Sector 108, Noida Express Way",well developed area with all facilities near s...,Sector 108,17325000.0,Parx Laureate,Parx Laureate,3080.0,3850,Sq-ft,5625.0,Ready to Move,7.0,18.0,77.3766,28.526756,Resale,North - East,Leasehold,Furnished,4,4,,Multistorey Apartment,"Puja Room, Store, Servant Room",,N,Standard Listing,"{'12201': 'Power Back Up', '12202': 'Lift', '1...","Power Back Up, Lift, Rain Water Harvesting, Cl...",10446116,https://www.magicbricks.com/propertyDetails/4-...,noida
1,Sector 76,450 Meters from Metro Station.Facing Main Road,Sector 76,15000000.0,Amrapali Silicon City,Amrapali Silicon City,1180.0,1180,Sq-ft,12712.0,Ready to Move,9.0,13.0,77.3775,28.568478,New Property,East,Freehold,Furnished,2,2,2.0,Multistorey Apartment,Study,Less than 5 years,O,Standard,"{'12202': 'Lift', '12203': 'Rain Water Harvest...","Lift, Rain Water Harvesting, Club House, Park,...",10714088,https://www.magicbricks.com/propertyDetails/2-...,noida
2,Sector 118,,Sector 118,3950000.0,,,,1005,Sq-ft,,Ready to Move,,,77.404719,28.582315,Resale,,Leasehold,Semi-Furnished,2,2,,Multistorey Apartment,,,O,Standard Listing,,,11361681,https://www.magicbricks.com/propertyDetails/2-...,noida
3,Sector 29,Near Brahmaputra Market.,Sector 29,25000000.0,Brahmaputra Apartment,Brahmaputra Apartment,1500.0,1800,Sq-ft,16667.0,Ready to Move,1.0,3.0,77.332412,28.569737,Resale,North - East,Leasehold,Unfurnished,3,3,2.0,Multistorey Apartment,"Puja Room, Study, Store, Servant Room",15 to 20 years,N,Prime Listings,"{'12203': 'Rain Water Harvesting', '12204': 'C...","Rain Water Harvesting, Club House, Swimming Po...",11376982,https://www.magicbricks.com/propertyDetails/3-...,noida
4,"Sector 49, Dadri Road",,Sector 49,4300000.0,Project Hindon Vihar,Project Hindon Vihar,,1050,Sq-ft,,Ready to Move,1.0,4.0,77.37446,28.560053,Resale,North - East,Freehold,Unfurnished,3,3,1.0,Builder Floor Apartment,Puja Room,,O,Standard Listing,"{'12213': 'Vaastu Compliant', '12215': 'Air Co...","Vaastu Compliant, Air Conditioned, Visitor Par...",12494130,https://www.magicbricks.com/propertyDetails/3-...,noida


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59669 entries, 0 to 59668
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   localityName       58175 non-null  object 
 1   landMarks          35577 non-null  object 
 2   locality           58288 non-null  object 
 3   price              58649 non-null  float64
 4   nameOfSociety      37269 non-null  object 
 5   projectName        37241 non-null  object 
 6   carpetArea         35974 non-null  object 
 7   coveredArea        57626 non-null  object 
 8   coveredAreaUnit    57629 non-null  object 
 9   carpetAreaSqft     35969 non-null  object 
 10  possessionStatus   49254 non-null  object 
 11  floorNumber        48814 non-null  object 
 12  totalFloorNumber   53205 non-null  object 
 13  longitude          58296 non-null  float64
 14  latitude           58296 non-null  float64
 15  transactionType    59652 non-null  object 
 16  facing             406

- The dataset contains 59669 rows and 31 columns.
- Plenty of columns have missing values.
- The data types of some features isn't appropriate

## 3. Preliminary Analysis

### 3.1 Check data types

In [4]:
df.dtypes

localityName          object
landMarks             object
locality              object
price                float64
nameOfSociety         object
projectName           object
carpetArea            object
coveredArea           object
coveredAreaUnit       object
carpetAreaSqft        object
possessionStatus      object
floorNumber           object
totalFloorNumber      object
longitude            float64
latitude             float64
transactionType       object
facing                object
ownershipType         object
furnished             object
bedrooms              object
bathrooms             object
numberOfBalconied     object
propertyType          object
additionalRooms       object
ageofcons             object
isVerified            object
listingTypeDesc       object
propertyAmenities     object
facilitiesDesc        object
propertyId             int64
url                   object
region                object
dtype: object

-  Some columns such as `carpetArea, coveredArea, carpetAreaSqft, floorNumber, totalFloorNumber, bedrooms, bathrooms, and numberOfBalconied` have an object data type. They need to be converted to an integer data type

### 3.2 Check for Duplicates

In [5]:
df.duplicated().sum() # There are no duplicates.

np.int64(0)

## 4. Detailed Analysis

In [6]:
df.locality.nunique()

2487

In [11]:
df.head()

Unnamed: 0,localityName,landMarks,locality,price,nameOfSociety,projectName,carpetArea,coveredArea,coveredAreaUnit,carpetAreaSqft,possessionStatus,floorNumber,totalFloorNumber,longitude,latitude,transactionType,facing,ownershipType,furnished,bedrooms,bathrooms,numberOfBalconied,propertyType,additionalRooms,ageofcons,isVerified,listingTypeDesc,propertyAmenities,facilitiesDesc,propertyId,url,region
0,"Sector 108, Noida Express Way",well developed area with all facilities near s...,Sector 108,17325000.0,Parx Laureate,Parx Laureate,3080.0,3850,Sq-ft,5625.0,Ready to Move,7.0,18.0,77.3766,28.526756,Resale,North - East,Leasehold,Furnished,4,4,,Multistorey Apartment,"Puja Room, Store, Servant Room",,N,Standard Listing,"{'12201': 'Power Back Up', '12202': 'Lift', '1...","Power Back Up, Lift, Rain Water Harvesting, Cl...",10446116,https://www.magicbricks.com/propertyDetails/4-...,noida
1,Sector 76,450 Meters from Metro Station.Facing Main Road,Sector 76,15000000.0,Amrapali Silicon City,Amrapali Silicon City,1180.0,1180,Sq-ft,12712.0,Ready to Move,9.0,13.0,77.3775,28.568478,New Property,East,Freehold,Furnished,2,2,2.0,Multistorey Apartment,Study,Less than 5 years,O,Standard,"{'12202': 'Lift', '12203': 'Rain Water Harvest...","Lift, Rain Water Harvesting, Club House, Park,...",10714088,https://www.magicbricks.com/propertyDetails/2-...,noida
2,Sector 118,,Sector 118,3950000.0,,,,1005,Sq-ft,,Ready to Move,,,77.404719,28.582315,Resale,,Leasehold,Semi-Furnished,2,2,,Multistorey Apartment,,,O,Standard Listing,,,11361681,https://www.magicbricks.com/propertyDetails/2-...,noida
3,Sector 29,Near Brahmaputra Market.,Sector 29,25000000.0,Brahmaputra Apartment,Brahmaputra Apartment,1500.0,1800,Sq-ft,16667.0,Ready to Move,1.0,3.0,77.332412,28.569737,Resale,North - East,Leasehold,Unfurnished,3,3,2.0,Multistorey Apartment,"Puja Room, Study, Store, Servant Room",15 to 20 years,N,Prime Listings,"{'12203': 'Rain Water Harvesting', '12204': 'C...","Rain Water Harvesting, Club House, Swimming Po...",11376982,https://www.magicbricks.com/propertyDetails/3-...,noida
4,"Sector 49, Dadri Road",,Sector 49,4300000.0,Project Hindon Vihar,Project Hindon Vihar,,1050,Sq-ft,,Ready to Move,1.0,4.0,77.37446,28.560053,Resale,North - East,Freehold,Unfurnished,3,3,1.0,Builder Floor Apartment,Puja Room,,O,Standard Listing,"{'12213': 'Vaastu Compliant', '12215': 'Air Co...","Vaastu Compliant, Air Conditioned, Visitor Par...",12494130,https://www.magicbricks.com/propertyDetails/3-...,noida


In [18]:
df.localityName.nunique()

2548

In [20]:
(
    df.localityName
    .str.lower()
    .str.strip()
)

0        sector 108, noida express way
1                            sector 76
2                           sector 118
3                            sector 29
4                sector 49, dadri road
                     ...              
59664                        sector 49
59665                      phase 5 dlf
59666                 golf course road
59667                       sohna road
59668                        sector 14
Name: localityName, Length: 59669, dtype: object

### landMarks

In [21]:
df.landMarks.nunique()

20196

### locality

In [22]:
(
    df.locality
    .str.lower()
    .str.strip()
)

0              sector 108
1               sector 76
2              sector 118
3               sector 29
4               sector 49
               ...       
59664           sector 49
59665         dlf phase 5
59666    golf course road
59667          sohna road
59668           sector 14
Name: locality, Length: 59669, dtype: object

### price
-  convert into crores to make it more readable.

In [26]:
(
    df.price
    .apply(lambda x: round(int(x)/10000000,2) if not np.isnan(x) else np.nan)
)

0        1.73
1        1.50
2        0.40
3        2.50
4        0.43
         ... 
59664    0.54
59665     NaN
59666    8.00
59667    7.60
59668    0.65
Name: price, Length: 59669, dtype: float64

### nameOfSociety

In [28]:
df.nameOfSociety.nunique()

5058

In [29]:
df.nameOfSociety

0                Parx Laureate
1        Amrapali Silicon City
2                          NaN
3        Brahmaputra Apartment
4         Project Hindon Vihar
                 ...          
59664              Vatika City
59665                      NaN
59666                      NaN
59667                      NaN
59668                      NaN
Name: nameOfSociety, Length: 59669, dtype: object

### projectName

In [31]:
df.projectName.nunique()

3311

### carpetArea

In [9]:
df.carpetArea.loc[df.carpetArea.astype(str).str.contains(',', na=False)]

1796     11,000
1849     10,000
1852     11,000
1896     10,000
1903     10,000
          ...  
53738    10,000
54340    16,000
54936    10,200
55608    47,017
58403    17,550
Name: carpetArea, Length: 72, dtype: object

In [13]:
(
    df.carpetArea
    .str.replace(',', '')
    .astype(int, errors='ignore')
)

0        3080
1        1180
2         NaN
3        1500
4         NaN
         ... 
59664     NaN
59665     NaN
59666     NaN
59667     NaN
59668     NaN
Name: carpetArea, Length: 59669, dtype: object

### coveredArea
- `coveredArea` is actually super builtup area. So, it needs to be renamed super builtup area.


In [15]:
df.coveredArea.loc[df.coveredArea.astype(str).str.contains(',', na=False)]

618      13,500
699      13,500
930      10,500
1467     18,883
1493     10,218
          ...  
54936    12,000
56314    20,000
56769    22,000
58031    15,720
59019    10,000
Name: coveredArea, Length: 139, dtype: object

In [16]:
(
    df.coveredArea
    .str.replace(',', '')
    .astype(int, errors='ignore')
)

0        3850
1        1180
2        1005
3        1800
4        1050
         ... 
59664    1714
59665    2810
59666    7000
59667    6869
59668    1255
Name: coveredArea, Length: 59669, dtype: object

### coveredAreaUnit
- `'Biswa2', 'Acre', 'Bigha', 'Hectare', 'Rood', 'Ground', 'Biswa1', 'Marla'` need to be dropped because their scales are insignificant.
- `'Sq-m', 'Sq-yrd'` need to be converted into `Sq-ft` to maintain consistent scale.

In [18]:
df.coveredAreaUnit.unique()

array(['Sq-ft', 'Sq-m', 'Sq-yrd', nan, 'Biswa2', 'Acre', 'Bigha',
       'Hectare', 'Rood', 'Ground', 'Biswa1', 'Marla'], dtype=object)

In [37]:
df.coveredAreaUnit.loc[lambda ser: ser.str.contains('Bigha', na=False)]

18181    Bigha
44896    Bigha
Name: coveredAreaUnit, dtype: object

### possessionStatus

In [48]:
(
    df.possessionStatus
    .str.lower()
    .str.strip()
)

0        ready to move
1        ready to move
2        ready to move
3        ready to move
4        ready to move
             ...      
59664              NaN
59665    ready to move
59666    ready to move
59667              NaN
59668    ready to move
Name: possessionStatus, Length: 59669, dtype: object

### floorNumber

In [None]:
(
  df.floorNumber.astype(str)  
)


0          7
1          9
2        nan
3          1
4          1
        ... 
59664      8
59665    nan
59666    nan
59667    nan
59668      1
Name: floorNumber, Length: 59669, dtype: object

### totalFloorNumber


In [63]:
(
    pd.to_numeric(df.totalFloorNumber, errors='coerce')
)

0        18.0
1        13.0
2         NaN
3         3.0
4         4.0
         ... 
59664    14.0
59665     NaN
59666     NaN
59667     NaN
59668     3.0
Name: totalFloorNumber, Length: 59669, dtype: float64

### transactionType

In [53]:
(
    df.transactionType
    .str.lower()
    .str.strip()
)

0              resale
1        new property
2              resale
3              resale
4              resale
             ...     
59664    new property
59665          resale
59666          resale
59667          resale
59668          resale
Name: transactionType, Length: 59669, dtype: object

### facing

In [54]:
(
    df.facing
    .str.lower()
    .str.strip()
)

0        north - east
1                east
2                 NaN
3        north - east
4        north - east
             ...     
59664            east
59665            east
59666             NaN
59667             NaN
59668            east
Name: facing, Length: 59669, dtype: object

### ownershipType

In [57]:
(
    df.ownershipType
    .str.lower()
    .str.strip()
)

0        leasehold
1         freehold
2        leasehold
3        leasehold
4         freehold
           ...    
59664     freehold
59665     freehold
59666     freehold
59667     freehold
59668     freehold
Name: ownershipType, Length: 59669, dtype: object

### furnished

In [59]:
(
    df.furnished
    .str.lower()
    .str.strip()
)

0             furnished
1             furnished
2        semi-furnished
3           unfurnished
4           unfurnished
              ...      
59664    semi-furnished
59665         furnished
59666    semi-furnished
59667       unfurnished
59668    semi-furnished
Name: furnished, Length: 59669, dtype: object

### bedrooms

In [65]:
(
   pd.to_numeric(df.bedrooms, errors='coerce')
)

0        4.0
1        2.0
2        2.0
3        3.0
4        3.0
        ... 
59664    3.0
59665    5.0
59666    5.0
59667    5.0
59668    2.0
Name: bedrooms, Length: 59669, dtype: float64

### bathrooms

In [66]:
(
   pd.to_numeric(df.bathrooms, errors='coerce')
)

0        4.0
1        2.0
2        2.0
3        3.0
4        3.0
        ... 
59664    2.0
59665    5.0
59666    5.0
59667    7.0
59668    3.0
Name: bathrooms, Length: 59669, dtype: float64

### numberOfBalconied

In [68]:
(
   pd.to_numeric(df.numberOfBalconied, errors='coerce')
)

0        NaN
1        2.0
2        NaN
3        2.0
4        1.0
        ... 
59664    2.0
59665    NaN
59666    NaN
59667    NaN
59668    2.0
Name: numberOfBalconied, Length: 59669, dtype: float64

### propertyType

In [69]:
(
    df.propertyType
    .str.lower()
    .str.strip()
)

0          multistorey apartment
1          multistorey apartment
2          multistorey apartment
3          multistorey apartment
4        builder floor apartment
                  ...           
59664      multistorey apartment
59665      multistorey apartment
59666      multistorey apartment
59667          residential house
59668    builder floor apartment
Name: propertyType, Length: 59669, dtype: object

### additionalRooms

In [72]:
(
    df.additionalRooms
    .str.lower()
    .str.strip()
)

0               puja room, store, servant room
1                                        study
2                                          NaN
3        puja room, study, store, servant room
4                                    puja room
                         ...                  
59664                                      NaN
59665                             servant room
59666                                      NaN
59667                             servant room
59668                                      NaN
Name: additionalRooms, Length: 59669, dtype: object

### ageofcons

In [8]:
(
    df['ageofcons']
    .str.lower()
)

0                       NaN
1         less than 5 years
2                       NaN
3            15 to 20 years
4                       NaN
                ...        
59664    under construction
59665                   NaN
59666                   NaN
59667    under construction
59668                   NaN
Name: ageofcons, Length: 59669, dtype: object

In [14]:
df['listingTypeDesc'].value_counts()

listingTypeDesc
Prime Listings           17639
Premium                  16833
Standard Listing          7457
Standard                  7452
Certified Listing         7022
Titanium                  2901
Platinum                   330
Premium Plus Listings       32
Premium Listing              3
Name: count, dtype: int64

In [15]:
df.head()

Unnamed: 0,localityName,landMarks,locality,price,nameOfSociety,projectName,carpetArea,coveredArea,coveredAreaUnit,carpetAreaSqft,possessionStatus,floorNumber,totalFloorNumber,longitude,latitude,transactionType,facing,ownershipType,furnished,bedrooms,bathrooms,numberOfBalconied,propertyType,additionalRooms,ageofcons,isVerified,listingTypeDesc,propertyAmenities,facilitiesDesc,propertyId,url,region
0,"Sector 108, Noida Express Way",well developed area with all facilities near s...,Sector 108,17325000.0,Parx Laureate,Parx Laureate,3080.0,3850,Sq-ft,5625.0,Ready to Move,7.0,18.0,77.3766,28.526756,Resale,North - East,Leasehold,Furnished,4,4,,Multistorey Apartment,"Puja Room, Store, Servant Room",,N,Standard Listing,"{'12201': 'Power Back Up', '12202': 'Lift', '1...","Power Back Up, Lift, Rain Water Harvesting, Cl...",10446116,https://www.magicbricks.com/propertyDetails/4-...,noida
1,Sector 76,450 Meters from Metro Station.Facing Main Road,Sector 76,15000000.0,Amrapali Silicon City,Amrapali Silicon City,1180.0,1180,Sq-ft,12712.0,Ready to Move,9.0,13.0,77.3775,28.568478,New Property,East,Freehold,Furnished,2,2,2.0,Multistorey Apartment,Study,Less than 5 years,O,Standard,"{'12202': 'Lift', '12203': 'Rain Water Harvest...","Lift, Rain Water Harvesting, Club House, Park,...",10714088,https://www.magicbricks.com/propertyDetails/2-...,noida
2,Sector 118,,Sector 118,3950000.0,,,,1005,Sq-ft,,Ready to Move,,,77.404719,28.582315,Resale,,Leasehold,Semi-Furnished,2,2,,Multistorey Apartment,,,O,Standard Listing,,,11361681,https://www.magicbricks.com/propertyDetails/2-...,noida
3,Sector 29,Near Brahmaputra Market.,Sector 29,25000000.0,Brahmaputra Apartment,Brahmaputra Apartment,1500.0,1800,Sq-ft,16667.0,Ready to Move,1.0,3.0,77.332412,28.569737,Resale,North - East,Leasehold,Unfurnished,3,3,2.0,Multistorey Apartment,"Puja Room, Study, Store, Servant Room",15 to 20 years,N,Prime Listings,"{'12203': 'Rain Water Harvesting', '12204': 'C...","Rain Water Harvesting, Club House, Swimming Po...",11376982,https://www.magicbricks.com/propertyDetails/3-...,noida
4,"Sector 49, Dadri Road",,Sector 49,4300000.0,Project Hindon Vihar,Project Hindon Vihar,,1050,Sq-ft,,Ready to Move,1.0,4.0,77.37446,28.560053,Resale,North - East,Freehold,Unfurnished,3,3,1.0,Builder Floor Apartment,Puja Room,,O,Standard Listing,"{'12213': 'Vaastu Compliant', '12215': 'Air Co...","Vaastu Compliant, Air Conditioned, Visitor Par...",12494130,https://www.magicbricks.com/propertyDetails/3-...,noida


## Cleaning Operations

In [None]:
def convert_to_sqft(coveredArea, coveredAreaUnit):
    if pd.isna(coveredArea):
        return np.nan
    try:
        value = float(str(coveredArea).replace(',', ''))
    except ValueError:
        return np.nan
    if coveredAreaUnit == 'Sq-yrd':
        return value * 9
    elif coveredAreaUnit == 'Sq-m':
        return round(value * 10.7639, 2)
    elif coveredAreaUnit == 'Sq-ft':
        return value
    else:
        return np.nan

def change_scale(coveredAreaUnit):
    if coveredAreaUnit in ['sq-yrd', 'sq-m', 'sq-ft']:
        return 'sq-ft'
    else:
        return np.nan



In [112]:
def clean_data(df):
    return (
        df.rename(columns=str.lower)
        .assign(
            localityname= lambda df_: (df_.localityname
                            .str.lower()
                            .str.strip()
        ),
            landmarks= lambda df_: (df_.landmarks
                            .str.lower()
                            .str.strip()
        ),
            locality= lambda df_: (df_.locality
                            .str.lower()
                            .str.strip()
        ),
            nameofsociety = lambda df_: (df_.nameofsociety
                            .str.lower()
                            .str.strip()
        ),
            price= lambda df_: pd.to_numeric(df_.price, errors='coerce'),
            coveredarea= lambda df_: pd.to_numeric(df_.coveredarea, errors='coerce'),
            carpetarea= lambda df_: pd.to_numeric(df_.carpetarea, errors='coerce'),
            superbuiltuparea = lambda df_ : df_.apply(
                lambda row: convert_to_sqft(row.coveredarea, row.coveredareaunit),
                axis=1
            ),
             coveredareaunit= lambda df_: (df_.coveredareaunit
                            .str.lower()
                            .str.strip().apply(change_scale)
        ),
             possessionstatus= lambda df_: (df_.possessionstatus
                            .str.lower()
                            .str.strip()
        ),
             floornumber= lambda df_: df_.floornumber.astype(str),
             totalfloornumber= lambda df_: pd.to_numeric(df_.totalfloornumber, errors='coerce'),
             transactiontype= lambda df_: (df_.transactiontype
                            .str.lower()
                            .str.strip()
        ),
             facing= lambda df_: (df_.facing
                            .str.lower()
                            .str.strip()
        ),  
             ownershiptype= lambda df_: (df_.ownershiptype
                            .str.lower()
                            .str.strip()
        ),
             furnished = lambda df_: (df_.furnished
                            .str.lower()
                            .str.strip()
        ),
             bedrooms= lambda df_: pd.to_numeric(df_.bedrooms, errors='coerce'),
             bathrooms= lambda df_: pd.to_numeric(df_.bathrooms, errors='coerce'),
             numberofbalconied= lambda df_: pd.to_numeric(df_.numberofbalconied, errors='coerce'),
             propertytype= lambda df_: (df_.propertytype
                            .str.lower()
                            .str.strip()
        ),
             additionalrooms= lambda df_: (df_.additionalrooms
                            .str.lower()
                            .str.strip()
        ),
             ageofcons = lambda df_: (df_.ageofcons
                            .str.lower()
                            .str.strip()
        ),
        
    ).drop(columns=['carpetarea','carpetareasqft', 'isverified', 'listingtypedesc'])
     .pipe(lambda df_: df_.loc[df_.superbuiltuparea > 450])
     .dropna(subset=['price'])
     .reset_index(drop=True)
     
    )
    
    


In [113]:
cleaned_df = clean_data(df)

In [114]:
cleaned_df.shape

(54956, 29)

In [None]:
cleaned_df.to_csv('data/raw/cleaned_dataV1.csv', index=False)

In [116]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54956 entries, 0 to 54955
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   localityname       53710 non-null  object 
 1   landmarks          33771 non-null  object 
 2   locality           53807 non-null  object 
 3   price              54956 non-null  float64
 4   nameofsociety      35511 non-null  object 
 5   projectname        35486 non-null  object 
 6   coveredarea        54956 non-null  float64
 7   coveredareaunit    54956 non-null  object 
 8   possessionstatus   45203 non-null  object 
 9   floornumber        54956 non-null  object 
 10  totalfloornumber   48917 non-null  float64
 11  longitude          53819 non-null  float64
 12  latitude           53819 non-null  float64
 13  transactiontype    54952 non-null  object 
 14  facing             38339 non-null  object 
 15  ownershiptype      39716 non-null  object 
 16  furnished          541