In [12]:
import pandas as pd

In [13]:
# Load the semicolon-separated input file into a DataFrame
df = pd.read_csv("details.csv", sep=";")

In [14]:
def clean_price(price):
    """Convert a raw price label such as '1 070 000 zł' to a whole number.

    Parameters
    ----------
    price : object
        Raw cell value. Only strings are parsed.

    Returns
    -------
    int or None
        The price with any fractional part truncated, or None when the value
        is not a string (None, float NaN, ...), is the 'Zapytaj o cenę'
        placeholder, or does not parse as a finite number.
    """
    # Non-string values (e.g. the float NaN used for a missing cell) have no
    # .replace(); treat them all like an explicit None instead of crashing.
    if not isinstance(price, str) or price == 'Zapytaj o cenę':
        return None

    # Drop the ' zł' suffix and grouping spaces; use '.' as the decimal mark
    normalized = price.replace(' zł', '').replace(' ', '').replace(',', '.')

    try:
        # Parse as float first so values with decimals are accepted, then
        # truncate to an integer.
        return int(float(normalized))
    except (ValueError, OverflowError):
        # ValueError: not numeric (or 'nan'); OverflowError: 'inf'
        return None


In [15]:
def clean_rent(rent):
    """Parse a rent label like '600 zł' into an integer.

    Returns None for any non-string input (NaN, None, numbers) and for
    strings that are not an integer literal once ' zł' and spaces are removed.
    """
    if isinstance(rent, str):
        # Strip the currency suffix first, then every remaining space
        digits = rent.replace(' zł', '').replace(' ', '')
        try:
            return int(digits)
        except ValueError:
            pass  # not an integer literal; fall through to None
    return None

In [16]:
def clean_area(area):
    """Convert an area label such as '59,50 m²' to a float.

    Parameters
    ----------
    area : object
        Raw cell value. Only strings are parsed.

    Returns
    -------
    float or None
        The numeric area, or None when the value is not a string
        (None, float NaN, ...) or does not parse as a number.
    """
    # Non-strings have no .endswith(); return None like the other cleaners
    if not isinstance(area, str):
        return None

    text = area.strip()
    if text.endswith('m²'):
        text = text.replace('m²', '')
    # Normalise the decimal comma whether or not the unit suffix was present;
    # float() itself tolerates the leftover surrounding whitespace.
    text = text.replace(',', '.')

    try:
        return float(text)
    except ValueError:
        return None  # not a number


In [17]:
# The only values accepted as a district; built once instead of on every call
_ALLOWED_DISTRICTS = frozenset([
    "Białołęka", "Bielany", "Bemowo", "Żoliborz", "Wola", "Śródmieście",
    "Ochota", "Włochy", "Ursus", "Mokotów", "Ursynów", "Wilanów", "Targówek",
    "Praga-Północ", "Praga-Południe", "Rembertów", "Wawer", "Wesoła",
])


def extract_district(address):
    """Return the district named in a comma-separated address, if allowed.

    The district is taken from the third-from-last comma-separated part.

    Parameters
    ----------
    address : object
        Raw cell value. Only strings are parsed.

    Returns
    -------
    str or None
        The district name, or None when the value is not a string
        (None, float NaN, ...), has fewer than three parts, or the candidate
        is not one of the allowed districts.
    """
    # Non-strings have no .split(); report "no district" instead of crashing
    if not isinstance(address, str):
        return None

    parts = address.split(',')
    if len(parts) >= 3:
        district = parts[-3].strip()
        if district in _ALLOWED_DISTRICTS:
            return district
    return None

In [18]:
def _parse_max_floor(token):
    """Return ``token`` as an int when it is a plain non-negative number, else None."""
    token = token.strip()
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse,
    # so e.g. a superscript '²' yields None instead of raising ValueError.
    return int(token) if token.isdecimal() else None


def split_floor(floor_string):
    """Split a floor label like '3/5' into ``(floor, max_floor)``.

    Recognised forms (case-insensitive, surrounding whitespace ignored):

    * ``'<n>/<m>'``                 -> ``(n, m)``
    * ``'> <n>'`` or ``'> <n>/<m>'`` -> ``(n + 1, m)``; ``m`` is None if absent
    * ``'parter/<m>'``              -> ``(0, m)``
    * ``'suterena/<m>'``            -> ``(-1, m)``
    * ``'poddasze/<m>'``            -> ``(m, m)``

    Parameters
    ----------
    floor_string : object
        Raw cell value. Only strings are parsed.

    Returns
    -------
    tuple
        ``(floor, max_floor)``. Both are None for non-string input and for
        labels that do not match any form above, including a floor part that
        is not an integer. ``max_floor`` alone is None when the part after
        the slash is not a plain number.
    """
    if not isinstance(floor_string, str):
        return None, None

    text = floor_string.lower().strip()

    # Named floors: the first keyword found in the label decides the branch,
    # and the label must then be exactly '<keyword>/<max>'.
    for keyword in ('parter', 'suterena', 'poddasze'):
        if keyword not in text:
            continue
        parts = text.split('/')
        if len(parts) != 2 or parts[0].strip() != keyword:
            return None, None
        max_floor = _parse_max_floor(parts[1])
        if keyword == 'parter':
            floor = 0           # ground floor
        elif keyword == 'suterena':
            floor = -1          # basement level
        else:
            floor = max_floor   # 'poddasze' is placed on the top floor
        return floor, max_floor

    # Numeric floors. A '>' marks "above n", stored as n + 1, and may come
    # without a '/<max>' part; a plain label must be exactly '<n>/<m>'.
    open_ended = '>' in text
    parts = text.replace('>', '').split('/')
    if not open_ended and len(parts) != 2:
        return None, None

    try:
        floor = int(parts[0].strip())
    except ValueError:
        # Unparseable floor: report "unknown" rather than aborting the caller
        return None, None
    if open_ended:
        floor += 1

    max_floor = _parse_max_floor(parts[1]) if len(parts) > 1 else None
    return floor, max_floor


In [19]:
def clean_build_year(year):
    """Return the build year as an int, or None when it is not available.

    Only strings made up entirely of digits (after trimming whitespace) are
    converted. Everything else, including non-strings such as NaN and the
    'brak informacji' placeholder, yields None.
    """
    # Non-strings collapse to '' so a single digit check covers every case
    text = year.strip() if isinstance(year, str) else ''
    return int(text) if text.isdigit() else None

In [20]:
def clean_rooms(rooms):
    """Return the number of rooms as an int, or None when it cannot be read.

    All-digit strings are converted directly. The label 'więcej niż 10'
    (any letter case) maps to 11. Non-strings and any other text give None.
    """
    if isinstance(rooms, str):
        label = rooms.strip()
        if label.isdigit():
            return int(label)
        if label.lower() == 'więcej niż 10':
            return 11  # stand-in value for "more than 10"
    return None

In [21]:
# Apply the cleaning functions column by column
df['price'] = df['price'].apply(clean_price).astype('Int64')
df['area'] = df['area'].apply(clean_area)
df['district'] = df['address'].apply(extract_district)
# Keep only rows whose district was recognised. The explicit .copy() makes the
# filtered frame independent, so the column assignments below write to it
# directly and do not raise SettingWithCopyWarning.
df = df.dropna(subset=['district']).copy()
# split_floor returns one (floor, max_floor) tuple per row; build both columns
# from the list of tuples in one go instead of creating a pd.Series per row.
floor_pairs = df['floor'].apply(split_floor).tolist()
df[['floor', 'max_floor']] = pd.DataFrame(
    floor_pairs, index=df.index, columns=['floor', 'max_floor']
).astype('Int64')
df = df.drop('address', axis=1)
# Nullable 'Int64' keeps these columns integer-typed even with missing values
df['rent'] = df['rent'].apply(clean_rent).astype('Int64')
df['build_year'] = df['build_year'].apply(clean_build_year).astype('Int64')
df['rooms'] = df['rooms'].apply(clean_rooms).astype('Int64')
print(df.head())

   id    price   area  rooms  floor  rent         outdoor  \
0   1  1070000  59.50      2      8  <NA>             NaN   
1   2     <NA>  38.04      2      0  <NA>         ogródek   
2   3  1700000  67.44      3      3  <NA>          balkon   
3   4   933000  46.00      2   <NA>   600          balkon   
4   5     <NA>  80.74      4      0  <NA>  taras, ogródek   

                    parking  build_year           status     district  \
0                       NaN        2000              NaN     Targówek   
1  garaż/miejsce parkingowe        <NA>   do wykończenia      Ursynów   
2  garaż/miejsce parkingowe        <NA>  do zamieszkania      Wilanów   
3                       NaN        <NA>              NaN  Śródmieście   
4  garaż/miejsce parkingowe        <NA>   do wykończenia     Targówek   

   max_floor  
0         11  
1          5  
2          3  
3       <NA>  
4         10  


In [22]:
# Write the cleaned frame to disk: semicolon-separated, without the index column
df.to_csv('cleaned_data.csv', index=False, sep=';')

# Confirm that the export step finished
print("Data cleaning complete and saved to 'cleaned_data.csv'.")

Data cleaning complete and saved to 'cleaned_data.csv'.


Collecting matplotlibNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for matplotlib from https://files.pythonhosted.org/packages/17/91/febbb6c1063ae05a62fdbe038c2917b348b1b35f0482cee4738e6870a44a/matplotlib-3.9.0-cp312-cp312-win_amd64.whl.metadata
  Downloading matplotlib-3.9.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Obtaining dependency information for contourpy>=1.0.1 from https://files.pythonhosted.org/packages/78/38/a046bb0ebce6f530175d434e7364149e338ffe1069ee286ed8ba7f6481ee/contourpy-1.2.1-cp312-cp312-win_amd64.whl.metadata
  Using cached contourpy-1.2.1-cp312-cp312-win_amd64.whl.metadata (5.8 kB)
Collecting cycler>=0.10 (from matplotlib)
  Obtaining dependency information for cycler>=0.10 from https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl.metadata
  Using cached cycler-0.12.1-py3-none-


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip
