### Connect to Google Drive


In [None]:
# Mount Google Drive at /content/drive so that later cells can read files
# stored under that path (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install missingno
!pip install geopy

### Read the 2019 Berlin Airbnb dataset
This file already contains only 2019 data; the data was reduced because of its extreme size.


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings

# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed (set once; the frame is wide).
pd.set_option('display.max_columns', None)

# Load the pre-filtered 2019 CSV from the mounted Google Drive.
df = pd.read_csv('/content/drive/MyDrive/kaggle/berline/berline_filtered_2019.csv')



### Data Preparation

### Feature selection (round #1)
In this section we are going to drop:
* redundant / duplicate data.
* columns that contain the same value in every row, since such data is not useful.
* columns that contain almost no data.
* data that clearly will not help during this journey, such as URLs, reviewer ID, reviewer name, ...

In [None]:
# Round 1 of feature selection: remove identifiers, URLs, names, constant-valued
# location fields, review dates and other columns not needed for the analysis.
df = df.drop(
    labels=[
        # review / reviewer identifiers
        'Review ID', 'Reviewer ID', 'Reviewer Name',
        # listing and host identifiers, names and links
        'Listing URL', 'Listing Name', 'Host ID', 'Host URL', 'Host Name',
        # location fields
        'City', 'Country Code', 'Country',
        # review dates
        'First Review', 'Last Review',
        # remaining columns selected for removal
        'Square Feet', 'Business Travel Ready',
    ],
    axis="columns",
)

In [None]:
# Summarise columns, non-null counts and dtypes. DataFrame.info() prints the
# report itself and returns None, so it must not be wrapped in print().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66833 entries, 0 to 66832
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Comments              66782 non-null  object 
 1   Listing ID            66833 non-null  int64  
 2   Host Since            66833 non-null  object 
 3   Host Response Time    65868 non-null  object 
 4   Host Response Rate    65868 non-null  object 
 5   Is Superhost          66833 non-null  object 
 6   neighbourhood         66833 non-null  object 
 7   Neighborhood Group    66833 non-null  object 
 8   Postal Code           65737 non-null  object 
 9   Latitude              66833 non-null  float64
 10  Longitude             66833 non-null  float64
 11  Is Exact Location     66833 non-null  object 
 12  Property Type         66833 non-null  object 
 13  Room Type             66833 non-null  object 
 14  Accomodates           66833 non-null  int64  
 15  Bathrooms          

# Replace and clean numeric values: strip symbols such as $, % and more

In [None]:
# Remove '$' and ',' characters from 'Price', then parse the result as float.
# Raw strings are used so the regex backslashes are not treated as (invalid)
# Python string escape sequences.
df['Price'] = df['Price'].replace(r'[\$,]', '', regex=True).astype(float)

# Remove '%' and ',' characters from 'Host Response Rate', then parse as float.
df['Host Response Rate'] = df['Host Response Rate'].replace(r'[\%,]', '', regex=True).astype(float)

# Keep only the first five characters of 'Postal Code' to cut off '\n' and other
# irrelevant trailing text.
# NOTE(review): astype(str) turns missing postal codes into the literal string
# 'nan' — confirm this is intended.
df['Postal Code'] = df['Postal Code'].astype(str).str[:5]

### Reduce the following wide categories:
* Host Response Rate Grouped
* Overall Rating
* Neighbourhood Grouped
* Property Type
* Postal Code

In [None]:
# Bucket 'Host Response Rate' into four labelled bands.
# Intervals are closed on the right, and the first one also includes 0:
# [0, 50], (50, 80], (80, 95], (95, 100].
labels = ["Low Response", "Moderate Response", "High Response", "Very High Response"]
bins = [0, 50, 80, 95, 100]

df["Host Response Rate Grouped Cleansed"] = pd.cut(
    x=df["Host Response Rate"],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# Reduce 'Overall Rating' to coarse buckets: divide by 10 and round up
# (e.g. a rating of 93 becomes 10.0). Computed on the whole column at once
# rather than with one Python call per row; missing ratings stay NaN.
df['Overall Rating Grouped Cleansed'] = np.ceil(df['Overall Rating'] / 10)


# 2. Neighbourhood groups
# Build a lookup from each neighbourhood to the first 'Neighborhood Group'
# value found for it, then label every row through that lookup.
neighbourhood_mapping = (
    df.groupby('neighbourhood')['Neighborhood Group']
      .first()
)
df['Neighbourhood Grouped Cleansed'] = df['neighbourhood'].map(neighbourhood_mapping)

# 3. Reducing Property Types
# Rare property types are folded into a few broader groups.
property_mapping = {
    "Villa": "Vacation Rental",
    "Cottage": "Vacation Rental",
    "Bungalow": "Vacation Rental",
    "Cabin": "Vacation Rental",
    "Tiny house": "Vacation Rental",
    "Earth house": "Vacation Rental",
    "Treehouse": "Vacation Rental",
    "Hut": "Vacation Rental",
    "Barn": "Vacation Rental",
    "Houseboat": "Boats & Houseboats",
    "Boat": "Boats & Houseboats",
    "Camper/RV": "Mobile/Alternative Lodging",
    "Cave": "Mobile/Alternative Lodging",
    "Pension (South Korea)": "Mobile/Alternative Lodging",
    "Casa particular (Cuba)": "Mobile/Alternative Lodging",
}

# Property types that are kept under their own name.
top_property_types = [
    "Apartment", "Loft", "House", "Townhouse", "Condominium", "Serviced apartment",
    "Hotel", "Hostel", "Guesthouse", "Bed and breakfast", "Boutique hotel"
]

# Single lookup for every recognised type: kept types map to themselves, rare
# types map to their group (property_mapping takes precedence). Any value not in
# the lookup, including a missing value, comes back from .map() as NaN and is
# labelled "Other". This is one vectorized lookup instead of a Python lambda
# evaluated per row.
property_type_lookup = {**{name: name for name in top_property_types}, **property_mapping}
df['Property Type Cleansed'] = df['Property Type'].map(property_type_lookup).fillna("Other")

# 4. Postal code areas
# Keep only the two leading characters of each postal code (cast to str first)
# so that codes are grouped by broad area.
df['Postal Code Cleansed'] = df['Postal Code'].astype(str).str.slice(0, 2)

### Transform data:

In [None]:
# Parse 'Host Since' as a datetime and extract the year into its own column.
df['Host Since'] = pd.to_datetime(df['Host Since'])
df['Host Since Year Cleansed'] = df['Host Since'].dt.year

# Convert the 't' / 'f' flag columns to booleans, each written to a new column
# named after the source column with a ' Cleansed' suffix. Every column is
# converted exactly once; values other than 't' / 'f' are left as they are.
for flag_column in ['Instant Bookable', 'Is Superhost', 'Is Exact Location']:
    df[f'{flag_column} Cleansed'] = df[flag_column].replace({'t': True, 'f': False})

### Generating new features from existing data:
Calculate the distance of each listing from Berlin's center and assign each listing to a group.
Each group contains the listings that lie within a specific distance range from the center.

In [None]:
from geopy.distance import great_circle
def distance_to_mid(lat, lon):
    """Return the great-circle distance in km from a fixed Berlin centre point.

    Args:
        lat: Latitude of the location.
        lon: Longitude of the location.

    Returns:
        The distance in kilometres between the hard-coded centre coordinates
        and (lat, lon), as computed by geopy's great_circle.
    """
    centre_lat, centre_lon = 52.5027778, 13.404166666666667
    return great_circle((centre_lat, centre_lon), (lat, lon)).km

# Distance of every listing from the centre point, rounded to one decimal (km).
# The two coordinate columns are iterated directly; df.apply(axis=1) would build
# a full row Series for each listing just to read two values from it.
df['Distance From Center Cleansed'] = [
    round(distance_to_mid(lat, lon), 1)
    for lat, lon in zip(df['Latitude'], df['Longitude'])
]

# Distance bands in km, closed on the right; the first band also includes 0.
# NOTE(review): distances above 16 km fall outside the last bin and become NaN —
# confirm that this is intended.
bins = [0, 0.5, 1, 2, 4, 8, 16]
labels = ["Center", "Center1", "Center2", "Center4", "Center8", "Center16"]

# Apply pd.cut() to create a new binned column
df["Distance From Center Grouped Cleansed"] = pd.cut(df["Distance From Center Cleansed"], bins=bins, labels=labels, include_lowest=True)

### Cleaning data (round #2)
Drop the following features after using them to calculate new, clean features.

In [None]:
# Round 2: drop the raw columns that now have cleansed / derived counterparts,
# the numeric distance (only its grouped version is kept) and the free-text
# 'Comments'. The distance column name must match the one created earlier
# ('Distance From Center Cleansed'); a misspelled name makes drop() raise KeyError.
df = df.drop(columns=[
    'Host Since', 'neighbourhood', 'Latitude', 'Longitude', 'Property Type', 'Postal Code',
    'Host Response Rate', 'Overall Rating', 'Instant Bookable', 'Is Superhost',
    'Is Exact Location', 'Distance From Center Cleansed',
    'Comments',
])

In [None]:
# Summarise the remaining columns, their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66833 entries, 0 to 66832
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype   
---  ------                                 --------------  -----   
 0   Listing ID                             66833 non-null  int64   
 1   Host Response Time                     65868 non-null  object  
 2   Neighborhood Group                     66833 non-null  object  
 3   Room Type                              66833 non-null  object  
 4   Accomodates                            66833 non-null  int64   
 5   Bathrooms                              66791 non-null  float64 
 6   Bedrooms                               66763 non-null  float64 
 7   Beds                                   66829 non-null  float64 
 8   Price                                  66833 non-null  float64 
 9   Guests Included                        66833 non-null  int64   
 10  Min Nights                             66833 non-null  int