In [34]:
import reverse_geocoder as rg
import pycountry
import numpy as np
import pandas as pd
import warnings

In [35]:
# Load the integrated listings table that the geocoding and crime-merge cells build on.
combined_df = pd.read_csv(
    filepath_or_buffer='transformed_data/intergrated_data.csv',
)

### Add District, State, and Country Using Reverse Geocoding

**Purpose:**  
Enrich the dataset with geographic hierarchy information by extracting district, state, and country from latitude and longitude coordinates.

**Actions Performed:**  
1. Prepare a list of coordinates (`latitude`, `longitude`) for each listing.  
2. Perform reverse geocoding using the `reverse_geocoder` library to retrieve location information.  
3. Extract:
   - `district` (`admin2`) → the district or county of the listing  
   - `state` (`admin1`) → the state, province, or region  
   - `country_code` → the ISO 2-letter country code  
4. Map the ISO country code to the full **country name** using the `pycountry` library.

**Output:**  
- `district` column: Second-level administrative division (`admin2`), e.g. a district, county, or municipality  
- `state` column: State or region  
- `country_code` column: ISO code of the country  
- `country_name` column: Full country name  


In [36]:

# 1. Prepare coordinates as a list of (lat, lon) tuples, one per listing
coordinates = list(zip(combined_df['latitude'], combined_df['longitude']))

# 2. Perform reverse geocoding; the comprehensions below rely on `results`
#    holding one dict per coordinate, in the same order as `coordinates`
results = rg.search(coordinates)

# 3. Extract district (admin2), state (admin1), and country code (cc)
combined_df['district'] = [x['admin2'] for x in results]       # District
combined_df['state'] = [x['admin1'] for x in results]          # State
combined_df['country_code'] = [x['cc'] for x in results]       # Country code


def _country_name(code):
    """Return the full country name for an ISO alpha-2 code, or NaN if unknown.

    The lookup can come back empty (``None``, or a ``LookupError`` on older
    pycountry releases) when the geocoder emits a code pycountry does not
    carry; calling ``.name`` on that result would abort the whole cell.
    """
    try:
        country = pycountry.countries.get(alpha_2=code)
    except LookupError:
        country = None
    return country.name if country is not None else np.nan


# 4. Map country code to full country name.
#    Resolve each distinct code once (a handful of lookups instead of one per
#    row), then broadcast the names back onto every listing.
country_names = {
    code: _country_name(code)
    for code in combined_df['country_code'].unique()
}
combined_df['country_name'] = combined_df['country_code'].map(country_names)

In [37]:
# Preview the enriched frame to confirm the four new geography columns
# (district, state, country_code, country_name) were added.
combined_df

Unnamed: 0,id,price_total,room_type,is_shared_room,is_private_room,max_guests,is_superhost,is_multi_listing,is_business_listing,cleanliness_score,...,restaurant_index,restaurant_index_norm,longitude,latitude,city,day_type,district,state,country_code,country_name
0,0,194.033698,Private room,False,True,2.0,False,1,0,10.0,...,98.253896,6.846473,4.90569,52.41772,amsterdam,weekday,Gemeente Landsmeer,North Holland,NL,Netherlands
1,1,344.245776,Private room,False,True,4.0,False,0,0,8.0,...,837.280757,58.342928,4.90005,52.37432,amsterdam,weekday,Gemeente Amsterdam,North Holland,NL,Netherlands
2,2,264.101422,Private room,False,True,2.0,False,0,1,9.0,...,95.386955,6.646700,4.97512,52.36103,amsterdam,weekday,Gemeente Diemen,North Holland,NL,Netherlands
3,3,433.529398,Private room,False,True,4.0,False,0,1,9.0,...,875.033098,60.973565,4.89417,52.37663,amsterdam,weekday,Gemeente Amsterdam,North Holland,NL,Netherlands
4,4,485.552926,Private room,False,True,2.0,True,0,0,10.0,...,815.305740,56.811677,4.90051,52.37508,amsterdam,weekday,Gemeente Amsterdam,North Holland,NL,Netherlands
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51702,1794,715.938574,Entire home/apt,False,False,6.0,False,0,1,10.0,...,438.756874,10.604584,16.37940,48.21136,vienna,weekend,Wien Stadt,Vienna,AT,Austria
51703,1795,304.793960,Entire home/apt,False,False,2.0,False,0,0,8.0,...,342.182813,8.270427,16.38070,48.20296,vienna,weekend,Wien Stadt,Vienna,AT,Austria
51704,1796,637.168969,Entire home/apt,False,False,2.0,False,0,0,10.0,...,282.296424,6.822996,16.38568,48.20460,vienna,weekend,Wien Stadt,Vienna,AT,Austria
51705,1797,301.054157,Private room,False,True,2.0,False,0,0,10.0,...,158.563398,3.832416,16.34100,48.19200,vienna,weekend,Wien Stadt,Vienna,AT,Austria


### Merge Airbnb Data with Crime Data by City

**Purpose:**  
Enrich the Airbnb dataset with crime rate information for each city.

**Steps Performed:**  
1. **Read crime data** from the CSV file and remove unnecessary columns (`Unnamed: 0`).  
2. **Standardize city names** to lowercase in both datasets to ensure consistent merging.  
3. **Merge the datasets** on the `city` column using an inner join (`how='inner'`), keeping only rows that exist in both datasets.  
4. **Handle duplicates:**  
   - If the crime data has multiple rows per city, duplicates are removed or aggregated to ensure a **one-to-one merge**.  

**Output:**  
- `merged_df` contains all Airbnb listings along with the corresponding **crime rate information** for each city.  
- The row count remains consistent and does not inflate due to duplicate city entries.  


In [38]:
# Read the city-level crime table; drop the stray index column only when it is
# present so the cell does not fail on a CSV saved without it.
crime_data = pd.read_csv('External_data/Crime_rate.csv')
crime_data = crime_data.drop(columns='Unnamed: 0', errors='ignore')

# Standardise the join key and keep a single crime row per city, so the merge
# cannot multiply listings when the crime file repeats a city.
crime_data['city'] = crime_data['city'].str.lower()
crime_data = crime_data.drop_duplicates(subset='city', keep='first')

# Lower-case the listing side on a copy (combined_df itself is left untouched)
# and merge. `validate` makes pandas raise if the crime side is ever not
# unique per city; the inner join keeps only cities present in both tables.
merged_df = pd.merge(
    combined_df.assign(city=combined_df['city'].str.lower()),
    crime_data,
    on='city',
    how='inner',
    validate='many_to_one',
)

In [39]:
# Preview the merged frame: each listing row now carries its city's
# Crime_Index and Safety_Index columns.
merged_df

Unnamed: 0,id,price_total,room_type,is_shared_room,is_private_room,max_guests,is_superhost,is_multi_listing,is_business_listing,cleanliness_score,...,longitude,latitude,city,day_type,district,state,country_code,country_name,Crime_Index,Safety_Index
0,0,194.033698,Private room,False,True,2.0,False,1,0,10.0,...,4.90569,52.41772,amsterdam,weekday,Gemeente Landsmeer,North Holland,NL,Netherlands,25.7,74.3
1,1,344.245776,Private room,False,True,4.0,False,0,0,8.0,...,4.90005,52.37432,amsterdam,weekday,Gemeente Amsterdam,North Holland,NL,Netherlands,25.7,74.3
2,2,264.101422,Private room,False,True,2.0,False,0,1,9.0,...,4.97512,52.36103,amsterdam,weekday,Gemeente Diemen,North Holland,NL,Netherlands,25.7,74.3
3,3,433.529398,Private room,False,True,4.0,False,0,1,9.0,...,4.89417,52.37663,amsterdam,weekday,Gemeente Amsterdam,North Holland,NL,Netherlands,25.7,74.3
4,4,485.552926,Private room,False,True,2.0,True,0,0,10.0,...,4.90051,52.37508,amsterdam,weekday,Gemeente Amsterdam,North Holland,NL,Netherlands,25.7,74.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51702,1794,715.938574,Entire home/apt,False,False,6.0,False,0,1,10.0,...,16.37940,48.21136,vienna,weekend,Wien Stadt,Vienna,AT,Austria,28.4,71.6
51703,1795,304.793960,Entire home/apt,False,False,2.0,False,0,0,8.0,...,16.38070,48.20296,vienna,weekend,Wien Stadt,Vienna,AT,Austria,28.4,71.6
51704,1796,637.168969,Entire home/apt,False,False,2.0,False,0,0,10.0,...,16.38568,48.20460,vienna,weekend,Wien Stadt,Vienna,AT,Austria,28.4,71.6
51705,1797,301.054157,Private room,False,True,2.0,False,0,0,10.0,...,16.34100,48.19200,vienna,weekend,Wien Stadt,Vienna,AT,Austria,28.4,71.6
