## Data Integration

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

### Configure Display Settings and Warnings

**Purpose:**  
Improve notebook readability and output clarity by adjusting Pandas display options and suppressing unnecessary warnings.

**Actions Performed:**  
- Display all DataFrame columns without truncation  
- Automatically adjust output width to fit the screen  
- Prevent column wrapping for wide DataFrames  
- Suppress warning messages to keep outputs clean

**Impact:**  
Ensures clearer DataFrame visualization and a more focused analysis experience.


In [None]:
# Pandas display preferences: never truncate the column list, let pandas pick
# the output width itself, and keep wide frames from wrapping onto extra lines.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

# Silence every warning for the remainder of the session.
warnings.filterwarnings('ignore')

### Combine Daily CSV Files Using Glob

**Purpose:**  
Automatically load and combine multiple CSV files from a directory into a single DataFrame while enriching the data with contextual metadata.

**Actions Performed:**  
- Use `glob` to locate all CSV files in the specified directory  
- Read each file into a temporary DataFrame  
- Set the first column as the index and rename it to `id`  
- Extract the city name from the file-name prefix (stored in a `country` column that is later renamed to `city`)  
- Add a `day_type` column based on whether the file represents weekdays or weekends  
- Concatenate all processed DataFrames into one unified DataFrame



## Data Integration

In [2]:
import glob
import os

path = 'airbnb_data'
# Sort the matches: glob returns files in arbitrary order, and the row order of
# the combined frame should be reproducible from one run to the next.
csv_files = sorted(glob.glob(os.path.join(path, '*.csv')))


def load_listing_file(file):
    """Read one listings CSV and tag its rows with file-name metadata.

    Parameters
    ----------
    file : str
        Path to a CSV file. The part of its base name before the first
        underscore is used as the location label; a base name containing
        "weekend" (case-insensitive) marks weekend data.

    Returns
    -------
    pandas.DataFrame
        The file's rows indexed by the first CSV column (index named ``id``),
        plus a ``country`` column holding the file-name prefix and a
        ``day_type`` column holding ``'weekend'`` or ``'weekday'``.
    """
    file_name = os.path.basename(file)

    # The first CSV column becomes the index, labelled `id`.
    listings = pd.read_csv(file, index_col=0)
    listings.index.name = 'id'

    # Everything before the first underscore of the file name.
    listings['country'] = file_name.split('_')[0]

    # Inspect only the file name: a "weekend" substring in a parent directory
    # must not relabel weekday files.
    listings['day_type'] = 'weekend' if 'weekend' in file_name.lower() else 'weekday'
    return listings


# Concatenate once instead of growing the frame inside the loop, which re-copies
# all previously loaded rows on every iteration. Fall back to an empty frame when
# no CSV files are found (pd.concat rejects an empty list).
frames = [load_listing_file(file) for file in csv_files]
combined_df = pd.concat(frames) if frames else pd.DataFrame()


NameError: name 'pd' is not defined

In [None]:
# Inspect the combined frame produced by the loading cell.
combined_df

Unnamed: 0_level_0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,country,day_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,194.033698,Private room,False,True,2.0,False,1,0,10.0,93.0,1,5.022964,2.539380,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,amsterdam,weekday
1,344.245776,Private room,False,True,4.0,False,0,0,8.0,85.0,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,amsterdam,weekday
2,264.101422,Private room,False,True,2.0,False,0,1,9.0,87.0,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.646700,4.97512,52.36103,amsterdam,weekday
3,433.529398,Private room,False,True,4.0,False,0,1,9.0,90.0,2,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,amsterdam,weekday
4,485.552926,Private room,False,True,2.0,True,0,0,10.0,98.0,1,0.544738,0.318693,552.830324,29.272733,815.305740,56.811677,4.90051,52.37508,amsterdam,weekday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1794,715.938574,Entire home/apt,False,False,6.0,False,0,1,10.0,100.0,3,0.530181,0.135447,219.402478,15.712158,438.756874,10.604584,16.37940,48.21136,vienna,weekend
1795,304.793960,Entire home/apt,False,False,2.0,False,0,0,8.0,86.0,1,0.810205,0.100839,204.970121,14.678608,342.182813,8.270427,16.38070,48.20296,vienna,weekend
1796,637.168969,Entire home/apt,False,False,2.0,False,0,0,10.0,93.0,1,0.994051,0.202539,169.073402,12.107921,282.296424,6.822996,16.38568,48.20460,vienna,weekend
1797,301.054157,Private room,False,True,2.0,False,0,0,10.0,87.0,1,3.044100,0.287435,109.236574,7.822803,158.563398,3.832416,16.34100,48.19200,vienna,weekend


In [None]:
# Map the raw dataset headers to descriptive snake_case names.
column_renames = {
    # price and room attributes
    'realSum': 'price_total',
    'room_shared': 'is_shared_room',
    'room_private': 'is_private_room',
    'person_capacity': 'max_guests',
    'bedrooms': 'num_bedrooms',
    # host / listing flags
    'host_is_superhost': 'is_superhost',
    'multi': 'is_multi_listing',
    'biz': 'is_business_listing',
    # guest ratings
    'cleanliness_rating': 'cleanliness_score',
    'guest_satisfaction_overall': 'guest_satisfaction_score',
    # location metrics
    'dist': 'distance_city_center',
    'metro_dist': 'distance_metro',
    'attr_index': 'attraction_index',
    'attr_index_norm': 'attraction_index_norm',
    'rest_index': 'restaurant_index',
    'rest_index_norm': 'restaurant_index_norm',
    'lng': 'longitude',
    'lat': 'latitude',
    # the file-name prefix column is relabelled `city`
    'country': 'city',
}

# Rename on the existing object so every reference to the frame sees the new headers.
combined_df.rename(columns=column_renames, inplace=True)

### Data Dictionary

| Column Name | Description |
|------------|------------|
| `id` | Unique identifier for each listing |
| `price_total` | Total price of the listing per stay |
| `room_type` | Type of accommodation (e.g., Private room, Entire home) |
| `is_shared_room` | Indicates whether the room is shared with others |
| `is_private_room` | Indicates whether the room is private |
| `max_guests` | Maximum number of guests the listing can accommodate |
| `is_superhost` | Indicates whether the host has Airbnb Superhost status |
| `is_multi_listing` | Indicates whether the host manages multiple listings |
| `is_business_listing` | Indicates whether the listing is suitable for business travel |
| `cleanliness_score` | Cleanliness rating given by guests |
| `guest_satisfaction_score` | Overall guest satisfaction rating |
| `num_bedrooms` | Number of bedrooms in the listing |
| `distance_city_center` | Distance of the listing from the city center |
| `distance_metro` | Distance from the nearest metro station |
| `attraction_index` | Index measuring proximity to tourist attractions |
| `attraction_index_norm` | Normalized attraction proximity score |
| `restaurant_index` | Index measuring proximity to restaurants |
| `restaurant_index_norm` | Normalized restaurant proximity score |
| `longitude` | Longitude coordinate of the listing |
| `latitude` | Latitude coordinate of the listing |
| `city` | City where the listing is located |
| `day_type` | Indicates whether the data corresponds to a weekday or weekend |

**Notes:**  
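- `is_shared_room`, `is_private_room`, and `is_superhost` use `True` / `False` values; `is_multi_listing` and `is_business_listing` are encoded as `0` / `1`  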
- Distance values are relative metrics used for comparison within the city  
- Normalized indices are scaled versions of their original metrics


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); called without
# arguments, so only the numeric columns are summarised.
combined_df.describe()

Unnamed: 0,price_total,max_guests,is_multi_listing,is_business_listing,cleanliness_score,guest_satisfaction_score,num_bedrooms,distance_city_center,distance_metro,attraction_index,attraction_index_norm,restaurant_index,restaurant_index_norm,longitude,latitude
count,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0
mean,279.879591,3.161661,0.291353,0.350204,9.390624,92.628232,1.15876,3.191285,0.68154,294.204105,13.423792,626.856696,22.786177,7.426068,45.671128
std,327.948386,1.298545,0.45439,0.477038,0.954868,8.945531,0.62741,2.393803,0.858023,224.754123,9.807985,497.920226,17.804096,9.799725,5.249263
min,34.779339,2.0,0.0,0.0,2.0,20.0,0.0,0.015045,0.002301,15.152201,0.926301,19.576924,0.592757,-9.22634,37.953
25%,148.752174,2.0,0.0,0.0,9.0,90.0,1.0,1.453142,0.24848,136.797385,6.380926,250.854114,8.75148,-0.0725,41.39951
50%,211.343089,3.0,0.0,0.0,10.0,95.0,1.0,2.613538,0.413269,234.331748,11.468305,522.052783,17.542238,4.873,47.50669
75%,319.694287,4.0,1.0,1.0,10.0,99.0,1.0,4.263077,0.73784,385.756381,17.415082,832.628988,32.964603,13.518825,51.471885
max,18545.450285,6.0,1.0,1.0,10.0,100.0,10.0,25.284557,14.273577,4513.563486,100.0,6696.156772,100.0,23.78602,52.64141


In [None]:
# Re-inspect the frame before it is exported.
# NOTE(review): the saved output of this cell still lists the pre-rename headers
# (realSum, lng, ...) plus `city` / `country_code` columns that no visible cell
# creates — it looks stale; re-run the notebook top to bottom to confirm.
combined_df

Unnamed: 0_level_0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,country,day_type,city,country_code
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,194.033698,Private room,False,True,2.0,False,1,0,10.0,93.0,1,5.022964,2.539380,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,amsterdam,weekday,Landsmeer,NL
1,344.245776,Private room,False,True,4.0,False,0,0,8.0,85.0,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,amsterdam,weekday,Amsterdam,NL
2,264.101422,Private room,False,True,2.0,False,0,1,9.0,87.0,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.646700,4.97512,52.36103,amsterdam,weekday,Diemen,NL
3,433.529398,Private room,False,True,4.0,False,0,1,9.0,90.0,2,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,amsterdam,weekday,Amsterdam,NL
4,485.552926,Private room,False,True,2.0,True,0,0,10.0,98.0,1,0.544738,0.318693,552.830324,29.272733,815.305740,56.811677,4.90051,52.37508,amsterdam,weekday,Amsterdam,NL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1794,715.938574,Entire home/apt,False,False,6.0,False,0,1,10.0,100.0,3,0.530181,0.135447,219.402478,15.712158,438.756874,10.604584,16.37940,48.21136,vienna,weekend,Vienna,AT
1795,304.793960,Entire home/apt,False,False,2.0,False,0,0,8.0,86.0,1,0.810205,0.100839,204.970121,14.678608,342.182813,8.270427,16.38070,48.20296,vienna,weekend,Vienna,AT
1796,637.168969,Entire home/apt,False,False,2.0,False,0,0,10.0,93.0,1,0.994051,0.202539,169.073402,12.107921,282.296424,6.822996,16.38568,48.20460,vienna,weekend,Vienna,AT
1797,301.054157,Private room,False,True,2.0,False,0,0,10.0,87.0,1,3.044100,0.287435,109.236574,7.822803,158.563398,3.832416,16.34100,48.19200,vienna,weekend,Vienna,AT


In [None]:
import os

# Persist the integrated dataset; the `id` index is written as the first column.
# NOTE(review): "intergrated" looks like a typo for "integrated"; the name is kept
# so anything that already reads this file keeps working.
output_path = "transformed_data/intergrated_data.csv"

# Create the output folder first: to_csv does not create missing directories and
# would otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
combined_df.to_csv(output_path)