In [22]:
import pandas as pd 
from joblib import load, dump 

In [8]:
# Read the Airbnb listings and discard the first CSV column. The column is
# dropped by position (presumably a saved row index -- confirm against the file).
airbnb = (
    pd.read_csv('airbnb_hotels.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)

In [9]:
# Read the TripAdvisor listings and discard the first CSV column. The column is
# dropped by position (presumably a saved row index -- confirm against the file).
tripadvisor = (
    pd.read_csv('tripadvisor_hotels.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)

In [12]:
# Display the column labels of the Airbnb frame.
airbnb.columns

Index(['Airbnb_Name', 'Price', 'Primary_Rating', 'Location_Rating',
       'Cleanliness_Rating', 'Value_Rating', 'Communication_Rating',
       'Accuracy_Rating', 'Checkin_Rating', 'Num_Reviews', 'Neighborhood',
       'Amenities'],
      dtype='object')

In [14]:
# Display the column labels of the TripAdvisor frame.
tripadvisor.columns

Index(['Hotel_Name', 'Price', 'Primary_Rating', 'Location_Rating',
       'Cleanliness_Rating', 'Service_Rating', 'Value_Rating', 'Num_Reviews',
       'Address', 'Amenities'],
      dtype='object')

# Combine DataFrames

1. Find common columns
2. Omit the rest
3. If an address is present, use it, otherwise use the neighborhood
4. Add a column indicating whether the listing is a hotel or an Airbnb

In [40]:
# Build one frame holding both hotel and Airbnb listings.
#
# The frames are combined with a single pd.concat instead of appending one row
# at a time: DataFrame.append was removed in pandas 2.0, and growing a frame
# inside a loop copies it on every iteration (quadratic) and degrades every
# column to object dtype.

# Final column order of the combined frame.
combined_columns = [
    'Airbnb_Or_Hotel',
    'Listing_Name',
    'Price',
    'Primary_Rating',
    'Location_Rating',
    'Cleanliness_Rating',
    'Value_Rating',
    'Num_Reviews',
    'Address',
    'Neighborhood',
    'Amenities',
]

# Columns carried over unchanged from both sources.
shared_columns = [
    'Price',
    'Primary_Rating',
    'Location_Rating',
    'Cleanliness_Rating',
    'Value_Rating',
    'Num_Reviews',
    'Amenities',
]

# Hotels contribute an 'Address'; their 'Neighborhood' is left missing.
hotel_listings = (
    tripadvisor[['Hotel_Name', 'Address'] + shared_columns]
    .rename(columns={'Hotel_Name': 'Listing_Name'})
    .assign(Airbnb_Or_Hotel='Hotel')
)

# Airbnbs contribute a 'Neighborhood'; their 'Address' is left missing.
airbnb_listings = (
    airbnb[['Airbnb_Name', 'Neighborhood'] + shared_columns]
    .rename(columns={'Airbnb_Name': 'Listing_Name'})
    .assign(Airbnb_Or_Hotel='Airbnb')
)

# Hotels first, then Airbnbs, with a fresh 0..n-1 index. The outer join in
# concat fills the column each source lacks with NaN; reindex fixes the
# column order (and guarantees every column exists even for empty inputs).
combined_df = (
    pd.concat([hotel_listings, airbnb_listings], ignore_index=True, sort=False)
    .reindex(columns=combined_columns)
)

In [41]:
# NOTE: tail() is not the last expression of the cell, so its result is not
# rendered; only the return value of dump() is shown.
combined_df.tail()
# Persist the raw combined listings as a compressed joblib pickle.
dump(combined_df, filename='raw_combined_df.pkl', compress=True)

['raw_combined_df.pkl']

In [42]:
# Reload the raw combined listings written by the earlier dump() call.
combined_df = load('raw_combined_df.pkl')
# Keep only the listings whose price is not 0 (presumably 0 marks a missing
# price -- confirm against the source data). This cell filters on price only;
# 'Address' / 'Neighborhood' are not examined here.
has_price = combined_df['Price'].ne(0)
no_price_removed = combined_df.loc[has_price]
no_price_removed.head()

Unnamed: 0,Airbnb_Or_Hotel,Listing_Name,Price,Primary_Rating,Location_Rating,Cleanliness_Rating,Value_Rating,Num_Reviews,Address,Neighborhood,Amenities
0,Hotel,The Hotel At Times Square,68,4.0,5.0,4.0,4.0,4140,"59 West 46th Street, New York City, NY 10036-4120",,"['Paid public parking nearby', 'Free High Spee..."
1,Hotel,Hotel Edison,75,4.0,5.0,4.0,4.0,13283,228 West 47th Street Between Broadway and 8 Av...,,"['Paid private parking nearby', 'Wifi', 'Fitne..."
2,Hotel,Park Lane Hotel,130,4.0,5.0,4.0,4.0,9391,36 Central Park South 59th Street Between 5th ...,,"['Paid public parking nearby', 'Wifi', 'Fitnes..."
3,Hotel,Crowne Plaza Times Square Manhattan,89,4.0,5.0,4.5,4.0,11937,"1605 Broadway Between 48th and 49th Street, Ne...",,"['Valet parking', 'Free internet', 'Fitness Ce..."
4,Hotel,The New Yorker A Wyndham Hotel,80,4.0,4.5,4.0,4.0,14918,"481 8th Avenue & 34th Street, New York City, N...",,"['Paid private parking on-site', 'Free interne..."


In [43]:
# Keep listings that can be located, i.e. that have an address OR a neighborhood.
#
# Two fixes over the previous `(... != None) & (... != None)` filter:
#   * `Series != None` is evaluated element-wise and is True for every row,
#     missing values included, so it never removed anything; notna() is the
#     correct missing-value test.
#   * Hotels only carry 'Address' and Airbnbs only carry 'Neighborhood', so
#     requiring both would discard every listing; a listing is usable when
#     at least one of the two is present.
has_address = no_price_removed['Address'].notna()
has_neighborhood = no_price_removed['Neighborhood'].notna()
clean_df = no_price_removed[has_address | has_neighborhood]

In [46]:
# Persist the cleaned listings as a compressed joblib pickle.
dump(clean_df, filename='cleaned_combined_df.pkl', compress=True)

['cleaned_combined_df.pkl']