In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the two input tables: the listings CSV and the separately scraped
# host-type CSV. A later cell joins them on the listing URL.
bcn_original = pd.read_csv('../data/listings_bcn.csv')
scraped_host_type = pd.read_csv('../data/airbnb2.csv')

In [3]:
# Give the scraped table the same key column name ('listing_url') that the
# merge in the next cell joins on.
scraped_host_type = scraped_host_type.rename(columns={'url': 'listing_url'})

In [4]:
# Outer join on the listing URL: rows from either table are kept even when
# the other table has no matching URL.
bcn = pd.merge(
    bcn_original,
    scraped_host_type,
    how='outer',
    on='listing_url',
)

In [5]:
# Discard the 'Unnamed: 0' column from the merged frame.
bcn = bcn.drop(columns=['Unnamed: 0'])

In [6]:
# Keep only the rows that have a value in the host_type column (NaN rows go).
has_host_type = bcn['host_type'].notna()
bcn = bcn[has_host_type]

In [12]:
# Remove all rows whose host_type is the literal string 'not_found'.
# .copy() makes `bcn` an independent frame, so the column assignments in the
# following cells do not write to a filtered slice of the previous frame
# (which is what triggers pandas' SettingWithCopyWarning).
bcn = bcn[bcn['host_type'] != 'not_found'].copy()

In [17]:
# Convert the price column from text to a number.
# presumably the raw values look like '$1,200.00' -- TODO confirm against the CSV.
# Every character that is not a digit, a decimal point or a minus sign is
# removed, so the result does not depend on a currency symbol sitting at
# position 0 (the previous `.str[1:]` silently dropped the first digit of any
# value that had no symbol).
# The dtype guard makes the cell safe to re-run: once the column is numeric
# the `.str` accessor would raise.
if not pd.api.types.is_numeric_dtype(bcn['price']):
    price_text = bcn['price'].str.replace(r'[^0-9.\-]', '', regex=True)
    bcn['price'] = pd.to_numeric(price_text)
bcn['price'].describe()

count    11340.000000
mean       160.578042
std       1116.507782
min          0.000000
25%         50.000000
50%        100.000000
75%        165.000000
max      90000.000000
Name: price, dtype: float64

In [19]:
def license_duplicates(array):
    """Return the entries of ``array`` that carry a licence marker.

    An entry is kept when it contains at least one of the substrings
    'HUTB', 'HB' or 'AJ' (case-sensitive). All other entries are dropped.
    The order of the kept entries follows the input order.

    Parameters
    ----------
    array : iterable of str
        Candidate licence strings.

    Returns
    -------
    list
        The matching entries.
    """
    markers = ('HUTB', 'HB', 'AJ')
    return [entry for entry in array if any(tag in entry for tag in markers)]

In [20]:
# Collect the licence values that appear on more than one row.
# keep=False flags every member of a duplicated group, not just the repeats.
shared_license_mask = bcn.duplicated('license', keep=False)
duplicate_license = bcn.loc[shared_license_mask, 'license'].dropna()
duplicate_license_array = duplicate_license.unique()
# 'Exempt' is excluded from the candidates handed to license_duplicates().
is_not_exempt = duplicate_license_array != 'Exempt'
arr = duplicate_license_array[is_not_exempt]

In [21]:
# NOTE: this rebinds the name `license_duplicates` from the helper function to
# the list it returns, because `is_valid_license` looks the list up under that
# name. The guard keeps the cell re-runnable: once the name holds the list,
# calling it again would raise "TypeError: 'list' object is not callable".
# To recompute after `arr` changes, re-run the cell that defines the function
# first.
if callable(license_duplicates):
    license_duplicates = license_duplicates(arr)

In [22]:
# Replace missing licence values with the placeholder text 'no license', so
# the license column holds no NaN.
bcn = bcn.assign(license=bcn['license'].fillna('no license'))

In [23]:
def is_valid_license(license):
    """Classify a licence string into one of four status labels.

    The checks run in this order and the first match wins:

    1. ``'reuse'`` - the value is a member of the module-level
       ``license_duplicates`` collection.
    2. ``'ok'`` - the value contains 'HUTB', 'HB' or 'AJ' (case-sensitive).
    3. ``'claims exempt'`` - the value contains 'exempt' in any letter case.
    4. ``'no license'`` - anything else.

    Parameters
    ----------
    license : str
        Licence text of a single listing.

    Returns
    -------
    str
        One of 'reuse', 'ok', 'claims exempt', 'no license'.
    """
    if license in license_duplicates:
        return 'reuse'
    for marker in ('HUTB', 'HB', 'AJ'):
        if marker in license:
            return 'ok'
    return 'claims exempt' if 'exempt' in license.lower() else 'no license'

In [24]:
# Label every listing's licence with is_valid_license().
# Only the 'license' column is needed, so map over that Series directly
# instead of building a full row object per listing with apply(axis=1).
bcn['status_license'] = bcn['license'].map(is_valid_license)

In [25]:
# For every row, the number of rows in `bcn` that share its host_id.
listings_per_host = bcn.groupby('host_id')['id'].transform('size')
bcn['total_listings'] = listings_per_host

In [26]:
def multi_listings(listings):
    """Map a listing count to its bucket label.

    Buckets: exactly 1 -> 'single listing'; above 1 and below 4 ->
    '2-3 listings'; below 6 -> '4-5 listings'; below 10 -> '6-9 listings';
    below 20 -> '10-19 listings'; below 50 -> '20-49 listings';
    50 and above -> '50 or more listings'.

    Parameters
    ----------
    listings : int or float
        Number of listings.

    Returns
    -------
    str or None
        The bucket label, or None when ``listings`` is neither 1 nor
        greater than 1 (for example 0, a negative number or NaN).
    """
    if listings == 1:
        return 'single listing'
    # Anything that is not strictly above 1 (including NaN) has no bucket.
    if not listings > 1:
        return None
    # Exclusive upper bounds, checked in ascending order.
    upper_bounds = (
        (4, '2-3 listings'),
        (6, '4-5 listings'),
        (10, '6-9 listings'),
        (20, '10-19 listings'),
        (50, '20-49 listings'),
    )
    for bound, label in upper_bounds:
        if listings < bound:
            return label
    return '50 or more listings'

In [27]:
# Bucket every row's total_listings count with multi_listings().
# Only that one column is needed, so map over the Series directly instead of
# building a full row object per listing with apply(axis=1).
bcn['multi_listings'] = bcn['total_listings'].map(multi_listings)

In [29]:
# Display the column labels currently in `bcn`.
bcn.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [31]:
# Build `bcn_bl` as a copy of `bcn` without the columns listed here.
columns_to_drop = [
    'id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
    'description', 'neighborhood_overview', 'picture_url',
    'host_id', 'host_url', 'host_name', 'host_about',
    'host_thumbnail_url', 'host_picture_url',
    'neighbourhood', 'neighbourhood_cleansed', 'bathrooms',
    'minimum_nights', 'maximum_nights',
    'maximum_minimum_nights', 'minimum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    'calendar_updated', 'has_availability',
    'availability_30', 'availability_60', 'availability_90',
    'calendar_last_scraped',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'host_listings_count', 'host_total_listings_count',
]
bcn_bl = bcn.drop(columns=columns_to_drop)

In [33]:
# Display (rows, columns) of the reduced frame.
bcn_bl.shape

(11340, 43)

## Add categories:


**OK:**
- claim commercial, is commercial, everything ok
- claim private, is private, everything ok
- no license or claims exempt, but does not offer short-term stays (< 32 days), therefore ok

**NOT OK:**
- no license or claims exempt, but offers short-term stays (< 32 days), therefore not ok
- reuse of license
- claim private, but seemingly commercial
- multiple conditions which are not ok

### alternatively:
**suspicious** vs. **compliant**

#### To do:
- define commercial vs. private, add column with classification according to our definition
- define conditions per category and add new column with category
- train test split
- preprocessing
- baseline model