In [18]:
import pandas as pd
# Raise pandas' row display cap to 6000 rows for the outputs shown below.
pd.set_option('display.max_rows', 6000)

# Load the listings export from the working directory.
listings = pd.read_csv('listings.csv')

In [19]:
# names of every column in the listings table, as a plain Python list

listings.columns.tolist()

['id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'name',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'notes',
 'transit',
 'access',
 'interaction',
 'house_rules',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'country_code',
 'country',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',


In [20]:
# preview the first five rows of listings

listings.head(n=5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,18666,https://www.airbnb.com/rooms/18666,20190710134514,2019-07-10,Flat with Sunny Terrace,"Apartment located near the ""Plaza de las Glori...",Nice apartment situated on the penthouse floor...,"Apartment located near the ""Plaza de las Glori...",none,Apartment in Barcelona near to the Plaza de la...,...,f,f,flexible,f,f,31,31,0,0,0.02
1,18674,https://www.airbnb.com/rooms/18674,20190710134514,2019-07-10,Huge flat for 8 people close to Sagrada Familia,110m2 apartment to rent in Barcelona. Located ...,Apartment with 110 m2 located in the 6th floor...,110m2 apartment to rent in Barcelona. Located ...,none,Apartment in Barcelona located in the heart of...,...,t,f,strict_14_with_grace_period,f,f,31,31,0,0,0.2
2,21605,https://www.airbnb.com/rooms/21605,20190710134514,2019-07-10,Nice and sunny duble room,"The flat is in Poblenou district, and the room...","L'apartament està al barri de Poblenou, i l'ha...","The flat is in Poblenou district, and the room...",none,Poblenou as one of the few areas that has grow...,...,f,f,moderate,f,f,2,0,2,0,3.08
3,23197,https://www.airbnb.com/rooms/23197,20190710134514,2019-07-10,FORUM DELUXE 5 MINS WALK CCIB CENTER & SEA!,I do not accept groups of young people under 2...,Elegant spacious apartment suitable for 6. Amp...,I do not accept groups of young people under 2...,none,Strategically located in the area of Parc del ...,...,t,f,strict_14_with_grace_period,f,t,2,2,0,0,0.44
4,25786,https://www.airbnb.com/rooms/25786,20190710134514,2019-07-10,NICE ROOM AVAILABLE IN THE HEART OF GRACIA,JUST GO THROUGH THE MANY REVIEWS I GOT THROUGH...,Room available for rent.- PEDRO PEREZ. Share...,JUST GO THROUGH THE MANY REVIEWS I GOT THROUGH...,none,Solo decir que a menudo ni salgo del barrio. M...,...,t,f,strict_14_with_grace_period,t,t,1,0,1,0,2.22


In [21]:
# share of missing values in each column

def missingness(df):
    """Return the percentage of missing values per column, largest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are inspected with ``isna``.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is the number of missing cells
        in that column divided by ``len(df)``, times 100, ordered from the
        most-missing column to the least.
    """
    null_counts = df.isna().sum()
    ranked_counts = null_counts.sort_values(ascending=False)
    return ranked_counts / len(df) * 100

# display the per-column missing-value percentages, highest first
missingness(listings)

host_acceptance_rate                            100.000000
xl_picture_url                                  100.000000
medium_url                                      100.000000
jurisdiction_names                              100.000000
thumbnail_url                                   100.000000
square_feet                                      97.357939
weekly_price                                     93.409973
monthly_price                                    92.633490
notes                                            52.170625
access                                           46.029345
interaction                                      43.316694
transit                                          39.701508
license                                          38.456109
host_about                                       35.909847
neighborhood_overview                            34.256038
house_rules                                      32.597187
security_deposit                                 28.8962

In [22]:
# drop the columns that the missingness table shows as (almost) entirely empty

listings = listings.drop(
    [
        'host_acceptance_rate',
        'xl_picture_url',
        'medium_url',
        'jurisdiction_names',
        'thumbnail_url',
        'square_feet',
    ],
    axis='columns',
)

In [23]:
# find pairs of columns that hold (nearly) the same values

def repeating_cols(df):
    """Score every pair of columns by how often their values are equal.

    Each column is compared element-wise (``==``) with every column that
    comes after it in ``df.columns``. Rows where either side is missing
    never count as equal, because ``NaN == NaN`` is False.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are compared pairwise. Column labels are joined
        with ``' - '``, so they must be strings.

    Returns
    -------
    list of [float, str]
        One ``[match_percentage, 'left - right']`` entry per column pair,
        sorted in descending order (ties broken by the label string).
    """
    names = list(df.columns)
    n_rows = len(df)

    # Walk the upper triangle of the column x column grid so that every
    # unordered pair is scored exactly once.
    scored_pairs = [
        [(df[left] == df[right]).sum() / n_rows * 100, left + ' - ' + right]
        for pos, left in enumerate(names)
        for right in names[pos + 1:]
    ]

    return sorted(scored_pairs, reverse=True)

# display all column pairs ranked by the percentage of rows where they agree
repeating_cols(listings)

[[100.0, 'last_scraped - calendar_last_scraped'],
 [100.0, 'has_availability - requires_license'],
 [99.82856854737054, 'host_listings_count - host_total_listings_count'],
 [99.61680028235769, 'host_has_profile_pic - requires_license'],
 [99.61680028235769, 'host_has_profile_pic - has_availability'],
 [98.63359048051228,
  'is_business_travel_ready - require_guest_profile_picture'],
 [98.44703272323905, 'maximum_nights - maximum_maximum_nights'],
 [98.19996974739071, 'minimum_maximum_nights - maximum_nights_avg_ntm'],
 [98.16467503655524, 'minimum_maximum_nights - maximum_maximum_nights'],
 [98.16467503655524, 'maximum_maximum_nights - maximum_nights_avg_ntm'],
 [97.53441234306459, 'maximum_nights - minimum_maximum_nights'],
 [97.51424393687289,
  'require_guest_profile_picture - require_guest_phone_verification'],
 [97.23188625018908, 'maximum_nights - maximum_nights_avg_ntm'],
 [96.86885493873847, 'city - market'],
 [96.51086572883578,
  'is_business_travel_ready - require_guest_phon

In [24]:
# drop columns that duplicate (or nearly duplicate) another column, based on
# the top-ranked pairs reported by repeating_cols above

listings = listings.drop(columns = ['calendar_last_scraped', 'requires_license', 'host_total_listings_count', 'has_availability'])

# re-rank the remaining column pairs to confirm the duplicates are gone
repeating_cols(listings)

[[98.63359048051228,
  'is_business_travel_ready - require_guest_profile_picture'],
 [98.44703272323905, 'maximum_nights - maximum_maximum_nights'],
 [98.19996974739071, 'minimum_maximum_nights - maximum_nights_avg_ntm'],
 [98.16467503655524, 'minimum_maximum_nights - maximum_maximum_nights'],
 [98.16467503655524, 'maximum_maximum_nights - maximum_nights_avg_ntm'],
 [97.53441234306459, 'maximum_nights - minimum_maximum_nights'],
 [97.51424393687289,
  'require_guest_profile_picture - require_guest_phone_verification'],
 [97.23188625018908, 'maximum_nights - maximum_nights_avg_ntm'],
 [96.86885493873847, 'city - market'],
 [96.51086572883578,
  'is_business_travel_ready - require_guest_phone_verification'],
 [87.49558816114556, 'minimum_nights - minimum_minimum_nights'],
 [81.16270861695153, 'host_is_superhost - is_business_travel_ready'],
 [80.93581404729491, 'host_is_superhost - require_guest_profile_picture'],
 [79.86688851913478, 'host_is_superhost - require_guest_phone_verification

In [25]:
# Write the cleaned table to CSV; the row index is written as the first column.
# NOTE(review): the JSON export in the next cell omits the index — confirm that
# keeping it here is intentional.
listings.to_csv(path_or_buf='listings-clean.csv', index=True)

In [26]:
# Write the cleaned table to JSON in 'split' orientation, without the row index.
listings.to_json(path_or_buf='listings.json', orient='split', index=False)