# Loading Data

Load modules:

In [1]:
import pandas as pd
import numpy as np

In [2]:
from pprint import pprint as pp

Load "Asheville-listings.csv":

In [3]:
listings_df = pd.read_csv("datasets/Asheville/Asheville-listings.csv")

In [4]:
listings_df.head()

Unnamed: 0,id,listing_url,scrape_id,last_searched,last_scraped,name,summary,space,description,experiences_offered,...,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,region_id,region_name,region_parent_id,calculated_host_listings_count,reviews_per_month
0,665257,https://www.airbnb.com/rooms/665257,20160417215614,2016-04-18,2016-04-22,"Lovely cabin, great Asheville spot","You will be staying in a lovely, bright 1-bedr...","You will be staying in a lovely, bright 1-bedr...","You will be staying in a lovely, bright 1-bedr...",none,...,"NORTH CAROLINA, NORTH CAROLINA, BUNCOMBE",f,moderate,f,f,21,Buncombe,37,,1.06
1,2746729,https://www.airbnb.com/rooms/2746729,20160417215614,2016-04-19,2016-04-22,Glamping w HOT TUB + AC!,"Our business is called ""Asheville Glamping"" Fo...",Please note that this tent does not retain hea...,"Our business is called ""Asheville Glamping"" Fo...",none,...,"NORTH CAROLINA, NORTH CAROLINA, BUNCOMBE",f,strict,f,f,21,Buncombe,37,,2.69
2,6919450,https://www.airbnb.com/rooms/6919450,20160417215614,2016-04-19,2016-04-22,"Good size bedroom, private",Asheville is a great place to visit. If you a...,,Asheville is a great place to visit. If you a...,none,...,"NORTH CAROLINA, NORTH CAROLINA, BUNCOMBE",f,flexible,f,f,21,Buncombe,37,,
3,12286328,https://www.airbnb.com/rooms/12286328,20160417215614,2016-04-18,2016-04-22,Mixed Dorm,Six bed mixed dorm in hostel. Easy walk to res...,,Six bed mixed dorm in hostel. Easy walk to res...,none,...,"NORTH CAROLINA, NORTH CAROLINA, BUNCOMBE",f,moderate,f,f,21,Buncombe,37,,
4,156926,https://www.airbnb.com/rooms/156926,20160417215614,2016-04-18,2016-04-22,Mixed Dorm Bunk at BPS Hostel,,Dorm bunks at BonPaul and Sharky's Hostel. We ...,Dorm bunks at BonPaul and Sharky's Hostel. We ...,none,...,"NORTH CAROLINA, NORTH CAROLINA, BUNCOMBE",f,moderate,f,f,21,Buncombe,37,,2.02


In [5]:
pp(list(listings_df.columns))

['id',
 'listing_url',
 'scrape_id',
 'last_searched',
 'last_scraped',
 'name',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'notes',
 'transit',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'country_code',
 'country',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'square_feet',
 'price',
 'weekly_price',
 'monthly_price

# Data Cleaning

## Trying to turn list of amenities into 0-1 vectors 

We gonna work on this column:

In [6]:
amenities_col = listings_df['amenities']
amenities_col.head()

0    {"Wireless Internet","Air Conditioning",Kitche...
1    {TV,"Cable TV",Internet,"Wireless Internet","A...
2    {TV,"Wireless Internet","Air Conditioning",Kit...
3    {Internet,"Wireless Internet","Air Conditionin...
4    {Internet,"Wireless Internet","Free Parking on...
Name: amenities, dtype: object

### Using the CSV Method

Data are of string type, looks like CSV string:

In [7]:
string = amenities_col[1][1:-1]
print string

TV,"Cable TV",Internet,"Wireless Internet","Air Conditioning",Kitchen,"Free Parking on Premises",Dog(s),"Hot Tub",Heating,"Family/Kid Friendly"


Load csv module then:

In [8]:
import csv

In [9]:
for row in csv.reader([string]):
    print ' | '.join(row)

TV | Cable TV | Internet | Wireless Internet | Air Conditioning | Kitchen | Free Parking on Premises | Dog(s) | Hot Tub | Heating | Family/Kid Friendly


It works! Now let's do it on all rows:

In [10]:
# first, get rid of all the "{}"s:
if amenities_col[0].startswith('{'):
    amenities_col = amenities_col.str.slice(1,-1)

In [11]:
amenities_lists = [row for row in csv.reader(amenities_col)]

In [12]:
amenities_lists

[['Wireless Internet',
  'Air Conditioning',
  'Kitchen',
  'Free Parking on Premises',
  'Pets Allowed',
  'Pets live on this property',
  'Other pet(s)',
  'Indoor Fireplace',
  'Heating'],
 ['TV',
  'Cable TV',
  'Internet',
  'Wireless Internet',
  'Air Conditioning',
  'Kitchen',
  'Free Parking on Premises',
  'Dog(s)',
  'Hot Tub',
  'Heating',
  'Family/Kid Friendly'],
 ['TV',
  'Wireless Internet',
  'Air Conditioning',
  'Kitchen',
  'Free Parking on Premises',
  'Pets Allowed',
  'Pets live on this property',
  'Heating',
  'Family/Kid Friendly',
  'Washer',
  'Dryer',
  'Smoke Detector',
  'Carbon Monoxide Detector',
  'Fire Extinguisher',
  'Essentials',
  'Shampoo'],
 ['Internet',
  'Wireless Internet',
  'Air Conditioning',
  'Kitchen',
  'Free Parking on Premises',
  'Breakfast',
  'Heating',
  'Smoke Detector',
  'First Aid Kit',
  'Fire Extinguisher',
  'translation missing: en.hosting_amenity_49',
  'translation missing: en.hosting_amenity_50'],
 ['Internet',
  'Wire

### Using the CountVectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
vectorizer = CountVectorizer(token_pattern = ur'(?!,|$)(.+?)(?=,|$)')

In [15]:
# first, get rid of all the '"'s:
amenities_col = amenities_col.str.replace('"', '')

In [16]:
amenities_col.head()

0    Wireless Internet,Air Conditioning,Kitchen,Fre...
1    TV,Cable TV,Internet,Wireless Internet,Air Con...
2    TV,Wireless Internet,Air Conditioning,Kitchen,...
3    Internet,Wireless Internet,Air Conditioning,Ki...
4    Internet,Wireless Internet,Free Parking on Pre...
Name: amenities, dtype: object

Now actually transform the data:

In [17]:
X = vectorizer.fit_transform(amenities_col)

In [18]:
labels = vectorizer.get_feature_names()

Check that the inverse should work:

In [19]:
vectorizer.inverse_transform(X)[:3]

[array([u'heating', u'indoor fireplace', u'other pet(s)',
        u'pets live on this property', u'pets allowed',
        u'free parking on premises', u'kitchen', u'air conditioning',
        u'wireless internet'],
       dtype='<U42'),
 array([u'family/kid friendly', u'hot tub', u'dog(s)', u'internet',
        u'cable tv', u'tv', u'heating', u'free parking on premises',
        u'kitchen', u'air conditioning', u'wireless internet'],
       dtype='<U42'),
 array([u'shampoo', u'essentials', u'fire extinguisher',
        u'carbon monoxide detector', u'smoke detector', u'dryer', u'washer',
        u'family/kid friendly', u'tv', u'heating',
        u'pets live on this property', u'pets allowed',
        u'free parking on premises', u'kitchen', u'air conditioning',
        u'wireless internet'],
       dtype='<U42')]

Great. We can save the data now:

In [20]:
from scipy.sparse import save_npz
save_npz('datasets/Asheville/Asheville-listings-amenities.npz', X)

In [21]:
X

<864x42 sparse matrix of type '<type 'numpy.int64'>'
	with 14285 stored elements in Compressed Sparse Row format>

In [22]:
#!pip install --upgrade scipy

Data in this sparse matrix look like:

In [25]:
vectors = X.todense()
vectors

matrix([[0, 1, 0, ..., 0, 0, 1],
        [0, 1, 0, ..., 0, 0, 1],
        [0, 1, 0, ..., 1, 0, 1],
        ..., 
        [0, 1, 0, ..., 0, 0, 1],
        [0, 1, 0, ..., 1, 0, 1],
        [0, 1, 0, ..., 0, 0, 1]])

In [31]:
listings_df.drop('amenities', axis=1, inplace=True)

In [36]:
vectors_df = pd.DataFrame(vectors, columns = ['AMN_'+label for label in labels])

In [39]:
listings_AMN_df = pd.concat([listings_df, vectors_df], axis=1)

In [40]:
listings_AMN_df.to_csv('datasets/Asheville/Asheville-listings-with-amenities.csv')

In [41]:
listings_AMN_df

Unnamed: 0,id,listing_url,scrape_id,last_searched,last_scraped,name,summary,space,description,experiences_offered,...,AMN_shampoo,AMN_smoke detector,AMN_smoking allowed,AMN_suitable for events,AMN_translation missing: en.hosting_amenity_49,AMN_translation missing: en.hosting_amenity_50,AMN_tv,AMN_washer,AMN_wheelchair accessible,AMN_wireless internet
0,665257,https://www.airbnb.com/rooms/665257,20160417215614,2016-04-18,2016-04-22,"Lovely cabin, great Asheville spot","You will be staying in a lovely, bright 1-bedr...","You will be staying in a lovely, bright 1-bedr...","You will be staying in a lovely, bright 1-bedr...",none,...,0,0,0,0,0,0,0,0,0,1
1,2746729,https://www.airbnb.com/rooms/2746729,20160417215614,2016-04-19,2016-04-22,Glamping w HOT TUB + AC!,"Our business is called ""Asheville Glamping"" Fo...",Please note that this tent does not retain hea...,"Our business is called ""Asheville Glamping"" Fo...",none,...,0,0,0,0,0,0,1,0,0,1
2,6919450,https://www.airbnb.com/rooms/6919450,20160417215614,2016-04-19,2016-04-22,"Good size bedroom, private",Asheville is a great place to visit. If you a...,,Asheville is a great place to visit. If you a...,none,...,1,1,0,0,0,0,1,1,0,1
3,12286328,https://www.airbnb.com/rooms/12286328,20160417215614,2016-04-18,2016-04-22,Mixed Dorm,Six bed mixed dorm in hostel. Easy walk to res...,,Six bed mixed dorm in hostel. Easy walk to res...,none,...,0,1,0,0,1,1,0,0,0,1
4,156926,https://www.airbnb.com/rooms/156926,20160417215614,2016-04-18,2016-04-22,Mixed Dorm Bunk at BPS Hostel,,Dorm bunks at BonPaul and Sharky's Hostel. We ...,Dorm bunks at BonPaul and Sharky's Hostel. We ...,none,...,0,0,1,1,0,0,0,0,0,1
5,3767793,https://www.airbnb.com/rooms/3767793,20160417215614,2016-04-18,2016-04-22,Modern Home - Views - HOT TUB,Don't miss out on this modern home with luxuri...,I will not accept reservations more than 30 da...,Don't miss out on this modern home with luxuri...,none,...,0,1,0,1,0,0,1,1,0,1
6,5927700,https://www.airbnb.com/rooms/5927700,20160417215614,2016-04-18,2016-04-22,Bright Suite with Sunroom,Urban farm/Private Suite in our home is 3min f...,Bedroom has a private outdoor entrance off the...,Urban farm/Private Suite in our home is 3min f...,none,...,0,1,0,0,0,0,0,0,0,1
7,6698737,https://www.airbnb.com/rooms/6698737,20160417215614,2016-04-18,2016-04-22,"Serene, spacious, cozy and clean!",Mountains! Music! Beer! Eats! Make yourself at...,"The lovely master bedroom is cozy, quiet and s...",Mountains! Music! Beer! Eats! Make yourself at...,none,...,1,1,0,0,0,0,0,1,0,1
8,7966916,https://www.airbnb.com/rooms/7966916,20160417215614,2016-04-18,2016-04-22,West Asheville top floor sleeps 3,Walk to restaurants and bars. Top floor of han...,Beautiful space with large windows throughout....,Walk to restaurants and bars. Top floor of han...,none,...,0,1,0,0,0,0,0,0,0,1
9,2254521,https://www.airbnb.com/rooms/2254521,20160417215614,2016-04-18,2016-04-21,The Mimosa Room,Enjoy a quaint Bungalow home in eclectic West ...,Enjoy a quaint Bungalow home in eclectic West ...,Enjoy a quaint Bungalow home in eclectic West ...,none,...,1,1,0,0,0,0,0,1,0,1


## Cancellation Policy

In [42]:
cancel_df = listings_df['cancellation_policy']

In [43]:
cancel_df.unique()

array(['moderate', 'strict', 'flexible', 'super_strict_30'], dtype=object)

In [None]:
pd.DataFrame([])