# Loading Data

Load modules:

In [7]:
import pandas as pd
import numpy as np

In [8]:
from pprint import pprint as pp

Load "Paris-listings.csv":

In [9]:
# Read the raw Paris listings CSV into a DataFrame (relative path, resolved from the working directory).
listings_df = pd.read_csv("datasets/Paris/Paris-listings.csv")

In [11]:
# Columns removed up front, before any cleaning is done.
cols_i_dont_care = [
    'calculated_host_listings_count', 'calendar_last_scraped', 'calendar_updated',
    'country', 'country_code', 'description', 'has_availability', 'host_about',
    'host_has_profile_pic', 'host_id', 'host_location', 'host_name',
    'host_neighbourhood', 'host_picture_url', 'host_response_time',
    'host_thumbnail_url', 'host_url', 'jurisdiction_names', 'last_scraped',
    'last_searched', 'license', 'listing_url', 'market', 'medium_url', 'name',
    'neighborhood_overview', 'neighbourhood_cleansed', 'notes', 'picture_url',
    'region_id', 'region_name', 'region_parent_id', 'scrape_id', 'smart_location',
    'space', 'state', 'street', 'summary', 'thumbnail_url', 'transit',
    'xl_picture_url', 'zipcode',
]
# errors='ignore': a listed column that is absent from the frame is skipped
# instead of raising, so the cell can be re-run after the drop has happened.
listings_df.drop(labels=cols_i_dont_care, axis=1, errors='ignore', inplace=True)

In [12]:
# Preview the first rows of the frame after the column drop.
listings_df.head()

Unnamed: 0,id,experiences_offered,access,interaction,house_rules,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,reviews_per_month
0,3508970,none,Petite précision : 5ème étage sans ascenseur m...,,,2014-07-05,70%,,f,1.0,...,9.0,9.0,10.0,9.0,f,f,moderate,f,f,0.46
1,13222966,none,,,- Respect et propreté :),2014-02-12,,,f,1.0,...,10.0,10.0,9.0,10.0,f,f,flexible,f,f,0.3
2,7337128,none,,,,2013-05-15,100%,,f,1.0,...,10.0,10.0,10.0,8.0,f,f,strict,f,f,0.58
3,5764597,none,Vous aurez accès à l'ensemble du studio et bal...,,,2014-04-25,100%,,f,1.0,...,10.0,10.0,9.0,9.0,f,f,flexible,f,f,1.21
4,7861852,none,"All equipments can be used by our tenants (TV,...","We'll be glad to help you to discover Paris, a...",,2015-08-14,,,f,1.0,...,8.0,4.0,8.0,6.0,f,f,flexible,f,f,0.06


In [13]:
# Pretty-print the names of the columns that are left.
pp(listings_df.columns.tolist())

['id',
 'experiences_offered',
 'access',
 'interaction',
 'house_rules',
 'host_since',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_identity_verified',
 'neighbourhood',
 'neighbourhood_group_cleansed',
 'city',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'square_feet',
 'price',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'first_review',
 'last_review',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'requires_lice

# Data Cleaning

## Trying to turn list of amenities into 0-1 vectors 

We are going to work on this column:

In [14]:
# Pull out the raw `amenities` column as a Series and preview it.
amenities_col = listings_df['amenities']
amenities_col.head()

0    {TV,"Cable TV",Internet,"Wireless Internet",Ki...
1    {TV,"Wireless Internet",Kitchen,Heating,"Smoke...
2    {Internet,Kitchen,"Smoking allowed","Buzzer/wi...
3    {TV,Internet,"Wireless Internet","Wheelchair a...
4    {TV,Internet,"Wireless Internet",Kitchen,"Smok...
Name: amenities, dtype: object

### Using the CSV Method

Each entry is a string that looks like a single CSV row wrapped in braces:

In [15]:
# Take the second listing's amenities and cut off the first and last
# character (the surrounding braces).
string = amenities_col[1][1:-1]
# Parenthesised print: same output on Python 2, and valid syntax on Python 3.
print(string)

TV,"Wireless Internet",Kitchen,Heating,"Smoke detector","Safety card",Essentials,"translation missing: en.hosting_amenity_49","translation missing: en.hosting_amenity_50"


Load csv module then:

In [16]:
import csv

In [17]:
# Parse the single sample string as one CSV row and show the fields it
# splits into, separated by ' | '.
for row in csv.reader([string]):
    # Parenthesised print: same output on Python 2, and valid syntax on Python 3.
    print(' | '.join(row))

TV | Wireless Internet | Kitchen | Heating | Smoke detector | Safety card | Essentials | translation missing: en.hosting_amenity_49 | translation missing: en.hosting_amenity_50


It works! Now let's do it on all rows:

In [18]:
# first, get rid of all the "{}"s:
# Strip the braces row by row rather than slicing every row on the strength
# of a check of row 0 alone. Rows that carry no braces are left untouched
# (so re-running the cell is still a no-op), and a missing value in row 0
# no longer raises.
amenities_col = amenities_col.str.strip('{}')

In [19]:
# One parsed list of amenity names per listing; csv.reader takes care of the
# quoted multi-word entries.
amenities_lists = list(csv.reader(amenities_col))

In [20]:
# Show only the first few parsed rows; displaying the whole list would write
# every listing's amenities into the notebook output.
amenities_lists[:5]

[['TV',
  'Cable TV',
  'Internet',
  'Wireless Internet',
  'Kitchen',
  'Buzzer/wireless intercom',
  'Heating',
  'Family/kid friendly',
  'Washer',
  'Dryer',
  'Essentials',
  'Laptop friendly workspace'],
 ['TV',
  'Wireless Internet',
  'Kitchen',
  'Heating',
  'Smoke detector',
  'Safety card',
  'Essentials',
  'translation missing: en.hosting_amenity_49',
  'translation missing: en.hosting_amenity_50'],
 ['Internet',
  'Kitchen',
  'Smoking allowed',
  'Buzzer/wireless intercom',
  'Heating',
  'Family/kid friendly',
  'Washer',
  'Smoke detector',
  'Essentials',
  'translation missing: en.hosting_amenity_49'],
 ['TV',
  'Internet',
  'Wireless Internet',
  'Wheelchair accessible',
  'Kitchen',
  'Smoking allowed',
  'Elevator in building',
  'Heating',
  'Family/kid friendly',
  'Washer',
  'Smoke detector',
  'Essentials',
  'Shampoo'],
 ['TV',
  'Internet',
  'Wireless Internet',
  'Kitchen',
  'Smoking allowed',
  'Heating',
  'Washer',
  'Essentials'],
 ['TV',
  'Wirel

### Using the CountVectorizer

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# Each amenity is one token: the pattern captures every comma-free run, so
# multi-word amenities such as "Wireless Internet" stay in one piece.
# binary=True caps every entry at 1, which yields the 0-1 indicator vectors
# this section is after even if a listing repeats an amenity.
# A plain u'' literal replaces ur'' (Python-2-only syntax); the pattern has no
# backslashes, so both spell exactly the same string.
vectorizer = CountVectorizer(token_pattern = u'(?!,|$)(.+?)(?=,|$)', binary=True)

In [23]:
# first, get rid of all the '"'s, so that the quotes wrapping multi-word
# amenities do not end up inside the vectorizer's tokens:
amenities_col = amenities_col.str.replace('"', '')

In [24]:
# Preview the column now that braces and quotes are gone.
amenities_col.head()

0    TV,Cable TV,Internet,Wireless Internet,Kitchen...
1    TV,Wireless Internet,Kitchen,Heating,Smoke det...
2    Internet,Kitchen,Smoking allowed,Buzzer/wirele...
3    TV,Internet,Wireless Internet,Wheelchair acces...
4    TV,Internet,Wireless Internet,Kitchen,Smoking ...
Name: amenities, dtype: object

Now actually transform the data:

In [25]:
# Learn the amenity vocabulary and build the listing-by-amenity matrix in one call.
X = vectorizer.fit_transform(amenities_col)

In [26]:
# Amenity names learned by the vectorizer; used further down as column labels.
# NOTE(review): newer scikit-learn releases expose this as
# `get_feature_names_out()` — confirm against the installed version.
labels = vectorizer.get_feature_names()

Check that the inverse transform works:

In [27]:
# Sanity check on the first three listings only. Slicing the matrix before
# inverting gives the same three arrays without decoding every row first.
vectorizer.inverse_transform(X[:3])

[array([u'laptop friendly workspace', u'essentials', u'dryer', u'washer',
        u'family/kid friendly', u'heating', u'buzzer/wireless intercom',
        u'kitchen', u'wireless internet', u'internet', u'cable tv', u'tv'], 
       dtype='<U42'), array([u'translation missing: en.hosting_amenity_50',
        u'translation missing: en.hosting_amenity_49', u'safety card',
        u'smoke detector', u'essentials', u'heating', u'kitchen',
        u'wireless internet', u'tv'], 
       dtype='<U42'), array([u'smoking allowed', u'translation missing: en.hosting_amenity_49',
        u'smoke detector', u'essentials', u'washer', u'family/kid friendly',
        u'heating', u'buzzer/wireless intercom', u'kitchen', u'internet'], 
       dtype='<U42')]

Great. We can save the data now:

In [28]:
# from scipy.sparse import save_npz
# save_npz('datasets/Paris/Paris-listings-amenities.npz', X)

In [29]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
X

<56535x69 sparse matrix of type '<type 'numpy.int64'>'
	with 742084 stored elements in Compressed Sparse Row format>

In [30]:
#!pip install --upgrade scipy

Data in this sparse matrix look like:

In [31]:
# Materialise the sparse matrix as a dense one so it can be wrapped in a
# DataFrame further down; every zero is stored explicitly from here on.
vectors = X.todense()
vectors

matrix([[0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 1]])

In [32]:
# The raw text column is no longer needed once it has been vectorised.
# errors='ignore' keeps the cell re-runnable: a second run finds the column
# already gone and does nothing instead of raising KeyError.
listings_df = listings_df.drop('amenities', axis=1, errors='ignore')

In [33]:
# Wrap the dense amenity matrix in a DataFrame whose columns are the amenity
# names, each prefixed with 'AMN_'.
vectors_df = pd.DataFrame(vectors, columns=labels).add_prefix('AMN_')

In [34]:
# Attach the amenity indicator columns to the right of the listing columns.
# axis=1 lines the two frames up by index; neither frame was given an
# explicit index when it was created.
listings_AMN_df = pd.concat([listings_df, vectors_df], axis=1)

In [35]:
# listings_AMN_df.to_csv('datasets/Paris/Paris-listings-with-amenities.csv')

In [36]:
# Display the combined frame (listing columns followed by the AMN_ columns).
listings_AMN_df

Unnamed: 0,id,experiences_offered,access,interaction,house_rules,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,AMN_suitable for events,AMN_table corner guards,AMN_translation missing: en.hosting_amenity_49,AMN_translation missing: en.hosting_amenity_50,AMN_tv,AMN_washer,AMN_washer / dryer,AMN_wheelchair accessible,AMN_window guards,AMN_wireless internet
0,3508970,none,Petite précision : 5ème étage sans ascenseur m...,,,2014-07-05,70%,,f,1.0,...,0,0,0,0,1,1,0,0,0,1
1,13222966,none,,,- Respect et propreté :),2014-02-12,,,f,1.0,...,0,0,1,1,1,0,0,0,0,1
2,7337128,none,,,,2013-05-15,100%,,f,1.0,...,0,0,1,0,0,1,0,0,0,0
3,5764597,none,Vous aurez accès à l'ensemble du studio et bal...,,,2014-04-25,100%,,f,1.0,...,0,0,0,0,1,1,0,1,0,1
4,7861852,none,"All equipments can be used by our tenants (TV,...","We'll be glad to help you to discover Paris, a...",,2015-08-14,,,f,1.0,...,0,0,0,0,1,1,0,0,0,1
5,14272211,none,,,Smoking is allowed only on the window in the l...,2013-11-03,100%,,f,2.0,...,0,0,1,1,1,1,0,0,0,1
6,12830452,none,Vous trouverez tout le confort nécessaire pour...,Les clés vous seront remises par la gardienne ...,De préférence non-fumeur car un bébé vit habit...,2013-01-12,,,f,1.0,...,0,0,0,0,1,1,0,0,0,1
7,4585376,none,,,No Smoking. No pets.,2014-06-24,100%,,f,1.0,...,0,0,1,1,1,1,0,0,0,1
8,11139002,none,,,,2014-09-02,100%,,f,1.0,...,0,0,0,0,1,0,0,0,0,1
9,5765171,none,accès à toutes les pièces et équipements,,animaux non autorisés appartement non-fumeur (...,2014-08-06,100%,,f,1.0,...,0,0,1,0,1,1,0,0,0,1


## Cancellation Policy

Load data:

In [37]:
# Despite the `_df` suffix this selects a single column, i.e. a Series.
cancel_df = listings_df['cancellation_policy']

Get all unique possible policy names:

In [48]:
# Distinct policy names, in the order unique() returns them.
policies = cancel_df.unique().tolist()
# Fail early if the data contains a policy name that is not handled below.
assert set(policies).issubset(('moderate', 'strict', 'flexible', 'super_strict_30', 'super_strict_60'))

In [49]:
# Show the policy names that were found.
policies

['moderate', 'flexible', 'strict', 'super_strict_60', 'super_strict_30']

Create a mapping from policy name to an integer (manually assigned):

In [50]:
# Manually assigned score for each cancellation policy, keyed by name.
# The mapping is spelled out instead of zipping the scores against the list of
# unique values: that list's order depends on the data, so on another dataset
# the same positional zip could attach the scores to different policies or
# drop some of them without any error.
# NOTE(review): the scores are not monotonic in strictness (strict=1,
# moderate=2, flexible=3, but the super_strict policies get 4 and 5) — confirm
# this is intended before treating the column as an ordinal scale.
policy_converter = {
    'moderate': 2,
    'flexible': 3,
    'strict': 1,
    'super_strict_60': 4,
    'super_strict_30': 5,
}
policy_converter

{'flexible': 3,
 'moderate': 2,
 'strict': 1,
 'super_strict_30': 5,
 'super_strict_60': 4}

In [51]:
# Dry run on the single column: preview the first values after the policy
# names have been replaced by their integers. Nothing is stored.
cancel_df.replace(policy_converter).head()

0    2
1    3
2    1
3    3
4    3
Name: cancellation_policy, dtype: int64

This works. Now let's apply this to our fullest dataframe:

In [52]:
# Apply the name -> integer mapping to the 'cancellation_policy' column only;
# the nested dict restricts the replacement to that column and a new frame is
# returned.
listings_AMN_CCL_df = listings_AMN_df.replace(
    to_replace={'cancellation_policy': policy_converter}
)

Save:

In [53]:
# listings_AMN_CCL_df.to_csv('datasets/Paris/Paris-listings-with-amenities-cancelpolicy.csv')

## Typed Features

First of all:

In [54]:
# Working name for the most recent frame. This binds a second name to the
# same object; it does not make a copy.
listings_latest_df = listings_AMN_CCL_df

For columns whose possible values are really comparable, we can convert them into a scale:

In [55]:
col_name = 'room_type' # configure this!
# Explicit value -> score mapping, keyed by name so that it does not depend on
# the order in which the values happen to appear in the data.
converter = {'Entire home/apt': 3, 'Private room': 2, 'Shared room': 1} # modify this!
possible_types = listings_latest_df[col_name].unique().tolist()
# Parenthesised print: same output on Python 2, and valid syntax on Python 3.
print(possible_types)
# Every value must be either a known name or one of the scores (the latter is
# what the column holds if this cell is run a second time).
already_converted = list(converter.values())
unexpected = [value for value in possible_types
              if value not in converter and value not in already_converted]
assert not unexpected, unexpected
listings_latest_df = listings_latest_df.replace({col_name: converter})
pp(converter)

['Entire home/apt', 'Private room', 'Shared room']
{'Entire home/apt': 3, 'Private room': 2, 'Shared room': 1}


For those not really so, we have to expand those columns:

In [56]:
from sklearn.preprocessing import LabelBinarizer

In [57]:
col_name           = 'bed_type' # configure this!
binarizer          = LabelBinarizer()
# Test for the column directly instead of catching KeyError around the whole
# fit, so an unrelated KeyError is not misreported as "already expanded".
if col_name not in listings_latest_df.columns:
    print('Error: Column already expanded. Skipping.')
else:
    # NOTE: this rebinds X, which earlier in the notebook held the sparse
    # amenities matrix.
    X                  = binarizer.fit_transform(listings_latest_df[col_name])
    col_labels         = ['BED='+i for i in binarizer.classes_]
    X_df               = pd.DataFrame(X, columns=col_labels)
    # Swap the original column for its indicator columns in one step, without
    # mutating the current frame in place.
    listings_latest_df = pd.concat([listings_latest_df.drop(col_name, axis=1), X_df], axis=1)
    # One formatted string, so the line prints identically on Python 2 and 3.
    print('Expanded %s into %s .' % (col_name, col_labels))

Expanded bed_type into ['BED=Airbed', 'BED=Couch', 'BED=Futon', 'BED=Pull-out Sofa', 'BED=Real Bed'] .


Funny that listings providing something other than real beds are soooo rare:

In [58]:
# Rows whose 'BED=Real Bed' indicator is 0, i.e. any other bed type.
X_df[X_df['BED=Real Bed']==0]

Unnamed: 0,BED=Airbed,BED=Couch,BED=Futon,BED=Pull-out Sofa,BED=Real Bed
3,0,1,0,0,0
8,0,1,0,0,0
10,0,0,0,1,0
18,0,0,0,1,0
19,0,0,0,1,0
28,0,0,0,1,0
32,0,1,0,0,0
49,0,0,0,1,0
51,0,0,0,1,0
59,0,1,0,0,0


In [59]:
col_name           = 'property_type' # configure this!
binarizer          = LabelBinarizer()
# Test for the column directly instead of catching KeyError around the whole
# fit, so an unrelated KeyError is not misreported as "already expanded".
if col_name not in listings_latest_df.columns:
    print('Error: Column already expanded. Skipping.')
else:
    # NOTE: X and X_df are rebound here; their previous contents are lost.
    X                  = binarizer.fit_transform(listings_latest_df[col_name])
    col_labels         = ['POPTY='+i for i in binarizer.classes_]
    X_df               = pd.DataFrame(X, columns=col_labels)
    # Swap the original column for its indicator columns in one step, without
    # mutating the current frame in place.
    listings_latest_df = pd.concat([listings_latest_df.drop(col_name, axis=1), X_df], axis=1)
    # One formatted string, so the line prints identically on Python 2 and 3.
    print('Expanded %s into %s .' % (col_name, col_labels))

Expanded property_type into ['POPTY=Apartment', 'POPTY=Bed & Breakfast', 'POPTY=Boat', 'POPTY=Boutique hotel', 'POPTY=Bungalow', 'POPTY=Cabin', 'POPTY=Camper/RV', 'POPTY=Cave', 'POPTY=Chalet', 'POPTY=Condominium', 'POPTY=Dorm', 'POPTY=Earth House', 'POPTY=Guesthouse', 'POPTY=Hostel', 'POPTY=House', 'POPTY=Igloo', 'POPTY=Loft', 'POPTY=Other', 'POPTY=Serviced apartment', 'POPTY=Timeshare', 'POPTY=Tipi', 'POPTY=Townhouse', 'POPTY=Treehouse', 'POPTY=Villa'] .


Save to file:

In [61]:
# Write the processed frame to disk as UTF-8. No `index` argument is passed,
# so the row index is written as well.
listings_latest_df.to_csv('datasets/Paris/Paris-processed.csv', encoding='utf-8')

### Todo
These are the columns that still need attention:

In [62]:
# List position and name of every column that is not an expanded indicator
# column (AMN / POPTY= / BED= prefix) and not one of the two excluded names.
for ii, i in enumerate(listings_latest_df.columns):
    if i.startswith(('AMN', 'POPTY=', 'BED=')) or i in ('room_type', 'id'):
        continue
    # One formatted string, so the line prints identically on Python 2 and 3.
    print('%d %s' % (ii, i))

1 experiences_offered
2 access
3 interaction
4 house_rules
5 host_since
6 host_response_rate
7 host_acceptance_rate
8 host_is_superhost
9 host_listings_count
10 host_total_listings_count
11 host_verifications
12 host_identity_verified
13 neighbourhood
14 neighbourhood_group_cleansed
15 city
16 latitude
17 longitude
18 is_location_exact
20 accommodates
21 bathrooms
22 bedrooms
23 beds
24 square_feet
25 price
26 weekly_price
27 monthly_price
28 security_deposit
29 cleaning_fee
30 guests_included
31 extra_people
32 minimum_nights
33 maximum_nights
34 availability_30
35 availability_60
36 availability_90
37 availability_365
38 number_of_reviews
39 first_review
40 last_review
41 review_scores_rating
42 review_scores_accuracy
43 review_scores_cleanliness
44 review_scores_checkin
45 review_scores_communication
46 review_scores_location
47 review_scores_value
48 requires_license
49 instant_bookable
50 cancellation_policy
51 require_guest_profile_picture
52 require_guest_phone_verification
53 r