# Imports

In [2]:
#Enable matplotlib to display in jupyter notebook & import it
%matplotlib inline

import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import pandas as pd
import numpy as np
import re
from geopy.geocoders import Nominatim #used in filling missing zipcodes


# Reading In and Cleaning listings.csv

In [29]:
#listings.csv READING
#Loads only the columns needed for the analysis, renames two of them,
#and indexes the resulting frame by listing_id.

LISTINGS = 'data/listings.csv'

#Columns to pull from the csv; every other column is skipped at read time.
listings_cols = [
    'id',
    'host_id',
    'neighbourhood_cleansed',
    'zipcode',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'amenities',
    'price',
    'cleaning_fee',
    'number_of_reviews',
    'first_review',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'calculated_host_listings_count',
    'reviews_per_month',
]

#csv column name -> name used in this notebook
rename_dict = {
    'id': 'listing_id',
    'price': 'listed_price',
}

#Read, rename, then use listing_id as the index -- one chain, no in-place edits
listings = (
    pd.read_csv(LISTINGS, usecols=listings_cols)
    .rename(columns=rename_dict)
    .set_index('listing_id')
)



#############################
#         Cleaning          #
#############################

# 'zipcode' ##########
#Helper for the zipcode section of cleaning: reverse-geocodes coordinates to a zipcode
#Captures a 5-digit zipcode that is preceded by whitespace and followed by ", United"
_ZIP_IN_ADDRESS = re.compile(r'(\s)([0-9]{5})(,\sUnited)')

def latLonToZip(lat, lon, geolocator=None):
    '''Take in a latitude and longitude and return the zipcode for that location.

    Parameters
    ----------
    lat, lon :
        Coordinates of the location. They are converted with ``str`` and joined
        as "lat,lon" to form the reverse-geocoding query.
    geolocator : optional
        Object with a ``reverse(query)`` method whose result exposes the address
        string at index 0. When omitted a new ``Nominatim()`` is created; pass
        one in to reuse a single geocoder across many lookups.

    Returns
    -------
    str or float
        The 5-digit zipcode found in the returned address, or ``np.nan`` when
        the lookup raises, returns nothing, or the address contains no
        5-digit number directly before ", United".
    '''
    #Constructed outside the try so a misconfigured geocoder fails loudly
    #instead of being reported as a missing zipcode for every row
    if geolocator is None:
        geolocator = Nominatim()

    query = str(lat)+','+str(lon)
    #Bound before the try so the failure message below can always print it,
    #even when reverse() itself raises
    location = None
    try:
        location = geolocator.reverse(query)
        return _ZIP_IN_ADDRESS.findall(location[0])[0][1]
    except Exception:
        #Best effort: report the coordinates that could not be resolved and
        #let the caller keep the zipcode as missing
        print(query,'-----',location)
        return np.nan
#Find all listings that are missing a zipcode : missing_zipcodes
missing_zipcodes = listings[listings.zipcode.isnull()].copy()

#Fill missing zipcodes by reverse-geocoding each listing's latitude/longitude.
#Skipped when nothing is missing, because apply(axis=1) on an empty frame
#does not produce a Series of zipcodes.
if not missing_zipcodes.empty:
    found_zipcodes = missing_zipcodes.apply(lambda x: latLonToZip(x['latitude'], x['longitude']), axis=1).dropna()
    #Failed lookups (NaN) were dropped above so they never overwrite anything;
    #.loc writes into the frame itself rather than into a column view
    if not found_zipcodes.empty:
        listings.loc[found_zipcodes.index, 'zipcode'] = found_zipcodes

#Remove 'zip+4' part of any zipcode.
#Values that are still missing after geocoding (NaN) are left untouched
#instead of raising on the slice.
listings['zipcode'] = listings['zipcode'].apply(lambda x: x[:5] if isinstance(x, str) else x)

# 'price' --> 'listed_price' ##########
#Drop every character that is not a digit or '.', then parse as float
listings.listed_price = listings.listed_price.replace('[^0-9.]+','',regex=True).astype(float)

# 'cleaning_fee' ##########
#Same treatment as listed_price
listings.cleaning_fee = listings.cleaning_fee.replace('[^0-9.]+','',regex=True).astype(float)

# 'first_review' ##########
listings.first_review = pd.to_datetime(listings.first_review)

# 'amenities' ##########
#Keep only word characters, commas, whitespace and '/', then split on commas
#into a list. Non-string entries (e.g. NaN) are passed through unchanged.
listings.amenities = listings.amenities.replace(r'[^\w,\s/]+','',regex=True).apply(lambda x: x.split(',') if isinstance(x, str) else x)

In [30]:
#Report the Python type of the first non-null value in every column
for col in listings.columns:
    column = listings[col]
    first_value = column.loc[column.first_valid_index()]
    padding = '.'*(35 - len(column.name))
    print(column.name, padding, 'type=',type(first_value))

host_id ............................ type= <class 'numpy.int64'>
neighbourhood_cleansed ............. type= <class 'str'>
zipcode ............................ type= <class 'str'>
latitude ........................... type= <class 'numpy.float64'>
longitude .......................... type= <class 'numpy.float64'>
property_type ...................... type= <class 'str'>
room_type .......................... type= <class 'str'>
accommodates ....................... type= <class 'numpy.int64'>
bathrooms .......................... type= <class 'numpy.float64'>
amenities .......................... type= <class 'list'>
listed_price ....................... type= <class 'numpy.float64'>
cleaning_fee ....................... type= <class 'numpy.float64'>
number_of_reviews .................. type= <class 'numpy.int64'>
first_review ....................... type= <class 'pandas.tslib.Timestamp'>
review_scores_rating ............... type= <class 'numpy.float64'>
review_scores_accuracy ............. type

In [32]:
#Save the cleaned listings to a pickle because it preserves the index and the
#per-column value types (e.g. the list-valued 'amenities' and the Timestamp
#'first_review' columns)
listings.to_pickle('data/listings_cleaned.pkl')

In [33]:
#Reload the cleaned listings from data/listings_cleaned.pkl, replacing the in-memory frame.
#NOTE: only unpickle files you produced yourself -- loading a pickle can execute arbitrary code.
listings = pd.read_pickle('data/listings_cleaned.pkl')

In [34]:
#Print, for each column, the Python type of its first non-null value
for col in listings.columns:
    series = listings[col]
    label = series.name
    sample = series.loc[series.first_valid_index()]
    print(label, '.'*(35 - len(label)), 'type=',type(sample))

host_id ............................ type= <class 'numpy.int64'>
neighbourhood_cleansed ............. type= <class 'str'>
zipcode ............................ type= <class 'str'>
latitude ........................... type= <class 'numpy.float64'>
longitude .......................... type= <class 'numpy.float64'>
property_type ...................... type= <class 'str'>
room_type .......................... type= <class 'str'>
accommodates ....................... type= <class 'numpy.int64'>
bathrooms .......................... type= <class 'numpy.float64'>
amenities .......................... type= <class 'list'>
listed_price ....................... type= <class 'numpy.float64'>
cleaning_fee ....................... type= <class 'numpy.float64'>
number_of_reviews .................. type= <class 'numpy.int64'>
first_review ....................... type= <class 'pandas.tslib.Timestamp'>
review_scores_rating ............... type= <class 'numpy.float64'>
review_scores_accuracy ............. type