## Tips and tricks

### Filter before merge

In [2]:
import pandas as pd
import numpy as np
# Load the two review CSV files from the local data/ directory.
# NOTE(review): `reviews` is not referenced again in the visible cells — confirm it is still needed.
reviews = pd.read_csv('data/reviews.csv')
reviews_summary = pd.read_csv('data/reviews_summary.csv')

In [3]:
# Load the listings table; the cells below rebind and extend this `listings` frame.
listings = pd.read_csv('data/listings.csv')

In [4]:
# Rename the key column so it matches the `listing_id` merge key used in the cells below.
listings = listings.rename({'id': 'listing_id'}, axis='columns')

In [5]:
# Continue with a 20% random sample of the listings, drawn with a fixed seed.
# NOTE: this rebinds `listings`, so re-running the cell samples 20% of the already-sampled frame.
listings = listings.sample(frac=0.2, random_state=1337)

In [5]:
%%timeit
# Baseline: merge the sampled listings against the complete reviews_summary table.
listings.merge(reviews_summary, on='listing_id')

86.6 ms ± 805 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%%timeit
# Keep only the summary rows whose listing_id occurs in the sampled listings, then merge.
# The filtering step is part of the timed code.
reviews_subset = reviews_summary[reviews_summary['listing_id'].isin(listings['listing_id'])]
listings.merge(reviews_subset, on='listing_id')

48.7 ms ± 618 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Vectorize operations

### Normalize price between 0 and 1

In [30]:
# Overwrite zero prices with 30 before the price bounds are computed.
# NOTE(review): the notebook does not say why 30 was chosen — confirm.
listings.loc[listings['price'].eq(0), 'price'] = 30

In [31]:
# Global price bounds; every normalization variant timed below reads these two names.
min_prize, max_prize = listings['price'].min(), listings['price'].max()

In [33]:
# Display the price bounds (max first, then min) as a quick sanity check.
max_prize, min_prize

(8600, 9)

In [40]:
%%timeit
# Row-by-row variant: each price is fetched with a positional .iloc lookup and scaled
# individually. This is the slowest of the three timed variants.
norm_prizes = np.zeros(len(listings,))
for i in range(len(listings)):
    # Min-max scaling with the global bounds min_prize / max_prize.
    norm_prizes[i] = (listings.iloc[i]['price'] - min_prize) / (max_prize - min_prize)
listings['norm_price'] = norm_prizes

1.08 s ± 13.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [43]:
%%timeit 
# Element-wise variant: Series.map calls the Python lambda once per price.
listings['norm_price'] = listings['price'].map(lambda x: (x - min_prize) / (max_prize - min_prize))

3.85 ms ± 12.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [41]:
%%timeit
# Vectorized variant: the arithmetic is applied to the whole price column at once.
listings['norm_price'] = (listings['price'] - min_prize) / (max_prize - min_prize)

565 µs ± 6.04 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Normalize price per neighbourhood group

In [62]:
# Highest price per neighbourhood group, as a two-column frame
# (neighbourhood_group, max_price) ready to be merged back onto the listings.
# Only `price` is aggregated: taking the max of every column just to keep one of them
# is wasted work and may raise on text columns that mix strings with missing values.
grouped_prices_max = (
    listings
    .groupby('neighbourhood_group', as_index=False)[['price']]
    .max()
    .rename(columns={'price': 'max_price'})
)

In [63]:
# Lowest price per neighbourhood group, as a two-column frame
# (neighbourhood_group, min_price) ready to be merged back onto the listings.
# Only `price` is aggregated: taking the min of every column just to keep one of them
# is wasted work and may raise on text columns that mix strings with missing values.
grouped_prices_min = (
    listings
    .groupby('neighbourhood_group', as_index=False)[['price']]
    .min()
    .rename(columns={'price': 'min_price'})
)

In [64]:
# Attach each listing's group-level max and min price by joining on the group name.
listings_groups = (
    listings
    .merge(grouped_prices_max, on='neighbourhood_group')
    .merge(grouped_prices_min, on='neighbourhood_group')
)

In [65]:
# Min-max scale every price within its own neighbourhood group:
# (price - group min) / (group max - group min).
listings_groups['norm_price'] = (
    listings_groups['price']
    .sub(listings_groups['min_price'])
    .div(listings_groups['max_price'].sub(listings_groups['min_price']))
)

## Using NumPy

Let's say we want to do more advanced calculations for a subset of our data. In that case it can be faster to convert the data to NumPy arrays and loop through them.

In [16]:
# Weight applied to a listing's score depending on its room type.
room_type_scores = {
    'Entire home/apt': 1,
    'Private room': 0.7,
    'Shared room': 0.2,
}

In [36]:
%%timeit
# NumPy-loop variant: pull each needed column out as a plain array once, then score the
# listings in an ordinary Python loop over those arrays.
prizes = listings['price'].values
nr_reviews = listings['number_of_reviews'].values
availability = listings['availability_365'].values
minimum_nights = listings['minimum_nights'].values
room_types = listings['room_type'].values
scores = np.zeros(len(prizes))
for i in range(len(prizes)):
    # Disqualifying conditions: any one of them gives the listing a score of 0.
    if availability[i] == 0:
        scores[i] = 0
    elif prizes[i] > 100:
        scores[i] = 0
    elif minimum_nights[i] > 3:
        scores[i] = 0
    elif nr_reviews[i] < 10:
        scores[i] = 0
    else:
        # Score = room-type weight * price score * review weight.
        room_type_score = room_type_scores[room_types[i]]
        prize_score = (100 - prizes[i]) / 100
        review_score = 1 if nr_reviews[i] > 50 else 0.5
        scores[i] = room_type_score * prize_score * review_score
listings['score'] = scores


6.11 ms ± 79.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [42]:
%%timeit
scores = np.zeros(len(prizes))
for i in range(len(prizes)):
    row = listings.iloc[i]
    if row['availability_365'] == 0:
        scores[i] = 0
    elif row['price'] > 100:
        scores[i] = 0
    elif row['minimum_nights'] > 3:
        scores[i] = 0
    elif row['number_of_reviews'] < 10:
        scores[i] = 0
    else:
        room_type_score = room_type_scores[row['room_type']]
        prize_score = (100 - row['price']) / 100
        review_score = 1 if row['number_of_reviews'] > 50 else 0.5
        scores[i] = room_type_score * prize_score * review_score
listings['score'] = scores


1.19 s ± 18.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
%%timeit 
# Column-wise variant: compute the three factors as helper columns, multiply them, then
# zero out every listing that meets one of the disqualifying conditions.
listings['room_type_score'] = listings['room_type'].map(lambda x: room_type_scores[x])
listings['prize_score'] = (100 - listings['price']) / 100
listings['review_score'] = listings['number_of_reviews'].map(lambda x: 1 if x > 50 else 0.5)
listings['score'] = listings['room_type_score'] * listings['prize_score'] * listings['review_score']
# The helper columns keep their values for disqualified rows; only `score` is reset to 0.
listings.loc[(listings['availability_365'] == 0) | (listings['price'] > 100) | (listings['minimum_nights'] > 3) | (listings['number_of_reviews'] < 10), 'score'] = 0

9.11 ms ± 84.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [46]:
def room_score(availability, price, min_nights, nr_reviews, room_type):
    """Score a single listing; a score of 0 means the listing is disqualified.

    A listing scores 0 when its availability is 0, its price is above 100,
    its minimum stay is above 3 nights, or it has fewer than 10 reviews.
    Otherwise the score is the product of three factors:

    * the room-type weight looked up in the notebook-level ``room_type_scores``,
    * the price score ``(100 - price) / 100``,
    * the review weight: 1 for more than 50 reviews, otherwise 0.5.
    """
    disqualified = (
        availability == 0
        or price > 100
        or min_nights > 3
        or nr_reviews < 10
    )
    if disqualified:
        return 0

    room_type_score = room_type_scores[room_type]
    prize_score = (100 - price) / 100
    if nr_reviews > 50:
        review_score = 1
    else:
        review_score = 0.5
    return room_type_score * prize_score * review_score
    

In [47]:
%%timeit
# Row-wise apply variant: room_score is called once per row via DataFrame.apply(axis=1).
listings['score'] = listings.apply(lambda row: room_score(row['availability_365'], row['price'], row['minimum_nights'], row['number_of_reviews'], row['room_type']), axis=1)


217 ms ± 1.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [48]:
# Show only the ten best-scoring listings instead of rendering the whole frame.
listings.sort_values('score', ascending=False).head(10)

Unnamed: 0,listing_id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,score,room_type_score,prize_score,review_score
3342,1129012,Sunny apartment no.2 in Berlin city,915430,Andreas,Reinickendorf,Ost 1,52.561471,13.372684,Entire home/apt,18,2,186,2018-11-03,2.91,2,73,0.820,1.0,0.82,1.0
2241,22154155,Tiny House in Berlin-Weissensee,161792772,René,Pankow,Weißensee Ost,52.558733,13.475007,Entire home/apt,35,1,56,2018-10-30,7.74,1,205,0.650,1.0,0.65,1.0
4359,323260,Apartment in Berlin for 1-3 persons,1654885,Claudius,Neukölln,Neuköllner Mitte/Zentrum,52.483888,13.428707,Entire home/apt,36,3,197,2018-10-27,2.56,11,248,0.640,1.0,0.64,1.0
3812,18561176,Cozy Room in comfortable 80qm flat,59073111,Konstantin,Reinickendorf,West 4,52.565287,13.325170,Private room,10,1,82,2018-10-30,4.60,3,16,0.630,0.7,0.90,1.0
3681,18069548,Vacation Rental at Charlotte's in Berlin,91074978,Charlotte,Tempelhof - Schöneberg,Friedenau,52.473107,13.344343,Entire home/apt,38,2,53,2018-11-04,2.89,1,1,0.620,1.0,0.62,1.0
1834,12281381,Ideal Room for Couples Close to the City Center,49815862,Jacob,Lichtenberg,Alt-Hohenschönhausen Süd,52.535306,13.475582,Private room,14,3,153,2018-11-01,5.04,2,58,0.602,0.7,0.86,1.0
4412,10941636,Family-friendly + near to the party,56732023,Robert & Steffi,Friedrichshain-Kreuzberg,Frankfurter Allee Nord,52.518563,13.468847,Entire home/apt,40,1,125,2018-10-19,3.76,1,10,0.600,1.0,0.60,1.0
2743,11383656,Cozy studio in Prenzlauer Berg 2,59676944,Elena,Pankow,Prenzlauer Berg Süd,52.541630,13.425360,Entire home/apt,40,2,125,2018-10-21,3.97,2,121,0.600,1.0,0.60,1.0
794,11381321,Cozy studio in Prenzlauer Berg,59676944,Elena,Pankow,Helmholtzplatz,52.543037,13.423437,Entire home/apt,40,2,121,2018-10-23,3.75,2,136,0.600,1.0,0.60,1.0
1907,17688227,primeflats - City Apartment Prenzlauer Berg 7,1625771,Ben,Pankow,Prenzlauer Berg Nordwest,52.550908,13.404553,Entire home/apt,42,1,60,2018-11-02,3.14,45,121,0.580,1.0,0.58,1.0


In [11]:
# Non-null value count of every column, per room type.
listings.groupby('room_type').count()

Unnamed: 0_level_0,listing_id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
room_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Entire home/apt,2175,2171,2175,2171,2175,2175,2175,2175,2175,2175,2175,1815,1814,2175,2175
Private room,2281,2278,2281,2280,2281,2281,2281,2281,2281,2281,2281,1890,1888,2281,2281
Shared room,54,54,54,54,54,54,54,54,54,54,54,40,40,54,54


## Indexing

In [None]:
# TODO: placeholder cell — the Indexing section has no real content yet.
blabla = 0

## Memory efficiency