In [1]:
import datetime
import itertools
import json
import operator
import os
import pandas as pd
import pprint
import numpy as np
import re
import spacy
import torch

from collections import Counter, deque

from utils.data import load_training_test_data
from utils.dataframe import (
    categories_from_column,
    column_list_to_category_flags,
    count_ngrams_up_to_n,
    normalize_categories,
    remap_date_column_to_days_before,
    remap_to_float,
    remove_small_or_stopwords_from_ranking
)

# Load spaCy's English pipeline by its full package name. The "en" shortcut
# link only exists in spaCy 2.x; spaCy 3 removed shortcut links, so
# spacy.load("en") fails there, while the full package name loads under both.
# NOTE(review): assumes "en" was linked to the default small English model
# (en_core_web_sm) — confirm if a different model was linked locally.
nlp = spacy.load("en_core_web_sm")

Check GPU support

In [2]:
# Reports whether PyTorch can use CUDA in this environment; the bare
# expression is what the cell displays.
torch.cuda.is_available()

True

Load data

In [3]:
# Hand the train/test JSON paths (relative to the working directory) to the
# project loader and unpack its two results into the train and test frames.
TRAIN_DATAFRAME, TEST_DATAFRAME = load_training_test_data(
    os.path.join('data', 'train.json'),
    os.path.join('data', 'test.json'),
)

Let's see what this table looks like. We'll display the head of the table, which shows its columns and a few sample rows.

In [4]:
# With no argument, head() shows the first 5 rows (the pandas default).
TRAIN_DATAFRAME.head()

Unnamed: 0,id,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level
0,4,1.0,1,8579a0b0d54db803821a35a4a615e97a,2016-06-16 05:55:27,Spacious 1 Bedroom 1 Bathroom in Williamsburg!...,145 Borinquen Place,"[Dining Room, Pre-War, Laundry in Building, Di...",40.7108,7170325,-73.9539,a10db4590843d78c784171a107bdacb4,[https://photos.renthop.com/2/7170325_3bb5ac84...,2400,145 Borinquen Place,medium
1,6,1.0,2,b8e75fc949a6cd8225b455648a951712,2016-06-01 05:44:33,BRAND NEW GUT RENOVATED TRUE 2 BEDROOMFind you...,East 44th,"[Doorman, Elevator, Laundry in Building, Dishw...",40.7513,7092344,-73.9722,955db33477af4f40004820b4aed804a0,[https://photos.renthop.com/2/7092344_7663c19a...,3800,230 East 44th,low
2,9,1.0,2,cd759a988b8f23924b5a2058d5ab2b49,2016-06-14 15:19:59,**FLEX 2 BEDROOM WITH FULL PRESSURIZED WALL**L...,East 56th Street,"[Doorman, Elevator, Laundry in Building, Laund...",40.7575,7158677,-73.9625,c8b10a317b766204f08e613cef4ce7a0,[https://photos.renthop.com/2/7158677_c897a134...,3495,405 East 56th Street,medium
3,10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,medium
4,15,1.0,0,bfb9405149bfff42a92980b594c28234,2016-06-28 03:50:23,Over-sized Studio w abundant closets. Availabl...,East 34th Street,"[Doorman, Elevator, Fitness Center, Laundry in...",40.7439,7225292,-73.9743,2c3b41f588fbb5234d8a1e885a436cfa,[https://photos.renthop.com/2/7225292_901f1984...,2795,340 East 34th Street,low


Let's clean up the categories and put them into a sensible vector. Unfortunately, the categories are a bit of a mess: since users can specify whatever categories they want, there isn't much consistency between them.

Some of the patterns that we frequently see in the categories are:
 - Separating category names with "**"
 - Mix of caps/nocaps
 - Some common themes, such as:
   - "pets"
   - "office"
   - "living room"
   - "garden"
   - "common area"
   - "storage"
   - "no pets"
   - "parking"
   - "bicycle"
   - "doorman"
   - etc

To deal with this, let's pull out all of the categories and normalize them
by removing excess punctuation, normalizing whitespace, lowercasing, and counting certain n-grams.

In [5]:
# Collect the category strings from the 'features' column of the training
# frame and pass them through the project's normalizer.
normalized_categories = normalize_categories(
    categories_from_column(TRAIN_DATAFRAME, 'features')
)
normalized_categories

['30',
 'private entrances',
 'small dogs ok',
 'highrise',
 'washer in unit',
 'private balcony',
 'private terrace',
 'the most sought after location',
 'bluetooth',
 'valet',
 'separate entryway',
 'recessed lighting',
 'air condition',
 'cold storage in lobby',
 'renovated bathroom',
 'one month free rent',
 'newly renovated',
 'playroom',
 'two closets',
 'heart of the village massive 3br super share mr clean approved ornate prewar details gourmet kitchen tons of sunlight',
 'market on site',
 'holy deal batman oversized 3br mansion 2 full baths huge scenic terrace doorman elev/lndry bldg roof deck real pix',
 'fios high speed internet',
 'on site laundry',
 'oak floors',
 'sprawling 2br super share gut renovated clean brite 1/2 blk to bedford l',
 'terrace',
 'laundry in unit',
 'wood burning fireplace',
 'roof/sundeck',
 'huge living room',
 'near all trains',
 'flex 3',
 'package room',
 'ss appliances',
 'new stainless appliances',
 '$600 gift card for move ins prior to june 1

Now that we have our slightly tidied-up categories, we can create some n-grams and count their frequency.

In [6]:
# Join every normalized category into one space-separated string, count its
# n-grams (second argument 3 — presumably the maximum n; confirm against the
# helper), and rank them with most_common().
# NOTE(review): the ranking is topped by the empty token '' — it is filtered
# out in the next step, but it suggests the text is split on single spaces.
most_common_ngrams = count_ngrams_up_to_n(
    " ".join(normalized_categories),
    3,
).most_common()
most_common_ngrams

[('', 6421),
 ('in', 122),
 ('kitchen', 121),
 ('to', 90),
 ('room', 85),
 ('home', 65),
 ('of', 63),
 ('fee', 62),
 ('and', 59),
 ('the', 57),
 ('no', 57),
 ('private', 56),
 ('super', 56),
 ('2br', 54),
 ('new', 50),
 ('on', 49),
 ('massive', 48),
 ('2', 48),
 ('windows', 45),
 ('share', 44),
 ('sprawling', 44),
 ('storage', 42),
 ('park', 42),
 ('no fee', 42),
 ('pets', 41),
 ('ok', 40),
 ('bldg', 39),
 ('deck', 38),
 ('laundry', 37),
 ('l', 37),
 ('all', 35),
 ('huge', 33),
 ('renovated', 32),
 ('oversized', 32),
 ('roof', 32),
 ('parking', 32),
 ('eat', 32),
 ('eat in', 32),
 ('in kitchen', 32),
 ('free', 31),
 ('3br', 31),
 ('blks', 31),
 ('blks to', 31),
 ('site', 30),
 ('on site', 30),
 ('doorman', 29),
 ('with', 29),
 ('super share', 29),
 ('closets', 28),
 ('1br', 28),
 ('full', 27),
 ('included', 27),
 ('large', 27),
 ('studio', 27),
 ('steps', 27),
 ('bedford', 26),
 ('lounge', 26),
 ('floor', 26),
 ('bedford l', 26),
 ('pets ok', 26),
 ('steps to', 26),
 ('terrace', 25),
 

There are quite a few words here that don't add much value. We can remove them by consulting a list of stopwords.

In [7]:
# Filter the ranking through the project helper, using the spaCy pipeline for
# stopwords and 3 as the size threshold (presumably a minimum length — confirm
# against the helper), then materialize the result as a list.
# Note: this rebinds most_common_ngrams to the filtered ranking.
most_common_ngrams = list(
    remove_small_or_stopwords_from_ranking(most_common_ngrams, nlp, 3)
)
most_common_ngrams

[('kitchen', 121),
 ('room', 85),
 ('home', 65),
 ('fee', 62),
 ('private', 56),
 ('super', 56),
 ('2br', 54),
 ('new', 50),
 ('massive', 48),
 ('windows', 45),
 ('share', 44),
 ('sprawling', 44),
 ('storage', 42),
 ('park', 42),
 ('no fee', 42),
 ('pets', 41),
 ('bldg', 39),
 ('deck', 38),
 ('laundry', 37),
 ('huge', 33),
 ('renovated', 32),
 ('oversized', 32),
 ('roof', 32),
 ('parking', 32),
 ('eat', 32),
 ('eat in', 32),
 ('in kitchen', 32),
 ('free', 31),
 ('3br', 31),
 ('blks', 31),
 ('blks to', 31),
 ('site', 30),
 ('on site', 30),
 ('doorman', 29),
 ('super share', 29),
 ('closets', 28),
 ('1br', 28),
 ('included', 27),
 ('large', 27),
 ('studio', 27),
 ('steps', 27),
 ('bedford', 26),
 ('lounge', 26),
 ('floor', 26),
 ('bedford l', 26),
 ('pets ok', 26),
 ('steps to', 26),
 ('terrace', 25),
 ('clean', 25),
 ('train', 25),
 ('unit', 24),
 ('appliances', 24),
 ('space', 24),
 ('roof deck', 24),
 ('2 blks', 24),
 ('month', 23),
 ('brick', 23),
 ('outdoor', 23),
 ('service', 23),


Now that we have these, we can take the 100 most common n-grams and turn
them into category flags for our table.

In [8]:
# How many of the top-ranked n-grams become feature flag columns.
TOP_NGRAM_FLAG_COUNT = 100

# Build the flag names once (first element of each (ngram, count) entry) so
# the training and test frames are given exactly the same list of flags.
top_ngram_flags = [entry[0] for entry in most_common_ngrams[:TOP_NGRAM_FLAG_COUNT]]

# Both frames are rebound to the helper's result, as before.
# NOTE(review): the displayed result no longer shows a 'features' column, so
# re-running this cell without reloading the data may fail — confirm.
TRAIN_DATAFRAME = column_list_to_category_flags(TRAIN_DATAFRAME, 'features', top_ngram_flags)
TEST_DATAFRAME = column_list_to_category_flags(TEST_DATAFRAME, 'features', top_ngram_flags)

In [9]:
# Preview the training frame now that the features_* flag columns are present.
# NOTE(review): 200 rows is a large preview to keep in a saved notebook;
# consider a smaller head() once the flags have been checked.
TRAIN_DATAFRAME.head(200)

Unnamed: 0,id,bathrooms,bedrooms,building_id,created,description,display_address,latitude,listing_id,longitude,...,features_stop,features_brand,features_concierge,features_in_unit,features_l_stop,features_washer,features_heart,features_village,features_details,features_elev/lndry
0,4,1.0,1,8579a0b0d54db803821a35a4a615e97a,2016-06-16 05:55:27,Spacious 1 Bedroom 1 Bathroom in Williamsburg!...,145 Borinquen Place,40.7108,7170325,-73.9539,...,0,0,0,0,0,1,0,0,0,0
1,6,1.0,2,b8e75fc949a6cd8225b455648a951712,2016-06-01 05:44:33,BRAND NEW GUT RENOVATED TRUE 2 BEDROOMFind you...,East 44th,40.7513,7092344,-73.9722,...,0,0,0,0,0,1,0,0,0,0
2,9,1.0,2,cd759a988b8f23924b5a2058d5ab2b49,2016-06-14 15:19:59,**FLEX 2 BEDROOM WITH FULL PRESSURIZED WALL**L...,East 56th Street,40.7575,7158677,-73.9625,...,0,0,0,1,0,1,0,0,0,0
3,10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,40.7145,7211212,-73.9425,...,0,0,0,0,0,0,0,0,0,0
4,15,1.0,0,bfb9405149bfff42a92980b594c28234,2016-06-28 03:50:23,Over-sized Studio w abundant closets. Availabl...,East 34th Street,40.7439,7225292,-73.9743,...,0,0,0,0,0,0,0,0,0,0
5,16,1.0,3,300d27d8ba2adbcbc8c6f2bcbc1c6f9d,2016-06-28 05:59:06,This spectacular converted 3 bed apartment all...,East 16th Street,40.7348,7226687,-73.9865,...,0,0,0,0,0,1,0,0,0,0
6,18,2.0,3,0d01cabe55fa5192cdbcabd5c585c1ea,2016-06-08 06:21:36,AMAZING DEAL!! BRAND NEW RENOVATIONS IN THIS H...,East 13th Street,40.7302,7126989,-73.9826,...,0,0,0,1,0,1,0,0,0,0
7,19,1.0,0,d48767c37a934daaf0bbb0e58c755d0c,2016-06-05 05:28:22,No Fee Large Renovated Sun Splashed Studio. Wa...,York Avenue,40.7769,7114138,-73.9467,...,0,0,0,0,0,1,0,0,0,0
8,23,0.0,1,d1ca33a2853e64fad6e4009d5d5d168f,2016-06-09 04:42:03,Extra large one bedroom apartment located in P...,E 19 Street,40.7346,7131094,-73.9811,...,0,0,0,0,0,0,0,0,0,0
9,32,3.0,3,5f35dc2f0191baf109221752e6ee0c48,2016-06-28 03:26:18,Listed: 06/26/16<br /><br />Available:...,Hicks Street,40.6990,7224815,-73.9943,...,0,0,0,0,0,0,0,0,0,0
