In [34]:
import numpy
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import pandas as pd

In [35]:
# Read the training listings from a JSON file into a DataFrame.
df = pd.read_json('data-large/train.json')


In [36]:
# (number of rows, number of columns) of the loaded frame
df.shape


(49352, 15)

In [37]:
# Column labels present in the loaded frame
df.columns

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'interest_level', 'latitude',
       'listing_id', 'longitude', 'manager_id', 'photos', 'price',
       'street_address'],
      dtype='object')

In [38]:
# Pull out the 'interest_level' column on its own
# (presumably the label to predict -- TODO confirm).
Y = df['interest_level']


In [39]:
# Peek at the first ten interest-level values
Y.head(10)


10        medium
10000        low
100004      high
100007       low
100013       low
100014    medium
100016       low
100020       low
100026    medium
100027       low
Name: interest_level, dtype: object

In [40]:
# Preview the first rows of the raw listings
df.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [41]:
# Identifier columns to leave out of the column listing below.
exclude_columns = ['building_id', 'manager_id', 'listing_id']
# Show the column labels that remain once the identifiers are removed.
# The result is only displayed, not assigned, so df itself is unchanged.
df.columns.difference(exclude_columns)

Index(['bathrooms', 'bedrooms', 'created', 'description', 'display_address',
       'features', 'interest_level', 'latitude', 'longitude', 'photos',
       'price', 'street_address'],
      dtype='object')

In [42]:
# Feature ideas to explore:
# - Is the description empty?
# - Does it contain features, and how many features?
# - Sentiment analysis of the description: look only at description vs. interest_level.
# - Address combined with average household income?
# - Treat the features list the same way as the sentiment analysis.
# - Put the month into its own column.


In [43]:
# Summary statistics for the numeric columns.
# (The earlier `df[0:1]` line was dropped: as a non-final expression its value
# was never displayed or stored.)
df.describe()

Unnamed: 0,bathrooms,bedrooms,latitude,listing_id,longitude,price
count,49352.0,49352.0,49352.0,49352.0,49352.0,49352.0
mean,1.21218,1.54164,40.741545,7024055.0,-73.955716,3830.174
std,0.50142,1.115018,0.638535,126274.6,1.177912,22066.87
min,0.0,0.0,0.0,6811957.0,-118.271,43.0
25%,1.0,1.0,40.7283,6915888.0,-73.9917,2500.0
50%,1.0,1.0,40.7518,7021070.0,-73.9779,3150.0
75%,1.0,2.0,40.7743,7128733.0,-73.9548,4100.0
max,10.0,8.0,44.8835,7753784.0,0.0,4490000.0


In [44]:
# Storage dtype of each column
df.dtypes

bathrooms          float64
bedrooms             int64
building_id         object
created             object
description         object
display_address     object
features            object
interest_level      object
latitude           float64
listing_id           int64
longitude          float64
manager_id          object
photos              object
price                int64
street_address      object
dtype: object

In [45]:
# One-hot encode each listed categorical column and append the indicator
# columns (named "<column>_<value>") to df. The source column is kept.
dummy_fields = ['interest_level']
for each in dummy_fields:
    dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
    # Drop indicator columns left over from a previous run of this cell so that
    # re-running it does not append duplicate columns.
    df = pd.concat(
        [df.drop(dummies.columns, axis=1, errors='ignore'), dummies],
        axis=1,
    )

In [46]:
# Preview the frame again, now including the interest_level_* indicator columns
df.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level_high,interest_level_low,interest_level_medium
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,0,0,1
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,0,1,0
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,1,0,0
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,0,1,0
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,0,1,0


In [47]:
# NOTE(review): the original cell carried a bare "99964" note; meaning unclear.
# Shape of the subset of listings reporting fewer than one bathroom or fewer
# than one bedroom. The two conditions are combined element-wise with `|`;
# Python's `or` cannot combine two boolean Series.
df.loc[df['bathrooms'].lt(1) | df['bedrooms'].lt(1)].shape

(9631, 18)

In [48]:
# Fraction of listings with fewer than one bathroom or fewer than one bedroom
# (about 19.5% of the rows). Computed from df so the figure cannot go stale the
# way hand-copied counts do.
len(df[(df['bathrooms'] < 1) | (df['bedrooms'] < 1)]) / len(df)

0.19514913276057708

In [49]:
#df['bathrooms'].mean

In [50]:
# Toy frame for experiments: 10 rows, two columns of uniform random values
# scaled into [0, 10).
d = pd.DataFrame(
    10 * numpy.random.rand(10, 2),
    columns=['Price', 'Qty'],
)

In [51]:
import random  # needed by the later cell that calls random.choice

# Season names; an integer code stored in d['Season'] is an index into this list.
seasons = ['Spring', 'Summer', 'Fall', 'Winter']
# One random season code per row. A DataFrame column is one-dimensional, so the
# codes are drawn as a flat array of length len(d) rather than an (n, 1) matrix.
d['Season'] = numpy.random.randint(0, len(seasons), size=len(d))

In [52]:
# Abandoned attempt: `seasons` is a plain Python list, which cannot be indexed
# with a NumPy array, so the direct lookup below does not work.
#d['x'] = seasons[numpy.random.randint(0, len(seasons), size=(d.shape[0], 1)) ]

# Draw an (n_rows, 1) array of random season codes just to inspect its form.
numpy.random.randint(0, len(seasons), size=(d.shape[0], 1))

array([[2],
       [1],
       [1],
       [2],
       [2],
       [0],
       [0],
       [0],
       [0],
       [2]])

In [53]:
# Single-column frame 's' holding ten random integers in 0..4.
dx = pd.DataFrame(
    numpy.random.randint(0, 5, size=(10, 1)),
    columns=['s'],
)
dx

Unnamed: 0,s
0,2
1,4
2,2
3,1
4,2
5,1
6,2
7,3
8,0
9,3


In [54]:
# Same draw as before: a fresh (n_rows, 1) array of random season codes.
numpy.random.randint(0, len(seasons), size=(d.shape[0], 1)) 

array([[1],
       [0],
       [0],
       [2],
       [3],
       [1],
       [1],
       [3],
       [1],
       [2]])

In [55]:
#s = Series.

In [56]:
# Show the toy frame; 'Season' still holds integer codes here
d

Unnamed: 0,Price,Qty,Season
0,0.796313,9.543787,2
1,4.955938,2.495642,2
2,3.310384,7.252741,3
3,3.926515,0.85547,3
4,4.223971,5.031967,0
5,2.668208,8.105054,3
6,5.236772,2.411572,0
7,4.638191,1.772304,0
8,4.836395,9.453967,3
9,1.744532,0.400723,3


In [57]:
def f(x):
    """Return the season name stored at position ``x`` of ``seasons``."""
    return seasons[x]

# Replace the integer season codes with their names. The dtype guard keeps the
# cell re-runnable: once the column holds strings, indexing `seasons` with them
# would raise TypeError.
if pd.api.types.is_integer_dtype(d['Season']):
    d['Season'] = d['Season'].apply(f)

In [58]:
# Show the frame again; 'Season' now holds the season names
d

Unnamed: 0,Price,Qty,Season
0,0.796313,9.543787,Fall
1,4.955938,2.495642,Fall
2,3.310384,7.252741,Winter
3,3.926515,0.85547,Winter
4,4.223971,5.031967,Spring
5,2.668208,8.105054,Winter
6,5.236772,2.411572,Spring
7,4.638191,1.772304,Spring
8,4.836395,9.453967,Winter
9,1.744532,0.400723,Winter


In [59]:
# Add column 'xx': for every row, one city picked at random from three options.
d['xx'] = [
    random.choice(('Chicago', 'Boston', 'New York'))
    for _ in range(len(d))
]

In [60]:
# Show the frame with the new 'xx' city column
d

Unnamed: 0,Price,Qty,Season,xx
0,0.796313,9.543787,Fall,Chicago
1,4.955938,2.495642,Fall,Chicago
2,3.310384,7.252741,Winter,Chicago
3,3.926515,0.85547,Winter,Boston
4,4.223971,5.031967,Spring,New York
5,2.668208,8.105054,Winter,New York
6,5.236772,2.411572,Spring,New York
7,4.638191,1.772304,Spring,Chicago
8,4.836395,9.453967,Winter,Boston
9,1.744532,0.400723,Winter,New York


In [72]:
import numpy as np



In [70]:
# Weight matrix with 10 rows and 5 columns; every entry is an independent
# sample from the standard normal distribution.
weights_0_1 = np.random.standard_normal(size=(10, 5))

In [71]:
# Inspect the sampled (10, 5) weight matrix
weights_0_1

array([[-2.77462513,  1.02034699,  0.56669858, -0.26889056, -1.47959984],
       [-0.41158734, -1.18840828,  0.18876167, -0.57339075, -1.7276252 ],
       [-0.92385138,  0.50297791,  0.15459449, -2.26638278,  0.32955635],
       [-0.30578929, -0.20204188,  1.93315151,  0.90591694, -0.66658223],
       [-0.2356135 ,  1.13720593,  1.9072593 ,  0.94571867, -1.13701912],
       [ 0.78794949, -0.45007358,  1.66842056, -1.51341868,  0.00966682],
       [-0.7138877 , -0.66253821, -1.3132165 ,  0.89336464,  0.21249147],
       [-0.31251722, -0.46058409, -1.4457165 ,  1.21942338, -1.68845252],
       [ 1.59071348, -1.97946488,  0.32429305,  0.89583226,  0.61649488],
       [-0.0131918 , -0.36063389,  1.3933109 ,  0.46950366, -0.54590025]])

In [68]:
# Input layer: a flat vector of ten zeros.
layer_0 = np.zeros(shape=(10,))

In [69]:
# Dot product of the length-10 zero vector with the (10, 5) weight matrix:
# one value per weight column, and all five are zero because the input is zero.
layer_0.dot(weights_0_1)

array([ 0.,  0.,  0.,  0.,  0.])

In [73]:
# Integer vector of ten ones.
ones = np.ones(10, dtype=int)

In [74]:
# Dot product of a vector of ones with the weight matrix: each of the five
# results is the sum of the corresponding column of weights_0_1.
ones.dot(weights_0_1)

array([-3.3124004 , -2.64321397,  5.37755706,  0.70767678, -6.07696964])

In [79]:
# 2x2 integer matrix, written one row per line.
t1 = np.array([
    [1, 2],
    [3, 4],
])

In [80]:
# 2x3 integer matrix, written one row per line.
t2 = np.array([
    [-1, -2, -5],
    [-3, -4, -6],
])

In [81]:
# Matrix product of the (2, 2) matrix t1 with the (2, 3) matrix t2 -> (2, 3) result
t1.dot(t2)

array([[ -7, -10, -17],
       [-15, -22, -39]])