# Price prediction using random forest

The following steps attempt to predict a listing's price category using a random forest classifier.

## Loading data
Loading data from CSV file "listings"

In [None]:
# Rendering matplotlib figures inline in the notebook output
%matplotlib inline

import pandas as pd

# Loading CSV file
# (the path is relative to the notebook's working directory)
listings = pd.read_csv("../input/listings.csv")

# Showing the first rows as the cell output
listings.head()

## Preprocessing data
Preprocessing data by:

- fixing formats
- filling N/A values
- creating binary features to represent categorical ones

### Fixing price format
Price is given as text with a currency symbol and thousands separators (commas), so it must be converted to a number before it can be used.

In [None]:
import numpy as np

print(listings.price.head())

# Fixing price column (removing unit, removing commas, converting to float).
# - `astype(str)` first keeps the cell re-runnable once the column is numeric.
# - `.str.replace(..., regex=False)` removes the characters literally ('$' is
#   a regex anchor otherwise) without a per-row Python lambda.
# - The builtin `float` is used because the `np.float` alias was removed from
#   NumPy.
# - Plain column assignment replaces the column, so it gets a float dtype
#   rather than being written into the existing text column.
listings['price'] = (
    listings['price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

print(listings.price.head())

### Creating categorical price feature
A categorical target is required to use a random forest classifier, so the continuous price is binned into categories.

In [None]:
# Getting price percentiles
# (the summary statistics are displayed as the cell output)
listings['price'].describe()

In [None]:
# Adding a categorical feature representing price using percentiles.
# The inner edges (85, 150, 220) are the hard-coded cut points (presumably the
# quartiles shown by describe() above -- confirm if the data changes).
# The outer edges are open-ended so that every non-missing price falls into a
# category; with fixed outer edges, prices at or below the lowest edge or
# above the highest edge would be left as NaN.
PRICE_CATEGORY_EDGES = [float('-inf'), 85, 150, 220, float('inf')]
PRICE_CATEGORY_LABELS = [0, 1, 2, 3]

listings['price_category'] = pd.cut(listings['price'], bins=PRICE_CATEGORY_EDGES, labels=PRICE_CATEGORY_LABELS)

print(listings.loc[1:10, 'price_category'])

### Filling N/A values
Looking for columns in which at least 90% of the values (but not all of them) are filled in, then filling their N/A values.

In [None]:
FILL_IN_THRESHOLD = 0.9

# Looking for features with enough filled in values: more than the threshold
# share of rows, but not every row (fully filled columns need no fixing).
# describe() is evaluated once and its 'count' row reused, instead of being
# recomputed for every comparison.
nonNullCounts = listings.describe().loc['count']
totalRows = listings.shape[0]

nonNullCounts[(nonNullCounts > totalRows * FILL_IN_THRESHOLD) & (nonNullCounts < totalRows)]

In [None]:
# Filling these features N/A values: each column's missing entries are
# replaced by that column's own median
for sparseColumn in ("bathrooms", "bedrooms", "beds"):
    columnMedian = listings[sparseColumn].median()
    listings[sparseColumn] = listings[sparseColumn].fillna(columnMedian)

# Printing the non-null counts of the columns above the fill-in threshold
countRow = listings.describe().loc['count']
print(countRow[countRow > listings.shape[0] * FILL_IN_THRESHOLD])

### Encoding neighbourhood as a numeric feature
Mapping each neighbourhood name to an integer code, stored in a single `neighbourhood_mapping` column (missing or unknown neighbourhoods get code 0)

In [None]:
# Getting all existing neighbourhoods
# (each distinct value of the column is displayed once as the cell output)
listings.neighbourhood.unique()

In [None]:
# Defining mapping for all existing neighbourhoods
neighourhoodMapping = {"Undefined": 0, "Roslindale": 1, "Jamaica Plain" : 2, "Mission Hill": 3, "Fenway/Kenmore": 4,
                        "Back Bay": 5, "Leather District": 6, "Chinatown": 7, "Hyde Park": 8, "North End": 9,
                        "Roxbury": 10, "South End": 11, "Mattapan": 12, "East Boston": 13, "South Boston": 14,
                        "Charlestown": 15, "West End": 16, "Beacon Hill": 17, "Theater District": 18,
                        "Downtown Crossing": 19, "Downtown": 20, "Financial District": 21, "Government Center": 22,
                        "Allston-Brighton": 23, "West Roxbury": 24, "Chestnut Hill": 25, "Dorchester": 26,
                        "Brookline": 27, "Cambridge": 28, "Somerville": 29, "Harvard Square": 30}

# Labelling missing neighbourhoods as "Undefined" so that they hit the
# dedicated mapping entry and the column holds text only (filling with the
# integer 0 mixed types and never matched the "Undefined" key).
listings['neighbourhood'] = listings['neighbourhood'].fillna("Undefined")

# Looking every neighbourhood up in the mapping in one vectorized pass.
# Names absent from the mapping come back as NaN and fall back to code 0.0;
# the column is kept as float.
listings['neighbourhood_mapping'] = (
    listings['neighbourhood']
    .map(neighourhoodMapping)
    .fillna(0.0)
    .astype(float)
)

print(listings.neighbourhood.head())
print(listings.neighbourhood_mapping.head())

### Extracting amenities binary features
Creating binary features (one per amenity) from the `amenities` column

In [None]:
import re

# Getting all different amenities from dataframe: the first and last
# characters (the braces) are dropped from every row, the rows are glued
# together with ',' and the result is split again on ',' before deduplicating
bracelessAmenities = listings['amenities'].str[1:-1]
flatAmenityNames = ','.join(bracelessAmenities).split(',')
amenities = pd.Series(flatAmenityNames).unique()

print(amenities)

In [None]:
# Parsing each listing's 'amenities' text once into the set of its amenity
# names (braces dropped, split on ',', surrounding double quotes removed).
# Rows that are not text (e.g. missing values) get an empty set.
listingAmenitySets = listings['amenities'].apply(
    lambda raw: {re.sub(r'^"|"$', '', name) for name in raw[1:-1].split(',')}
    if isinstance(raw, str) else set()
)

# Looping on all existing amenities
for v in amenities:

    # Removing the double quotes that may surround the current value
    amenity = re.sub(r'^"|"$', '', v)

    if amenity:

        # Setting amenity value for all dataframe samples using exact
        # membership. A pattern search on the raw text would read names such
        # as "Dog(s)" as regular expressions (so they never match themselves)
        # and would flag "TV" for a listing that only offers "Cable TV".
        listings[amenity] = listingAmenitySets.apply(lambda names, wanted=amenity: wanted in names)

listings.TV.head()

### Removing potential outliers
Identifying potential outliers by taking only 'accommodates' and 'price' into account; these potential outliers will be ignored later.

In [None]:
from sklearn import svm

# Keeping only the two columns used for outlier detection
outlierListings = listings[['accommodates', 'price']]

# Normalizing features: centring on the mean and scaling by the value range
featureMeans = outlierListings.mean()
featureSpans = outlierListings.max() - outlierListings.min()
outlierListings = (outlierListings - featureMeans) / featureSpans

# Initializing and fitting classifier (fit hands back the estimator itself)
outlierClassifier = svm.OneClassSVM(nu=0.003, gamma=2.0).fit(outlierListings)

# Classifying inliers/outliers: a non-negative decision value is an inlier,
# a negative one an outlier
decisions = outlierClassifier.decision_function(outlierListings)
inlierMask = decisions >= 0
outlierMask = decisions < 0
inliers = outlierListings[inlierMask]
outliers = outlierListings[outlierMask]

In [None]:
import matplotlib.pyplot as plt

# Working on an explicit figure/axes pair
fig, ax = plt.subplots()

# Defining grid (same 100 sample points along both directions)
gridAxis = np.linspace(-0.5, 1.5, 100)
gridX, gridY = np.meshgrid(gridAxis, gridAxis)

# Computing decision for each point of the grid, then folding the flat result
# back into the grid's shape
gridPoints = np.column_stack([gridX.ravel(), gridY.ravel()])
gridDecisions = outlierClassifier.decision_function(gridPoints).reshape(gridX.shape)

# Plotting decision boundary (each point of the grid whose decision value is 0)
plotBoundary = ax.contour(gridX, gridY, gridDecisions, levels=[0], linewidths=2, colors='blue')
ax.clabel(plotBoundary, inline=1, fontsize=12)

# Plotting inliers and outliers
ax.scatter(inliers['accommodates'], inliers['price'], label="Inliers", color='green', alpha=0.2)
ax.scatter(outliers['accommodates'], outliers['price'], label="Outliers", color='red', alpha=1.0)

ax.set_xlabel("Accomodates (normalized)")
ax.set_ylabel("Price (normalized)")
ax.set_title("Detecting potential outliers using one class SVM")
ax.legend()

In [None]:
# Ignoring potential outliers: keeping only the rows whose decision value is
# non-negative. `.copy()` makes the result an independent dataframe rather
# than a filtered selection tied to `listings`.
noOutlierListings = listings[decisions >= 0].copy()

print("Initial number of samples: " + str(len(listings.index)))
print("Number of samples without outliers: " + str(len(noOutlierListings.index)))

### Selecting meaningful features

In [None]:
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression

# Numeric columns evaluated as candidate predictors of price
features = [
    "id", "scrape_id", "host_id", "host_listings_count", "host_total_listings_count",
    "latitude", "longitude", "accommodates", "bathrooms", "bedrooms", "beds",
    "guests_included", "minimum_nights", "maximum_nights", "availability_30",
    "availability_60", "availability_90", "availability_365", "number_of_reviews",
    "calculated_host_listings_count",
]

# Selecting features (univariate regression test against price, keeping 5)
candidateValues = noOutlierListings[features]
priceValues = noOutlierListings["price"]
featureSelector = SelectKBest(f_regression, k=5).fit(candidateValues, priceValues)

# Getting raw p-values for each feature and transforming into scores
# (the smaller the p-value, the taller the bar)
featureScores = -np.log10(featureSelector.pvalues_)

# Plotting scores, one bar per candidate feature
barPositions = range(len(features))
plt.bar(barPositions, featureScores)
plt.xticks(barPositions, features, rotation='vertical')
plt.show()

### Normalizing features

In [None]:
# Keeping only meaningful features and neighbourhood
normalizedFeatures = [
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "guests_included",
    "neighbourhood_mapping",
]

# Selecting features
normalizedListings = noOutlierListings[normalizedFeatures]

# Plotting distribution of the raw (not yet normalized) values
rawValues = normalizedListings.values
plt.hist(rawValues)
plt.show()

In [None]:
# Normalizing features: each column is centred on its mean and divided by its
# value range (max - min)
columnMeans = normalizedListings.mean()
columnSpans = normalizedListings.max() - normalizedListings.min()
normalizedListings = normalizedListings.sub(columnMeans).div(columnSpans)

# Plotting distribution of the normalized values
scaledValues = normalizedListings.values
plt.hist(scaledValues)
plt.show()

In [None]:
# Keeping only meaningful features, neighbourhood and amenities
otherFeatures = [
    "TV", "Wireless Internet", "Kitchen", "Free Parking on Premises",
    "Pets live on this property", "Dog(s)", "Heating", "Family/Kid Friendly",
    "Washer", "Dryer", "Smoke Detector", "Fire Extinguisher", "Essentials",
    "Shampoo", "Laptop Friendly Workspace", "Internet", "Air Conditioning",
    "Pets Allowed", "Carbon Monoxide Detector", "Lock on Bedroom Door",
    "Hangers", "Hair Dryer", "Iron", "Cable TV", "First Aid Kit", "Safety Card",
    "Gym", "Breakfast", "Indoor Fireplace", "Cat(s)", "24-Hour Check-in",
    "Hot Tub", "Buzzer/Wireless Intercom", "Other pet(s)", "Washer / Dryer",
    "Smoking Allowed", "Suitable for Events", "Wheelchair Accessible",
    "Elevator in Building", "Pool", "Doorman", "Paid Parking Off Premises",
    "Free Parking on Street",
]

# Selecting features: the normalized numeric columns are placed side by side
# with the amenity columns taken from the outlier-free listings
amenityListings = noOutlierListings[otherFeatures]
finalListings = pd.concat([normalizedListings, amenityListings], axis=1)

# Full list of model input columns, numeric ones first
features = [*normalizedFeatures, *otherFeatures]

### Preparing sets
Randomly splitting the data into training and test sets using a defined ratio

In [None]:
import random

TEST_SAMPLE_RATIO = 0.2

# Fixed seed so that the same rows land in the test set on every run
SPLIT_RANDOM_SEED = 0

# Defining number of train and test samples
numberOfSamples = len(finalListings.index)
numberOfSamplesTest = int(numberOfSamples * TEST_SAMPLE_RATIO)
numberOfSamplesTrain = numberOfSamples - numberOfSamplesTest

# Defining test (and thus train) samples randomly, as index labels.
# A dedicated generator is used so the global `random` state is left untouched.
samplesTest = random.Random(SPLIT_RANDOM_SEED).sample(list(finalListings.index), numberOfSamplesTest)

# Splitting data into train and test sets (label-based selection with `.loc`)
listingsXTrain = finalListings.drop(samplesTest)
listingsXTest = finalListings.loc[samplesTest]

print("Number of train set elements: " + str(listingsXTrain.shape[0]))
print("Number of test set elements: " + str(listingsXTest.shape[0]))

# Splitting targets into training/testing sets using the SAME index labels as
# the features. Slicing the targets by position (first rows / last rows) would
# pair each feature row with the target of a different listing.
listingsY = noOutlierListings.loc[:, 'price_category']
listingsYTrain = listingsY.drop(samplesTest)
listingsYTest = listingsY.loc[samplesTest]

# Guarding the row-by-row correspondence between features and targets
assert listingsXTrain.index.equals(listingsYTrain.index), "train features/targets are misaligned"
assert listingsXTest.index.equals(listingsYTest.index), "test features/targets are misaligned"

print("Number of train target elements: " + str(listingsYTrain.shape[0]))
print("Number of test target elements: " + str(listingsYTest.shape[0]))

## Predicting

### Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Creating model
# (random_state is fixed so that the fitted tree is the same on every run)
model = DecisionTreeClassifier(random_state=0)

# Training model using the training sets
# Missing training targets are replaced by category 2 before fitting.
model.fit(listingsXTrain, listingsYTrain.fillna(2))

In [None]:
# Scoring the fitted model on the held-out test set, printed with two decimals
print('Score (best possible score is 1.0): %.2f' % model.score(listingsXTest, listingsYTest))

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Creating model.
# random_state is fixed (as for the decision tree) so that the forest, and
# therefore its score, is the same on every run.
model = RandomForestClassifier(n_estimators=10, random_state=0)

# Training model using the training sets
# Missing training targets are replaced by category 2 before fitting.
model.fit(listingsXTrain, listingsYTrain.fillna(2))

In [None]:
# Scoring the fitted model on the held-out test set, printed with two decimals
print('Score (best possible score is 1.0): %.2f' % model.score(listingsXTest, listingsYTest))