In [35]:
import pandas as pd
import numpy as np
import nltk
import re

%matplotlib inline

In [3]:
reviews = pd.read_csv('data/reviews.csv')

In [3]:
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2818,1191,2009-03-30,10952,Lam,Daniel is really cool. The place was nice and ...
1,2818,1771,2009-04-24,12798,Alice,Daniel is the most amazing host! His place is ...
2,2818,1989,2009-05-03,11869,Natalja,We had such a great time in Amsterdam. Daniel ...
3,2818,2797,2009-05-18,14064,Enrique,Very professional operation. Room is very clea...
4,2818,3151,2009-05-25,17977,Sherwin,Daniel is highly recommended. He provided all...


In [4]:
stemmer = nltk.PorterStemmer()

In [6]:
text = reviews.iloc[0,5]

In [1]:
def Tokenizer(str_input):
    
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    
    porter_stemmer=nltk.PorterStemmer()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [5]:
reviews.dtypes

listing_id        int64
id                int64
date             object
reviewer_id       int64
reviewer_name    object
comments         object
dtype: object

In [6]:
reviews.isnull().sum()

listing_id         0
id                 0
date               0
reviewer_id        0
reviewer_name      0
comments         530
dtype: int64

In [4]:
# Remove NAs

reviews.dropna(inplace=True)

In [8]:
reviews.isnull().sum()

listing_id       0
id               0
date             0
reviewer_id      0
reviewer_name    0
comments         0
dtype: int64

In [5]:
reviews.set_index('id', inplace=True)

### Load y

In [12]:
prices = pd.read_csv('data/price_accom.csv')

In [15]:
prices.columns = ['listing_id', 'price_accom']

In [16]:
prices.dtypes

listing_id       int64
price_accom    float64
dtype: object

In [17]:
reviews_price = pd.merge(reviews, prices, how='left', on='listing_id').dropna()

In [37]:
reviews_price.price_accom.mean()

51.06982726640342

### Vectorizing text

Need to turn each review into an array with equal length.

First try vectorizing by word countTFIDF with skip-gram

In [19]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(reviews_price.comments, reviews_price.price_accom, test_size = 0.3, \
                                                    random_state = 123)

In [21]:
count = CountVectorizer(analyzer=Tokenizer, stop_words='english')
svd = TruncatedSVD(n_components=300)

count_pipeline = make_pipeline(count, svd)

count_train = count_pipeline.fit_transform(X_train)
count_test = count_pipeline.transform(X_test)

Slightly more sophisticated vectorization: TFIDF with skip-gram

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(analyzer = Tokenizer, stop_words='english', max_df=0.7, ngram_range=(1,2))
svd2 = TruncatedSVD(n_components=300)

tfidf_pipeline = make_pipeline(tfidf, svd2)

tfidf_train = tfidf_pipeline.fit_transform(X_train)
tfidf_test = tfidf_pipeline.transform(X_test)

Even more sophistication: word2vec

idea here is to find a vector for each word, then find the text meaning centroid by aggregating the vectors

### Model Building

Count data

In [32]:
# Regularized regression

from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn import metrics

count_lasso = Lasso(alpha=1.0)
# lasso.fit(count_train)
count_lasso_cv = cross_val_score(count_lasso, count_train, y_train, cv = 10, scoring='neg_mean_absolute_error')


In [33]:
count_lasso_cv

array([-16.57308616, -16.65893494, -16.6097964 , -16.72261957,
       -16.69301104, -16.79669583, -16.4768513 , -16.4831884 ,
       -16.47764606, -16.47291916])

In [38]:
# Random Forest

from sklearn.ensemble import RandomForestRegressor


count_rf = RandomForestRegressor(n_estimators=100, max_depth = 5, n_jobs=8, random_state=123)
count_rf_cv = cross_val_score(count_rf, count_train, y_train, cv = 10, scoring='neg_mean_absolute_error')

In [42]:
count_rf_cv

array([-16.52453697, -16.56603924, -16.56286112, -16.69661095,
       -16.62459551, -16.73611787, -16.39975151, -16.41910766,
       -16.41406546, -16.43378926])

In [40]:
# XGBoost
%time
import xgboost as xgb

count_dmatrix = xgb.DMatrix(data=count_train,label=y_train)
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1, 'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=count_dmatrix, params=params, nfold=3, \
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)


CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs
[12:12:45] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[12:12:46] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[12:12:47] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[12:12:48] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=5
[12:12:48] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[12:12:48] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[12:12:48] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[12:12:48] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roo

[12:13:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[12:13:02] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=5
[12:13:02] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=5
[12:13:02] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[12:13:02] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[12:13:03] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[12:13:03] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=5
[12:13:03] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[12:13:03] /work

[12:13:16] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[12:13:16] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=5
[12:13:16] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[12:13:16] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[12:13:17] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[12:13:17] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=5
[12:13:17] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[12:13:17] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[12:13:18] /work

In [41]:
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,61.553641,0.566753,61.659506,1.20253
1,58.164758,0.587971,58.374166,1.271622
2,55.23429,0.561614,55.570014,1.337068
3,52.72293,0.569157,53.18959,1.399104
4,50.546003,0.614441,51.179117,1.457514
5,48.766403,0.616061,49.488884,1.50849
6,47.192064,0.675649,48.076476,1.557812
7,45.898079,0.650184,46.899293,1.599825
8,44.851028,0.65115,45.923005,1.635568
9,43.860914,0.679891,45.121362,1.667161


TFIDF Data

In [43]:
tfidf_lasso = Lasso(alpha=1.0)
# lasso.fit(count_train)
tfidf_lasso_cv = cross_val_score(tfidf_lasso, tfidf_train, y_train, cv = 5, scoring='neg_mean_absolute_error')

In [45]:
tfidf_lasso_cv

array([-16.70012074, -16.75253238, -16.81634537, -16.57151777,
       -16.56446131])

In [46]:
%time
tfidf_rf = RandomForestRegressor(n_estimators=100, max_depth = 5, n_jobs=8, random_state=123)
tfidf_rf_cv = cross_val_score(tfidf_rf, tfidf_train, y_train, cv = 5, scoring='neg_mean_absolute_error')

In [47]:
# XGBoost
%time
import xgboost as xgb

tfidf_dmatrix = xgb.DMatrix(data=tfidf_train,label=y_train)
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1, 'max_depth': 5, 'alpha': 10}

tfidf_cv_results = xgb.cv(dtrain=tfidf_dmatrix, params=params, nfold=3, \
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.29 µs
[13:22:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[13:22:28] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=5
[13:22:29] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=5
[13:22:29] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[13:22:29] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 0 pruned nodes, max_depth=5
[13:22:30] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 0 pruned nodes, max_depth=5
[13:22:30] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[13:22:30] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roo

[13:22:43] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[13:22:43] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[13:22:43] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[13:22:43] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=5
[13:22:43] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[13:22:44] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=5
[13:22:44] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[13:22:44] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=5
[13:22:44] /work

[13:22:57] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[13:22:57] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[13:22:57] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=5
[13:22:57] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 0 pruned nodes, max_depth=5
[13:22:57] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=5
[13:22:58] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[13:22:58] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[13:22:58] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[13:22:58] /work