In [3]:
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
# Load the raw reviews table; the path is relative to the notebook's working directory.
reviews = pd.read_csv('data/reviews.csv')

In [5]:
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2818,1191,2009-03-30,10952,Lam,Daniel is really cool. The place was nice and ...
1,2818,1771,2009-04-24,12798,Alice,Daniel is the most amazing host! His place is ...
2,2818,1989,2009-05-03,11869,Natalja,We had such a great time in Amsterdam. Daniel ...
3,2818,2797,2009-05-18,14064,Enrique,Very professional operation. Room is very clea...
4,2818,3151,2009-05-25,17977,Sherwin,Daniel is highly recommended. He provided all...


In [4]:
# Module-level Porter stemmer instance.
# NOTE(review): Tokenizer constructs its own PorterStemmer, so this object is not
# referenced by any cell visible here — confirm whether it is still needed.
stemmer = nltk.PorterStemmer()

In [6]:
# Sample review used to try out the tokenizer. Select the column by name rather than
# by position so the lookup keeps working if columns are reordered or 'id' is later
# moved into the index (which would make position 5 out of range).
text = reviews['comments'].iloc[0]

In [30]:
def Tokenizer(str_input, rm_stopwords = True):
    """Split a piece of text into lower-cased, Porter-stemmed word tokens.

    Parameters
    ----------
    str_input : str
        Raw text to tokenize.
    rm_stopwords : bool, default True
        When true, tokens found in NLTK's English stop-word list are dropped
        before stemming.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty if the text contains
        no letters, digits or hyphens.
    """
    # Every character other than a letter, digit or hyphen becomes a space, so
    # punctuation acts as a token separator; then lower-case and split on whitespace.
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()

    if rm_stopwords:
        # Build the stop-word set once per call; looking the list up inside the
        # comprehension would reload it and scan it linearly for every token.
        stop_words = set(nltk.corpus.stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

    porter_stemmer = nltk.PorterStemmer()
    return [porter_stemmer.stem(word) for word in words]

In [24]:
reviews.dtypes

listing_id        int64
id                int64
date             object
reviewer_id       int64
reviewer_name    object
comments         object
dtype: object

In [25]:
reviews.isnull().sum()

listing_id         0
id                 0
date               0
reviewer_id        0
reviewer_name      0
comments         530
dtype: int64

In [26]:
# Discard every row that has a missing value in any column.

reviews = reviews.dropna()

In [27]:
# Re-count missing values per column to verify that none remain after dropping NA rows.
reviews.isnull().sum()

listing_id       0
id               0
date             0
reviewer_id      0
reviewer_name    0
comments         0
dtype: int64

In [16]:
text

'Daniel is really cool. The place was nice and clean. Very quiet neighborhood. He had maps and a lonely planet guide book in the room for you to use. I didnt have any trouble finding the place from Central Station. I would defintely come back! Thanks!'

In [14]:
# Try the tokenizer on the sample review (rm_stopwords defaults to True).
Tokenizer(text)

['daniel',
 'realli',
 'cool',
 'place',
 'nice',
 'clean',
 'quiet',
 'neighborhood',
 'map',
 'lone',
 'planet',
 'guid',
 'book',
 'room',
 'use',
 'didnt',
 'troubl',
 'find',
 'place',
 'central',
 'station',
 'would',
 'defint',
 'come',
 'back',
 'thank']

In [34]:
# Use the review id as the row index.
# NOTE(review): this cell is not idempotent — running it a second time raises KeyError
# because 'id' is no longer a column.
# NOTE(review): the later merge on the 'listing_id' column joins columns on columns,
# which ignores the index, so this index does not appear to reach reviews_price —
# confirm whether 'id' is needed downstream.
reviews.set_index('id', inplace=True)

### Load y

In [37]:
# Load the prediction target (Price), keyed by listing_id.
prices = pd.read_csv('data/listing_avg_price.csv')

In [38]:
prices.head()

Unnamed: 0,listing_id,Price
0,2818,61.954545
1,3209,162.553191
2,20168,122.545455
3,25428,125.0
4,27886,148.643216


In [41]:
prices.dtypes

listing_id      int64
Price         float64
dtype: object

In [47]:
# Left-join each review to the price of its listing, then discard rows with any
# missing value (for example reviews whose listing has no price).
reviews_price = (
    reviews
    .merge(prices, how='left', on='listing_id')
    .dropna()
)

### Vectorizing text

Each review needs to be turned into a numeric vector of the same length.

First, try vectorizing by raw word counts, before moving on to TF-IDF with n-grams.

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    reviews_price['comments'],
    reviews_price['Price'],
    test_size=0.3,
    random_state=123,
)

# Bag-of-words counter that drops scikit-learn's built-in English stop words.
count = CountVectorizer(stop_words='english')

In [None]:
# Learn the vocabulary from the training reviews only, then reuse that fitted
# vocabulary for the test reviews so both matrices share the same columns.
count_train = count.fit_transform(X_train.values)
count_test = count.transform(X_test.values)

Slightly more sophisticated vectorization: TF-IDF over word n-grams (unigrams to trigrams)

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over unigrams, bigrams and trigrams (ngram_range=(1,3)); English stop
# words are removed and terms occurring in more than 70% of documents are ignored.
# NOTE(review): no min_df / max_features cap is set; the recorded output of the fitted
# training matrix shows roughly 6.0M features — consider limiting the vocabulary.
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7, ngram_range=(1,3))

In [51]:
# Fit the vocabulary and IDF weights on the training reviews only, then apply the
# already-fitted vectorizer to the test reviews so no test text influences the fit.
tfidf_train = tfidf.fit_transform(X_train.values)
tfidf_test = tfidf.transform(X_test.values)

In [52]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
tfidf_train

<232308x5992341 sparse matrix of type '<class 'numpy.float64'>'
	with 18424210 stored elements in Compressed Sparse Row format>