In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import re
import json
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/slindhult/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
general = pd.read_csv('generalized.csv')

In [3]:
with open('positive_vocab.txt', 'r') as filehandle:
    poslist = json.load(filehandle)
with open('negative_vocab.txt', 'r') as neg:
    negative = json.load(neg)

In [4]:
def make_new_data(url):
    uClient = uReq(url)
    page_html = uClient.read()
    page_soup = BeautifulSoup(page_html, 'html.parser')
    reviews = page_soup.find_all(class_="review_item")
    avg = str(page_soup.find(class_="bui-review-score__badge"))
    average_score = float(re.findall('Scored(.+)" class',avg)[0])
    review_list = []
    for review in reviews:
        string = str(review)
        reviewer_score = float(re.findall('Scored (.+) "', string)[0])
        
        tag = re.findall('•(.+)', string)
        tags = [i[8:] for i in tag]
        try:
            neg = string.split('</svg>')[1].split('</p>')[0]
        except:
            neg = 'No Negative'
        try:
            pos = string.split('</svg>')[2].split('</p>')[0]
        except:
            pos = 'No Positive'
        review_dict = {'Average_Score':average_score, 'Reviewer_Score':reviewer_score, 'Tags':tags,
                      'Negative_Review':neg, 'Positive_Review':pos}
        review_list.append(review_dict)
    return review_list

In [5]:
def clean_tags(dataset, column_name):
    for index, i in enumerate(dataset[column_name]):
        for idx, t in enumerate(i):
            if t not in ['Stayed 1-2 nights','Stayed 3-4 nights', 'Stayed 5+ nights', 'Business trip', 'Solo traveler', 'Leisure trip',
                         'Couple', 'Group', 'Family with young children', 'Family with older children']:
                if t in ['Stayed 1 night','Stayed 2 nights']:
                    dataset[column_name][index][idx] = 'Stayed 1-2 nights'
                if t in ['Stayed 3 nights','Stayed 4 nights']:
                    dataset[column_name][index][idx] = 'Stayed 3-4 nights'
                if t in ['Stayed 5 nights','Stayed 6 nights', 'Stayed 7 nights', 'Stayed 8 nights', 'Stayed 9 nights',
                         'Stayed 10 nights',  'Stayed 11 nights',
                     'Stayed 12 nights', 'Stayed 13 nights', 'Stayed 14 nights', 'Stayed 15 nights', 'Stayed 16 nights',
                         'Stayed 17 nights','Stayed 18 nights', 'Stayed 19 nights', 'Stayed 20 nights',
                     'Stayed 21 nights', 'Stayed 22 nights', 'Stayed 23 nights', 'Stayed 24 nights', 'Stayed 25 nights',
                         'Stayed 26 nights',
                     'Stayed 27 nights', 'Stayed 28 nights', 'Stayed 29 nights', 'Stayed 30 nights', 'Stayed 31 nights',]:
                    dataset[column_name][index][idx] = 'Stayed 5+ nights'
    for i in dataset[column_name]:
        for idx, t in enumerate(i):            
                if t not in ['Stayed 1-2 nights','Stayed 3-4 nights', 'Stayed 5+ nights', 'Business trip', 
                             'Solo traveler', 'Leisure trip', 'Couple', 'Group', 'Family with young children', 
                             'Family with older children']:
                    i.pop(idx)
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()

    tagdf = pd.DataFrame(mlb.fit_transform(dataset[column_name]),columns=mlb.classes_, index=dataset.index)


    dataset = dataset.join(tagdf)
    dataset.drop(column_name, axis=1, inplace=True)
    
    return dataset

In [6]:
def get_vader(dataframe, negative_review_col, positive_review_col):

    analyser = SentimentIntensityAnalyzer()
    dataframe[negative_review_col] = dataframe[negative_review_col].apply(lambda x: str(x).replace("No Negative", ""))
    dataframe[positive_review_col] = dataframe[positive_review_col].apply(lambda x: str(x).replace("No Positive", ""))
    
    dataframe['vader_pos_sent'] = dataframe[positive_review_col].apply(lambda x: analyser.polarity_scores(x)['compound'])
    dataframe['vader_neg_sent'] = dataframe[negative_review_col].apply(lambda x: analyser.polarity_scores(x)['compound'])

In [7]:
def review_word_count(dataframe, negative_review_col, positive_review_col):

    analyser = SentimentIntensityAnalyzer()
    
    dataframe['Review_Total_Negative_Word_Counts'] = dataframe[positive_review_col].apply(lambda x: len(x.split()))
    dataframe['Review_Total_Positive_Word_Counts'] = dataframe[negative_review_col].apply(lambda x: len(x.split()))

In [8]:
def count_vectorize(dataframe, negative_review_col, positive_review_col, poslist, neglist):
    countpos = CountVectorizer(stop_words='english', vocabulary = poslist)
    countneg = CountVectorizer(stop_words='english', vocabulary = neglist)
    
    pos_count = countpos.fit_transform(dataframe[positive_review_col].values.astype('U'))
    pos_col_names = countpos.get_feature_names()
    pos_count = pos_count.todense()
    pos_count = pd.DataFrame(pos_count, columns = pos_col_names)
    dataframe = dataframe.join(pos_count)
    
    neg_count = countneg.fit_transform(dataframe[negative_review_col].values.astype('U'))
    neg_col_names = countneg.get_feature_names()
    neg_count = neg_count.todense()
    neg_count = pd.DataFrame(neg_count, columns = neg_col_names)
    neg_count = neg_count.add_suffix('_neg')
    dataframe = dataframe.join(neg_count)
    
    dataframe.drop(negative_review_col, axis=1, inplace=True)
    dataframe.drop(positive_review_col, axis=1, inplace=True)
    return dataframe

In [9]:
def fill_missing_cols(dataframe, general_cols_list, cols_list):
    missing = [i for i in general.columns if i not in dataframe.columns]
    for i in missing:
        dataframe[i] = 0

In [None]:
html_doc = 'https://www.booking.com/hotel/us/boulderado.html?aid=355028;sid=0113f0ad8d73e9bed1013fc062ca0ea9;dest_id=20017143;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=19;hpos=19;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1590554078;srpvid=6b7d202e9c440044;type=total;ucfs=1&#tab-reviews'
boulderado = make_new_data(html_doc)
data = pd.DataFrame(boulderado)

In [None]:
data = clean_tags(data, 'Tags')
get_vader(data, 'Negative_Review', 'Positive_Review')
review_word_count(data, 'Negative_Review', 'Positive_Review')
data = count_vectorize(data, 'Negative_Review', 'Positive_Review', poslist, negative)
fill_missing_cols(data, general.columns, data.columns)

In [None]:
data

In [16]:
y = general.pop('Reviewer_Score')
X = general

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)

In [None]:
gdbr = GradientBoostingRegressor(learning_rate=0.01,
                                  loss='ls',
                                 max_depth=35,
                                  n_estimators=1250,
                                 min_samples_leaf=80,
                                 max_features=60,
                                  random_state=1,
                                verbose = True)

gdbr.fit(X_train, y_train)
y_hat = gdbr.predict(X_test)
mean_absolute_error(y_test, y_hat)

      Iter       Train Loss   Remaining Time 
         1           2.6541          109.83m
         2           2.6276          124.11m
         3           2.6019          125.13m
         4           2.5765          126.90m
         5           2.5517          127.73m
         6           2.5272          126.90m
         7           2.5036          129.86m
         8           2.4800          129.44m
         9           2.4571          128.42m
        10           2.4345          128.55m
        20           2.2285          126.15m
        30           2.0560          124.11m
        40           1.9108          122.56m
        50           1.7884          122.36m
        60           1.6855          122.02m
        70           1.5986          121.78m
        80           1.5244          121.81m
        90           1.4615          121.89m
       100           1.4075          121.94m


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
mean_absolute_error(y_test, y_hat)

In [None]:
import pickle 
  
# Save the trained model as a pickle string. 
saved_model = pickle.dumps(gdbr) 
  
# Load the pickled model 
gdbr_from_pickle = pickle.loads(saved_model) 
  
# Use the loaded pickled model to make predictions 
gdbr_from_pickle.predict(X_test) 

In [None]:
datay = data.pop('Reviewer_Score')

In [None]:
data_preds = gdbr.predict(data)

In [None]:
mean_absolute_error(datay, data_preds)

In [10]:
denv = 'https://www.booking.com/hotel/us/four-seasons-denver.html?aid=355028;sid=0113f0ad8d73e9bed1013fc062ca0ea9;all_sr_blocks=45379102_204868004_2_0_0;checkin=2020-07-16;checkout=2020-07-17;dest_id=20017349;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;highlighted_blocks=45379102_204868004_2_0_0;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;sr_pri_blocks=45379102_204868004_2_0_0__36550;srepoch=1592973316;srpvid=0fa8204262030060;type=total;ucfs=1&#tab-reviews'

In [13]:
denver = make_new_data(denv)
datad = pd.DataFrame(denver)
datad = clean_tags(datad, 'Tags')
get_vader(datad, 'Negative_Review', 'Positive_Review')
review_word_count(datad, 'Negative_Review', 'Positive_Review')
datad = count_vectorize(datad, 'Negative_Review', 'Positive_Review', poslist, negative)
fill_missing_cols(datad, general.columns, data.columns)

In [14]:
datad

Unnamed: 0,Average_Score,Reviewer_Score,Business trip,Couple,Family with young children,Group,Leisure trip,Solo traveler,Stayed 1-2 nights,Stayed 3-4 nights,...,wall_neg,water_neg,way_neg,went_neg,wifi_neg,window_neg,work_neg,working_neg,Family with older children,Stayed 5+ nights
0,9.2,8.0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,9.2,8.0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,9.2,8.0,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9.2,8.0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,9.2,10.0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,9.2,9.2,0,0,1,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
6,9.2,10.0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,9.2,7.5,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
8,9.2,10.0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,9.2,9.6,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [15]:
filename = 'gdbr_model.sav'
pickle.dump(model, open(filename, 'wb'))

NameError: name 'saved_model' is not defined

In [None]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)r