## Analyzing reviews from web pages using web-scraping tools and NLTK

### Step 1: Build the web-scraping tool



<div class="rvw js-rvw" id="review-7157945" data-id="7157945" itemprop="reviews" itemscope itemtype="http://schema.org/Review"><div class="rvw__hdr"><div class="rvw__hdr-stat" itemtype="http://schema.org/Rating" itemscope itemprop="reviewRating"><meta itemprop="worstRating" content="1"><img data-rating="5.0" src="//media.consumeraffairs.com/static/img/icons/stars/ca-stars-5.61b7b8c7580b.svg" alt="Rated with 5 stars" class="stars-rtg stars-rtg--sm" /><meta itemprop="ratingValue" content="5"><meta itemprop="bestRating" content="5"></div></div><div class="rvw-aut"><div class="rvw__pic rvw__pic--no-pic"></div><div class="rvw-aut__inf"><strong class="rvw-aut__inf-nm" itemprop="author">MARSHALL of Atlanta, GA</strong><strong class="rvw-aut__inf-ver">

In [98]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
from joblib import Parallel, delayed
import time
import numpy as np
import pandas as pd


def get_html(URL, max_retries=None, retry_delay=5, timeout=30):
    """Fetch ``URL`` with TLS verification, retrying on request failures.

    Parameters
    ----------
    URL : str
        Address of the page to download.
    max_retries : int or None, optional
        Number of retries allowed after the first failed attempt.  ``None``
        (the default) retries indefinitely.
    retry_delay : float, optional
        Seconds to sleep between attempts.
    timeout : float, optional
        Per-request timeout in seconds, so a stalled connection is retried
        instead of blocking forever.

    Returns
    -------
    requests.Response
        The response of the first request that did not raise.

    Raises
    ------
    requests.exceptions.RequestException
        Re-raised from the last attempt once ``max_retries`` is exhausted.
    """
    failures = 0
    while True:
        try:
            # The parameter is ``URL``; the previous body read an undefined
            # ``url``, so every attempt failed and the loop never ended.
            return requests.get(URL, verify=True, timeout=timeout)
        except requests.exceptions.RequestException:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(retry_delay)

class get_html_content:
    """Download one review page and collect the raw pieces of each review.

    Parameters
    ----------
    URL : str
        Address of the review page.
    verify : bool, optional
        Passed to ``requests.get``; TLS certificates are verified by default.
    timeout : float, optional
        Request timeout in seconds.

    Attributes
    ----------
    page : requests.Response
        Raw HTTP response.
    soup : BeautifulSoup
        Parsed document (``html.parser``).
    meta : list
        Every ``<meta>`` tag of the page.
    review_date : list
        ``<span class="ca-txt-cpt">`` tags.
    review_auth : list
        ``<div class="rvw-aut">`` tags.
    review_body : list
        ``<div class="rvw-bd">`` tags.
    rating_list : list of [content, itemprop]
        One pair per ``<meta>`` whose ``itemprop`` is ``ratingValue``
        (case-insensitive, surrounding whitespace ignored).
    rating_vals : list
        Integer ratings with exactly ``len(review_date)`` entries; always set.
    """

    def __init__(self, URL, verify=True, timeout=30):
        self.page = requests.get(URL, verify=verify, timeout=timeout)
        self.soup = BeautifulSoup(self.page.content, 'html.parser')

        self.meta = self.soup.find_all('meta')
        self.review_date = self.soup.find_all('span', attrs={'class': 'ca-txt-cpt'})
        self.review_auth = self.soup.find_all('div', attrs={'class': 'rvw-aut'})
        self.review_body = self.soup.find_all('div', class_='rvw-bd')

        # ``.get('content')`` keeps a meta tag without a content attribute
        # from aborting the whole page; it is turned into NaN below.
        self.rating_list = [
            [tag.attrs.get('content'), tag.attrs['itemprop']]
            for tag in self.meta
            if 'itemprop' in tag.attrs
            and tag.attrs['itemprop'].strip().lower() in ['ratingvalue']
        ]

        self.rating_vals = self._rating_values(self.rating_list,
                                               len(self.review_date))

    @staticmethod
    def _rating_values(rating_list, n_reviews):
        """Return ``n_reviews`` ratings built from ``rating_list``.

        Each entry of ``rating_list`` is a ``[content, itemprop]`` pair whose
        content is converted with ``int``.  If any content cannot be
        converted, every rating becomes NaN.  A short list is padded with
        NaN; a long list is cut to ``n_reviews``.
        """
        try:
            ratings = [int(content) for content, _ in rating_list]
        except (TypeError, ValueError):
            return [np.nan] * n_reviews

        # NOTE(review): ratings are matched to reviews by position only.  If
        # the page carries extra ratingValue metas (more ratings than date
        # spans), the surplus is dropped here -- confirm against the live
        # markup that the leading entries are the per-review ones.
        ratings = ratings[:n_reviews]
        return ratings + [np.nan] * (n_reviews - len(ratings))
                


class rm_html_tags():
    """Convert review HTML fragments into plain Python values.

    Every method accepts anything whose ``str()`` is the HTML of one
    fragment (for example a BeautifulSoup tag or a plain string).
    """

    # Compiled once at class creation instead of on every call.
    _TAG_RE = re.compile(r'<.*?>')
    _BODY_PREFIX_RE = re.compile(r'<div.*?<p><p>')
    _WHITESPACE_RE = re.compile(r'\s+')
    # Whole-word match so "SEPT." is normalised but "SEPTEMBER" is left alone.
    _SEPT_RE = re.compile(r'\bSEPT\b')

    _DATE_PREFIXES = ("ORIGINAL REVIEW: ", "RESOLUTION RESPONSE: ")
    # Abbreviated month with a trailing dot, then full month name.
    _DATE_FORMATS = ('%b. %d, %Y', '%B %d, %Y')

    def review_body(self, text):
        """Return the review text with markup removed.

        Everything from an opening ``<div`` up to the first ``<p><p>`` is
        dropped, then all remaining tags are removed and each run of
        whitespace is collapsed to a single space.
        """
        without_prefix = self._BODY_PREFIX_RE.sub('', str(text))
        without_tags = self._TAG_RE.sub('', without_prefix)
        return self._WHITESPACE_RE.sub(' ', without_tags)

    def review_date(self, text):
        """Return the review date as ``'YYYY-MM-DD'``, or ``'NA'``.

        Tags are stripped and the text is upper-cased.  A leading
        ``ORIGINAL REVIEW: `` or ``RESOLUTION RESPONSE: `` label is removed,
        and the abbreviation ``SEPT`` is rewritten to ``SEP`` so ``%b`` can
        parse it.  ``'NA'`` is returned when neither supported format matches.
        """
        cleaned = self._TAG_RE.sub('', str(text)).upper()
        cleaned = self._SEPT_RE.sub('SEP', cleaned)

        for prefix in self._DATE_PREFIXES:
            if prefix in cleaned:
                cleaned = cleaned.replace(prefix, '')
                break

        # strptime rejects leading/trailing whitespace left over from markup.
        cleaned = cleaned.strip()

        for date_format in self._DATE_FORMATS:
            try:
                return datetime.strptime(cleaned, date_format).strftime('%Y-%m-%d')
            except ValueError:
                continue

        return 'NA'

    def review_author(self, text):
        """Return ``[author, location]`` parsed from ``"<author> of <location>"``.

        Tags, newlines, double spaces and the ``Verified Reviewer`` badge
        text are removed first.  ``['NA', 'NA']`` is returned unless the
        remaining text contains exactly one `` of `` separator.
        """
        plain = self._TAG_RE.sub('', str(text))
        plain = (plain.replace('\n', '')
                      .replace('  ', '')
                      .replace('Verified Reviewer', ''))

        parts = plain.split(' of ')
        if len(parts) == 2:
            return parts
        return ['NA', 'NA']

        


def combine_review(URL):
    """Scrape one review page into a DataFrame with one row per review.

    Parameters
    ----------
    URL : str
        Address of the review page.

    Returns
    -------
    pandas.DataFrame
        Columns ``author``, ``location``, ``date``, ``review`` and
        ``rating``, indexed 0..n-1.  An empty, column-less DataFrame is
        returned when the page cannot be loaded or contains no reviews.
    """
    try:
        page = get_html_content(URL)
        print('getting page content')
    except Exception as err:
        # Best effort: one unreachable page must not abort a batch run.
        print('page is None')
        print('  reason:', repr(err))
        return pd.DataFrame()

    cleaner = rm_html_tags()
    ratings = getattr(page, 'rating_vals', None) or []

    rows = []
    # zip() stops at the shortest list, so a page whose author/body/date
    # counts differ yields the reviews that are complete instead of raising
    # IndexError part-way through.
    fragments = zip(page.review_auth, page.review_body, page.review_date)
    for position, (auth_tag, body_tag, date_tag) in enumerate(fragments):
        author, location = cleaner.review_author(auth_tag)
        rows.append({
            'author': author,
            'location': location,
            'date': cleaner.review_date(date_tag),
            'review': cleaner.review_body(body_tag),
            # Missing ratings are reported as NaN.
            'rating': ratings[position] if position < len(ratings) else float('nan'),
        })

    if not rows:
        return pd.DataFrame()

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every iteration.
    return pd.DataFrame(rows, columns=['author', 'location', 'date', 'review', 'rating'])
        
        

### Test:

In [95]:
# Smoke test: scrape a single page of Southwest reviews.
# ``front_part`` and ``end_part`` are reused by the batch cell further down.
i = 1
front_part = 'https://www.consumeraffairs.com/travel/southwest.html?page='
end_part = '#sort=top_reviews&filter=none'

url_test = '{}{}{}'.format(front_part, i, end_part)
test2 = combine_review(url_test)



getting page content


In [92]:
import os

# Pages n1 .. n2-1 are scraped (range() excludes n2).
n1 = 0
n2 = 20

# Scrape the pages in parallel; each call returns one DataFrame per page.
start = time.time()
page_list = pd.concat(Parallel(n_jobs=-1)(
    delayed(combine_review)(front_part + str(i) + end_part) for i in range(n1, n2)
))
end = time.time()
print(end - start)

# Unparseable dates are stored as the string 'NA', which would otherwise
# compare greater than every 'YYYY-MM-DD' value and win max().
valid_dates = page_list.loc[page_list['date'] != 'NA', 'date']
min_date = valid_dates.min()
max_date = valid_dates.max()

# to_csv does not create missing directories.
out_dir = 'consumer_affairs_webscrap'
os.makedirs(out_dir, exist_ok=True)
page_list.to_csv(
    os.path.join(out_dir, 'consumer_affairs_webscraping_batch_' + str(n1) + '_' + str(n2 - 1) + '.csv'),
    index=False,
)


7.567324161529541


# Fetch the first page of Enterprise reviews for interactive inspection.
# Note: this rebinds ``front_part`` used by the earlier cells.
i = 1
front_part = 'https://www.consumeraffairs.com/travel/enterprise.html?page='
end_part = '#sort=top_reviews&filter=none'

url_test = '{}{}{}'.format(front_part, i, end_part)
test_page = get_html_content(url_test)

### Notes:

- How can we combine free-text information to create a numerical score for each review?
- Better survey design
- How to account for different experiences in a customer's journey
  - Question: causal inference; identify a list of potential 'triggers' for each type of customer


1) Obtain topics from raw text review data 

2) Design questions that align with the topics from 1)

3) Create a simple yes/no approach

