In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from textblob import TextBlob
import nltk
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Scraping

In [None]:
# Download the Inside Airbnb "get the data" page and parse its HTML.
url = 'http://insideairbnb.com/get-the-data/'
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were the data index.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Find the <div id="___gatsby"> inside the page body and collect the href of every anchor in it.
body = soup.find('body', class_='layout-module--content--2bfc1')
div = body.find('div', {"id": "___gatsby"}) if body is not None else None
if div is None:
    # Both lookups depend on the site's current markup (the body class carries a
    # build hash); fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(
        "Could not locate the link container on the page; the page markup may have changed."
    )
file_urls = div.find_all('a')
links = [link.get('href') for link in file_urls]

# Anchors without an href yield None; drop them, then keep only United States
# downloads and split them into listings and reviews archives.
us_links = [link for link in links if link is not None and "united-states" in link]
listings_list = [link for link in us_links if "listings.csv.gz" in link]
reviews_list = [link for link in us_links if "reviews.csv.gz" in link]

In [None]:
# Show the collected United States listings.csv.gz URLs.
listings_list

['http://data.insideairbnb.com/united-states/ny/albany/2023-10-01/data/listings.csv.gz',
 'http://data.insideairbnb.com/united-states/nc/asheville/2023-09-13/data/listings.csv.gz',
 'http://data.insideairbnb.com/united-states/tx/austin/2023-09-10/data/listings.csv.gz',
 'http://data.insideairbnb.com/united-states/ma/boston/2023-09-16/data/listings.csv.gz',
 'http://data.insideairbnb.com/united-states/mt/bozeman/2023-11-10/data/listings.csv.gz',
 'http://data.insideairbnb.com/united-states/fl/broward-county/2023-09-21/data/listings.csv.gz',
 'http://data.insideairbnb.com/united-states/ma/cambridge/2023-09-23/data/listings.csv.gz',
 'http://data.insideairbnb.com/united-states/il/chicago/2023-09-12/data/listings.csv.gz',
 'http://data.insideairbnb.com/united-states/nv/clark-county-nv/2023-09-16/data/listings.csv.gz',
 'http://data.insideairbnb.com/united-states/oh/columbus/2023-09-22/data/listings.csv.gz',
 'http://data.insideairbnb.com/united-states/tx/dallas/2023-10-19/data/listings.csv

In [None]:
# Show the collected United States reviews.csv.gz URLs.
reviews_list

['http://data.insideairbnb.com/united-states/ny/albany/2023-10-01/data/reviews.csv.gz',
 'http://data.insideairbnb.com/united-states/nc/asheville/2023-09-13/data/reviews.csv.gz',
 'http://data.insideairbnb.com/united-states/tx/austin/2023-09-10/data/reviews.csv.gz',
 'http://data.insideairbnb.com/united-states/ma/boston/2023-09-16/data/reviews.csv.gz',
 'http://data.insideairbnb.com/united-states/mt/bozeman/2023-11-10/data/reviews.csv.gz',
 'http://data.insideairbnb.com/united-states/fl/broward-county/2023-09-21/data/reviews.csv.gz',
 'http://data.insideairbnb.com/united-states/ma/cambridge/2023-09-23/data/reviews.csv.gz',
 'http://data.insideairbnb.com/united-states/il/chicago/2023-09-12/data/reviews.csv.gz',
 'http://data.insideairbnb.com/united-states/nv/clark-county-nv/2023-09-16/data/reviews.csv.gz',
 'http://data.insideairbnb.com/united-states/oh/columbus/2023-09-22/data/reviews.csv.gz',
 'http://data.insideairbnb.com/united-states/tx/dallas/2023-10-19/data/reviews.csv.gz',
 'htt

Running the analysis for all cities takes a significant amount of time and compute. Set the specific cities to analyze here. Currently, only the first city in each list is selected; remove the `[:1]` slices to run all cities.


In [None]:
# Keep only the first entry of each URL list. Both lists are sliced the same
# way so that the listings/reviews pairs consumed by zip() stay aligned.
listings_list, reviews_list = listings_list[:1], reviews_list[:1]

## Sentiment score calculation

In [None]:
# Single VADER analyzer instance, reused for every review comment scored below.
sid = SentimentIntensityAnalyzer()

In [None]:
def analyze_sentiment(text):
    """Return the TextBlob polarity score for ``text``.

    Parameters
    ----------
    text : str or any
        The review comment to score. Non-string values (for example ``None``
        or a float NaN standing in for a missing comment) are not scored.

    Returns
    -------
    float or int
        ``TextBlob(text).sentiment.polarity`` for string input, or ``0`` when
        ``text`` is not a string.
    """
    # Explicit guard instead of a bare ``except``: non-text input still maps to
    # a neutral 0, but genuine errors (a missing import, an interrupt, a bug in
    # the analyzer) are no longer silently turned into a score of 0.
    if not isinstance(text, str):
        return 0
    return TextBlob(text).sentiment.polarity

In [None]:
# For every (listings, reviews) URL pair: load both CSVs, reduce the listings to
# one row per listing id with an averaged review score, score every review
# comment with TextBlob and with VADER, and join the per-listing mean sentiment
# back onto the listings. The per-city results are stacked into
# `textblob_big_df` and `vader_big_df`.
rating_columns = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]
listing_columns = ['id', 'name', 'latitude', 'longitude', 'price'] + rating_columns

# Collect per-city frames and concatenate once at the end: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies it each time.
textblob_frames = []
vader_frames = []
for listing, review in zip(listings_list, reviews_list):
    listings_df = pd.read_csv(listing)
    reviews_df = pd.read_csv(review)

    # get city and state from url (.../<state>/<city>/<date>/data/listings.csv.gz)
    url_parts = listing.split('/')
    state_name = url_parts[-5]
    city_name = url_parts[-4].replace("-", " ")

    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the frame returned by read_csv.
    listings_df = listings_df[listing_columns].copy()
    listings_df['state'] = state_name.upper()
    listings_df['city'] = city_name.title()
    # Row-wise mean of the review score columns.
    listings_df['average'] = listings_df[rating_columns].mean(axis=1)
    listings_df = listings_df.groupby('id').agg({
        'name': 'first',
        'latitude': 'first',
        'longitude': 'first',
        'price': 'first',
        'average': 'mean',
        'state': 'first',
        'city': 'first',
    })

    reviews_df = reviews_df[['listing_id', 'comments']].rename(columns={'listing_id': 'id'})
    # Strip everything except ASCII letters and whitespace. Raw string so that
    # `\s` reaches the regex engine without an invalid-escape warning.
    reviews_df['comments'] = reviews_df['comments'].replace(r'[^a-zA-Z\s]', '', regex=True)

    # Score with TextBlob, then average per listing. The scores live in their
    # own Series, so `reviews_df` is left untouched for the VADER pass.
    textblob_scores = reviews_df['comments'].apply(analyze_sentiment)
    textblob_df = (
        reviews_df.assign(sentiment_score=textblob_scores)
        .groupby('id')
        .agg({'sentiment_score': 'mean'})
    )
    textblob_frames.append(pd.merge(listings_df, textblob_df, on='id'))

    # Same aggregation with VADER's compound score; str() makes non-string
    # comments acceptable to polarity_scores.
    vader_scores = reviews_df['comments'].apply(
        lambda comments: sid.polarity_scores(str(comments))['compound']
    )
    vader_df = (
        reviews_df.assign(sentiment_score=vader_scores)
        .groupby('id')
        .agg({'sentiment_score': 'mean'})
    )
    vader_frames.append(pd.merge(listings_df, vader_df, on='id'))

    print("Complete", city_name, state_name)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# cities were processed (matching the previous empty-DataFrame starting value).
textblob_big_df = pd.concat(textblob_frames) if textblob_frames else pd.DataFrame()
vader_big_df = pd.concat(vader_frames) if vader_frames else pd.DataFrame()

In [None]:
# Preview the first five listings with their TextBlob-based mean sentiment score.
textblob_big_df.head(5)

Unnamed: 0_level_0,name,latitude,longitude,price,average,state,city,sentiment_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1489424,Home in Albany · ★4.73 · 1 bedroom · 1 bed · 1...,42.66719,-73.8158,$50.00,4.791429,NY,Albany,0.38476
2992450,Rental unit in Albany · ★3.56 · 2 bedrooms · 2...,42.65789,-73.7537,$70.00,3.747143,NY,Albany,0.208633
3820211,Rental unit in Albany · ★4.75 · 1 bedroom · 1 ...,42.65222,-73.76724,$115.00,4.832857,NY,Albany,0.396455
5651579,Rental unit in Albany · ★4.51 · Studio · 1 bed...,42.64615,-73.75966,$68.00,4.668571,NY,Albany,0.381678
6623339,Rental unit in Albany · ★4.73 · 1 bedroom · 1 ...,42.65222,-73.76724,$140.00,4.761429,NY,Albany,0.426687


In [None]:
# Preview the first five listings with their VADER-based mean sentiment score.
vader_big_df.head(5)

Unnamed: 0_level_0,name,latitude,longitude,price,average,state,city,sentiment_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1489424,Home in Albany · ★4.73 · 1 bedroom · 1 bed · 1...,42.66719,-73.8158,$50.00,4.791429,NY,Albany,0.792217
2992450,Rental unit in Albany · ★3.56 · 2 bedrooms · 2...,42.65789,-73.7537,$70.00,3.747143,NY,Albany,0.493991
3820211,Rental unit in Albany · ★4.75 · 1 bedroom · 1 ...,42.65222,-73.76724,$115.00,4.832857,NY,Albany,0.81154
5651579,Rental unit in Albany · ★4.51 · Studio · 1 bed...,42.64615,-73.75966,$68.00,4.668571,NY,Albany,0.666715
6623339,Rental unit in Albany · ★4.73 · 1 bedroom · 1 ...,42.65222,-73.76724,$140.00,4.761429,NY,Albany,0.768897


In [None]:
from google.colab import files

# Write each listing-level table to CSV in the working directory, then pass the
# same file name to google.colab's files.download.
for frame, csv_name in (
    (textblob_big_df, 'listing_sentiment_textblob.csv'),
    (vader_big_df, 'listing_sentiment_vader.csv'),
):
    frame.to_csv(csv_name)
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [32]:
# Collapse the listing-level tables to one row per city, taking the mean of the
# sentiment score and of the averaged review score.
city_mean_spec = {"sentiment_score": "mean", "average": "mean"}
city_df_textblob = textblob_big_df.groupby("city").agg(city_mean_spec)
city_df_vader = vader_big_df.groupby("city").agg(city_mean_spec)

In [None]:
# Write each city-level summary to CSV and pass the same file name to files.download.
city_df_textblob.to_csv('city_sentiment_textblob.csv')
files.download('city_sentiment_textblob.csv')

city_df_vader.to_csv('city_sentiment_vader.csv')
# The download name must match the file written on the line above; it was
# previously misspelled as 'city_sentiment_vaderp.csv', a file that is never created.
files.download('city_sentiment_vader.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>