## Explores Customer Sentiments of AirBnB Guests:
In Seattle, Newyork and San Francisco area:

Tries to explore the question, are customers happy?

In [1]:
import spacy
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
tqdm.pandas()
nlp = spacy.load('en_core_web_lg', disable = ['ner', 'parser'])
import nltk
from stop_words import get_stop_words
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from collections import Counter
import syllables
import matplotlib.pyplot as plt
%matplotlib inline
import cld2
import altair as alt
from altair_saver import save
import unicodedata

In [2]:
# enable correct rendering (unnecessary in later versions of Altair)
alt.renderers.enable('default')
# uses intermediate json files to speed things up
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

### 1. Read files

In [3]:
# Read Reviews: 
san = pd.read_csv('../Dataset/Sanfrancisco/detailed_reviews.csv')
new = pd.read_csv('../Dataset/Newyork/detailed_reviews.csv')
sea = pd.read_csv('../Dataset/Seattle/detailed_reviews.csv')

# Add location name to each review
san['location']= 'sanfrancisco'
new['location'] = 'newyork'
sea['location'] = 'seattle'


In [4]:
# Concat reviews in seattle, new york and sanfrancisco:
df_rew = pd.concat([sea, new, san], axis= 0, ignore_index=True)

In [5]:
df_rew.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,location
0,2318,146,2008-09-15,2451,Kevin,1000 times better than staying at a hotel.,seattle
1,2318,126302712,2017-01-10,12332845,Jessica,"Our family (two couples, a two year old and an...",seattle
2,2318,140977084,2017-04-01,4789466,Ivan,Top of the list locations we have stayed at! T...,seattle
3,2318,147262504,2017-04-25,55817131,Mike,"SUCH an awesome place. Very clean, quiet and s...",seattle
4,2318,161806368,2017-06-18,113604590,Pete,We flew quite a distance to be at our only dau...,seattle


### 2. Clean File: 

In [6]:
def remove_not_stopwords():
    
    """Return stopwords without negation"""
    
    # Get stopwords:
    stopword = list(stopwords.words('english'))
    
    # Get more stopwords: 
    stop_word = list(get_stop_words('en'))
    
    # Get all stop words:
    stop = stopword+stop_word
    
    # Remove negation from stopwords (n't)
    stop_clean = []
    for i in stop_word: 
        stop_clean.append(re.sub("(n't)", '', i))
    
    # Remove remaining negated stopwords 
    stop_neutral = [i for i in stop_clean if not i in ['but', 'ca', 'cannot','no', 'nor', 'not', 'sha', 'wo']]
    
    return stop_neutral

In [7]:
remove_not_stopwords()[:10]

['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and']

In [8]:
from google_trans_new import google_translator  
#translator = google_translator() 
#detect = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)

def normalize_text(df,text):
    """Normalizes text in to lower capital letters, removes non-alpha numeric characters,
        digits, new lines, stopwords from the original text"""
    
    stop_neutral = remove_not_stopwords()
    
    df['norm_comments'] = (df[text]
                .progress_apply(lambda x: str(x).lower())                                                                     #lower text
                .progress_apply(lambda z: re.sub('\n',"", z))                                                                 #remove new line
                .progress_apply(lambda s: [i for i in s.split() if not i in stop_neutral])                                    #remove stopwords
                .progress_apply(lambda j: " ".join(j))                                                                        #join text  
                .progress_apply(lambda y: re.sub('[\,\:\;\!\_\-\(\)\"\\/\.\d+]',"", y))                                       #remove non-alpha numeric characters except '', 
                .progress_apply(lambda e: ''.join([l for l in str(e) if unicodedata.category(l)[0] not in ('S', 'M', 'C')]))) #remove undesired utf-8 characters
    
    # detect language: en=1, all other languages 0
    df['lang_type'] = pd.DataFrame([cld2.detect(df['norm_comments'][i]) for i in range(0,len(df['norm_comments']))])['details'].progress_apply(lambda x: x[0][0].lower())
    
    # assign english=1, others= 0
    df['english'] = np.where(df['lang_type']=='english', 1, 0)
    
    return df

def convert_date(df): 
    """Converts date in to datetime object and gets year, month, week columns"""
    
    # Convert date to datetime: 
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df.date.dt.month
    df['year'] = df.date.dt.year
    df['week'] = df.date.dt.week
    
    return df

In [9]:
df_norm = normalize_text(df_rew, 'comments')
df_norm = convert_date(df_rew)

100%|██████████| 1357656/1357656 [00:02<00:00, 624378.18it/s]
100%|██████████| 1357656/1357656 [00:02<00:00, 579337.75it/s]
100%|██████████| 1357656/1357656 [01:36<00:00, 14008.54it/s]
100%|██████████| 1357656/1357656 [00:02<00:00, 651562.98it/s]
100%|██████████| 1357656/1357656 [00:06<00:00, 211788.10it/s]
100%|██████████| 1357656/1357656 [00:58<00:00, 23042.32it/s]
100%|██████████| 1357656/1357656 [00:01<00:00, 898114.89it/s]


In [10]:
df_norm.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,location,norm_comments,lang_type,english,month,year,week
0,2318,146,2008-09-15,2451,Kevin,1000 times better than staying at a hotel.,seattle,times better staying hotel,english,1,9,2008,38
1,2318,126302712,2017-01-10,12332845,Jessica,"Our family (two couples, a two year old and an...",seattle,family two couples two year old infant incredi...,english,1,1,2017,2
2,2318,140977084,2017-04-01,4789466,Ivan,Top of the list locations we have stayed at! T...,seattle,top list locations stayed at house perfect fam...,english,1,4,2017,13
3,2318,147262504,2017-04-25,55817131,Mike,"SUCH an awesome place. Very clean, quiet and s...",seattle,awesome place clean quiet spacious loved it,english,1,4,2017,17
4,2318,161806368,2017-06-18,113604590,Pete,We flew quite a distance to be at our only dau...,seattle,flew quite distance daughter's graduation need...,english,1,6,2017,24


In [11]:
# See unique languages
df_norm['lang_type'].unique()

# Not all reviews are in english

array(['english', 'french', 'dutch', 'galician', 'unknown', 'chinese',
       'japanese', 'german', 'italian', 'spanish', 'swedish', 'chineset',
       'danish', 'korean', 'portuguese', 'norwegian', 'persian', 'polish',
       'russian', 'finnish', 'catalan', 'greek', 'scots_gaelic',
       'turkish', 'indonesian', 'thai', 'cebuano', 'hungarian', 'irish',
       'hebrew', 'kurdish', 'slovak', 'czech', 'arabic', 'vietnamese',
       'afrikaans', 'malagasy', 'serbian', 'icelandic', 'slovenian',
       'croatian', 'estonian', 'romanian', 'bosnian', 'swahili', 'hmong',
       'javanese', 'sesotho', 'ukrainian', 'basque', 'malay', 'uzbek',
       'ganda', 'kinyarwanda', 'sundanese'], dtype=object)

### 3. Apply Sentiment analysis on unsupervised text: 

In [12]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

def sentiment(df, text):
    """Takes normalized text as datframe to estimate overall sentiment_polarity"""
    
    # set sentiment: 
    df['scores'] = df[text].progress_apply(lambda x: analyser.polarity_scores(x))
    df['compound'] = df['scores'].progress_apply(lambda y: y['compound'])
    
    # assign rating (positive: >=0.05, negative: <=-0.05 , neutral: >-0.05 & < 0.05)
    df['rating'] = np.where(df['compound']>=0.05, 'positive', (np.where((df['compound']<0.05) & (df['compound']> -0.05), 'neutral', 'negative')))
        
    return df
        


In [13]:
df_senti = sentiment(df_norm, 'norm_comments')

100%|██████████| 1357656/1357656 [07:06<00:00, 3181.99it/s]
100%|██████████| 1357656/1357656 [00:01<00:00, 1119655.22it/s]


In [14]:
df_senti['rating'].unique()

array(['positive', 'neutral', 'negative'], dtype=object)

In [15]:
df_senti.head(2)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,location,norm_comments,lang_type,english,month,year,week,scores,compound,rating
0,2318,146,2008-09-15,2451,Kevin,1000 times better than staying at a hotel.,seattle,times better staying hotel,english,1,9,2008,38,"{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'comp...",0.4404,positive
1,2318,126302712,2017-01-10,12332845,Jessica,"Our family (two couples, a two year old and an...",seattle,family two couples two year old infant incredi...,english,1,1,2017,2,"{'neg': 0.033, 'neu': 0.607, 'pos': 0.36, 'com...",0.9974,positive


In [16]:
df_senti.location.value_counts()

newyork         821858
sanfrancisco    279937
seattle         255861
Name: location, dtype: int64

### 4. Visualize Sentiments by Language & Year

In [17]:
def summarize_sentiments(df, col = 'lang_type'): 
    """Calculates % sentiment rating by column (language or year)"""
    
    # Count ratings: 
    count = df.groupby(['rating', col, 'english', 'location'])[col].count()
    
    # Total count of reviews:
    totalcount = df.groupby('location')[col].count()
    pct = count/totalcount
    
    df_senti_grp = pd.concat([count, pct], axis = 1, keys = ['count', 'pct']).reset_index()
    
    return df_senti_grp


In [18]:
summary_sentiments_lang = summarize_sentiments(df_senti, col = 'lang_type')
summary_sentiments_year = summarize_sentiments(df_senti, col = 'year')

In [19]:
display(summary_sentiments_lang.head(), summary_sentiments_year.head(), df_senti.head(2))

Unnamed: 0,rating,lang_type,english,location,count,pct
0,negative,afrikaans,0,newyork,1,1e-06
1,negative,catalan,0,newyork,33,4e-05
2,negative,catalan,0,sanfrancisco,6,2.1e-05
3,negative,catalan,0,seattle,1,4e-06
4,negative,chinese,0,newyork,4,5e-06


Unnamed: 0,rating,year,english,location,count,pct
0,negative,2009,0,newyork,1,1e-06
1,negative,2010,0,newyork,1,1e-06
2,negative,2010,0,sanfrancisco,1,4e-06
3,negative,2010,1,newyork,5,6e-06
4,negative,2010,1,sanfrancisco,2,7e-06


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,location,norm_comments,lang_type,english,month,year,week,scores,compound,rating
0,2318,146,2008-09-15,2451,Kevin,1000 times better than staying at a hotel.,seattle,times better staying hotel,english,1,9,2008,38,"{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'comp...",0.4404,positive
1,2318,126302712,2017-01-10,12332845,Jessica,"Our family (two couples, a two year old and an...",seattle,family two couples two year old infant incredi...,english,1,1,2017,2,"{'neg': 0.033, 'neu': 0.607, 'pos': 0.36, 'com...",0.9974,positive


In [20]:
def plot_sentiments(df1, df2, df3): 
    """Plot a bar graph showing % sentiments by Language and Year.
       Input : 1. summary_sentiments_lang 
               2. summary_sentiments_year
               3. df_senti
        Output: Plot of summary sentiments"""
    
    # Build radio button: 
    
    # get cities: 
    city = list(df3['location'].unique())
    
    # radio, name
    selectcity = alt.selection_single(fields=['location'],
                                      init={'location': city[0]},
                                      bind=alt.binding_radio(options=city, name = 'Select City'))
    
    # Sentiments by Language:
    sentiment_plot_lang = (alt.Chart(df1)
                     .mark_bar()
                     .encode(y=alt.Y('pct:Q', 
                                     stack= 'normalize',
                                     axis = alt.Axis(title = '% Sentiment Rating English vs Others',
                                     format= '%', labelColor= 'grey')),
                             x = alt.X('rating:N',
                                       axis = alt.Axis(title = 'Sentiments Rating')),
                             color = alt.Color('english:N', 
                                               sort= alt.EncodingSortField(field = 'english',order = 'descending'),
                                               scale = alt.Scale(scheme = 'cividis')),
                             tooltip= [alt.Tooltip('location:N', title = 'City'),
                                       alt.Tooltip('lang_type:N', title = 'Language Name'),
                                       alt.Tooltip('rating:N', title = 'Sentiment Type'),
                                       alt.Tooltip('pct:Q', title = 'Percent', format = '.2%'),
                                       alt.Tooltip('count:Q', title = 'Count of Records'),
                                       ])
                      .properties(width =320,
                                  height = 400,
                                  title = ['% Sentiment Rating by Language (Normalized)'])
                           .add_selection(selectcity).transform_filter(selectcity))
    
    # Sentiments by Year:  
    sentiment_plot_year = (alt.Chart(df2)
                     .mark_bar(opacity = 0.8)
                     .encode(y=alt.Y('pct:Q', 
                                     stack= 'zero',
                                     axis = alt.Axis(title = '% Sentiments Rating',
                                     format= '%', labelColor= 'grey')),
                             x = alt.X('year:O',
                                       axis = alt.Axis(title = 'Year')),
                             color = alt.Color('rating:N',
                                               sort= alt.EncodingSortField(field = 'rating', order = 'descending'),
                                               scale = alt.Scale(scheme = 'cividis')),
                             tooltip= [alt.Tooltip('location:N', title = 'City'),
                                       alt.Tooltip('year:O', title = 'Year'),
                                       alt.Tooltip('rating:N', title = 'Sentiment Type'),
                                       alt.Tooltip('pct:Q', title = 'Percent', format = '.2%'),
                                       alt.Tooltip('count:Q', title = 'Count of Records')],
                             size = alt.Size('pct:Q', scale = alt.Scale(range = [10,150])))
                      .properties(width =320,
                                  height = 400,
                                  title = ['% Sentiment Rating by Year',])
                           .add_selection(selectcity).transform_filter(selectcity))
    
    
    # Count operational months : 
    count_yearmonths_plot = (alt.Chart(df3.groupby(['year', 'location'])['month'].unique().apply(lambda x: len(x)).reset_index())
                    .mark_circle(color = 'red', opacity = 0.6)
                    .encode(x = alt.X('year:O', axis = None),
                            y= alt.Y('month:Q', axis = None),
                            size = alt.Size('month:Q', legend = alt.Legend(title = ["total months", "operated/year"], values = [4,8,12])), 
                            tooltip = [alt.Tooltip('month:Q', title = 'total months operated/year')])
                    .properties(width =320, height =20)
                             .add_selection(selectcity).transform_filter(selectcity))

    
    
    # Side-by-Side Plots:
    return ((sentiment_plot_lang | sentiment_plot_year & count_yearmonths_plot)
            .configure_view(strokeWidth =0)
            .configure(background = '#f0f0f0')
            .configure_view(strokeWidth =0)
            .configure_axis(grid = True)
            .resolve_scale(color='independent'))

In [21]:
# Final Plot: 
chart_sentiments = plot_sentiments(summary_sentiments_lang, summary_sentiments_year, df_senti)
chart_sentiments

In [22]:
# Save sentiment features as csv file: 
df_senti.to_pickle('../Output/sentiment_feature.pkl')

### Observations: 
1. Air BnB business operated between 2009 to 2021.
2. 2019 accounted for maximum reviews (>22% just before pandemic) compared to other years.
3. 2020 resulted in sudden decline in guest bookings due to pandemic.  
4. Negative reviews account for >2% among English and Foreign speakers. 
5. No data beyond April 2021. 
6. It's likely that positive sentiments have fueled business growth, but we cannot rule out the possibility of sample bias in the dataset. 

### Other learnings: 
Below function gave us run time error, when we tried to translate other languages into English. However, this should work for smaller datasets. 

In [23]:
# Translate non-english sentences

def translate(df, text):
    " Resulted in 429 (too many requests error)"
    ind = df[text][(df['lang_type']!='english') & (df['lang_type']!='unknown')].index
    df[text] = df[text].loc[ind].progress_apply(lambda x: translator.translate(x))
    return df