In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

Mounted at /content/drive


In [None]:
# installing textacy 
!pip install textacy

Collecting textacy
  Downloading textacy-0.11.0-py3-none-any.whl (200 kB)
[K     |████████████████████████████████| 200 kB 6.6 MB/s 
Collecting pyphen>=0.10.0
  Downloading pyphen-0.12.0-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 34.8 MB/s 
Collecting spacy>=3.0.0
  Downloading spacy-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 20.2 MB/s 
Collecting jellyfish>=0.8.0
  Downloading jellyfish-0.9.0.tar.gz (132 kB)
[K     |████████████████████████████████| 132 kB 35.8 MB/s 
Collecting cytoolz>=0.10.1
  Downloading cytoolz-0.11.2.tar.gz (481 kB)
[K     |████████████████████████████████| 481 kB 53.1 MB/s 
Collecting spacy-legacy<3.1.0,>=3.0.9
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting thinc<8.1.0,>=8.0.14
  Downloading thinc-8.0.15-cp37-cp37m-manylinux_2_17_x86_

In [None]:
# importing necessary libraries

from sklearn.decomposition import NMF
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List
import numpy as np
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from textacy.preprocessing.replace import urls, hashtags, numbers, emails, emojis, currency_symbols
from typing import List
import itertools
from textacy.preprocessing.resources import RE_URL
from textacy.preprocessing.resources import RE_SHORT_URL


In [None]:
## Function to do TF-IDF Vectorization
def vectorizer(df:pd.DataFrame,min_df=0.01,max_df=0.7,ng=(1,1)):
  vectorizer = TfidfVectorizer(ngram_range=ng,\
                               min_df=min_df, max_df=max_df, stop_words="english",binary=True)
  df_1, df_terms = vectorizer.fit_transform(df.description), vectorizer.get_feature_names()
  df_tf_idf = pd.DataFrame(df_1.toarray(), columns=df_terms)
  return df_tf_idf,df_terms

In [None]:
## Function to get top tokens after topic modelling
## Used from yuchenny repository

def get_top_tf_idf_tokens_for_topic(H: np.array, feature_names: List[str], num_top_tokens: int = 5):
  """
  Uses the H matrix (K components x M original features) to identify for each
  topic the most frequent tokens.
  """
  for topic, vector in enumerate(H):
    print(f"TOPIC {topic}\n")
    total = vector.sum()
    top_scores = vector.argsort()[::-1][:num_top_tokens]
    token_names = list(map(lambda idx: feature_names[idx], top_scores))
    strengths = list(map(lambda idx: vector[idx] / total, top_scores))
    
    for strength, token_name in zip(strengths, token_names):
      print(f"\b{token_name} ({round(strength * 100, 1)}%)\n")
    print(f"=" * 50)

In [None]:
# Reading the subsetted LA Airbnb dataset, which was done in other ipynb (Data Separation.ipynb)

path = '/content/drive/MyDrive/NLP Project/ABB final/Airbnb_la_text_nlp.csv'
df_la_text = pd.read_csv(path)
df_la_text.head()

Unnamed: 0.1,Unnamed: 0,id,name,description,host_id,host_about,calendar_last_scraped
0,0,986942.0,Attic in Echo Park Hillside House,"Hi Travelers! We have a sweet, spacious attic ...",959023,Hi I'm Jonathan. I'm a parent & an artist. I'v...,2015-09-02
1,1,3249753.0,Art BNB @ Nomad Printmaking Studio,"Our Punk Shui industrial art space has cool, c...",12772297,My name is Damon Robinson. I am an LA based ar...,2015-09-02
2,2,3250095.0,Art BNB @ nomaD Printmaking Studio,"Our Punk Shui industrial art space has cool, c...",12772297,My name is Damon Robinson. I am an LA based ar...,2015-09-02
3,3,3250595.0,Art BNB @ nomAd Printmaking Studio,"Our Punk Shui industrial art space has cool, c...",12772297,My name is Damon Robinson. I am an LA based ar...,2015-09-02
4,4,1941493.0,1915 L.A. River Retreat - Two Bed,Enjoy city & mountain views from the upper uni...,10042199,"I love history and rivers. My building, which ...",2015-09-02


In [None]:
# Fetching the month and year only as the data was scraped once per month
df_la_text['my_scraped']= ([x.strftime('%Y-%m') for x in pd.to_datetime(df_la_text.calendar_last_scraped)])

In [None]:
# Labelling 1 and 0 as during covid and before covid respectively 
## (anything after Jan 2020, as after this date the regulations were being applied on Airbnbs) 
df_la_text['label'] = ([1 if x>'2020-01' else 0 for x in df_la_text['my_scraped']])

In [None]:
df_la_text.shape

(775528, 9)

In [None]:
(df_la_text['description'].isna()).sum()

9875

In [None]:
# Removing the empty descriptions

df_la_text_refined = df_la_text[df_la_text['description'].notna()]
df_la_text_refined.label.value_counts()

0    659057
1    106596
Name: label, dtype: int64

In [None]:
df_la_text_refined.shape

(765653, 9)

In [None]:
df_la_text_refined.nunique()

Unnamed: 0               765653
id                        69222
name                      99384
description              128217
host_id                   34308
host_about                32003
calendar_last_scraped        82
my_scraped                   38
label                         2
dtype: int64

In [None]:
# Separating out the pre covid and covid data 

pre_covid=df_la_text_refined[df_la_text_refined.label ==0]
covid = df_la_text_refined[df_la_text_refined.label ==1]

In [None]:
print(f'Pre covid shape: {pre_covid.shape}')
print(f'After/during covid shape: {covid.shape}')

Pre covid shape: (659057, 9)
After/during covid shape: (106596, 9)


In [None]:

# grouping together the relevant terms as key and values, to be used later file pre processing the data 

# Used https://github.com/L-Lewis/Airbnb-neural-network-price-prediction/blob/master/Airbnb-price-prediction.ipynb for amenity collection

reg_clean = {'good':["great","awesome","nice","best","perfect","good","amazing","spectacular","marvelous"],
 'check_in_24h': ['24-hour check-in'],
 'air_conditioning': ["Air conditioning", "Central air conditioning"],
 'high_end_electronics': ['Amazon Echo','Apple TV','Game console','Netflix','Projector and screen','Smart TV','HULU','hulu','netflix','prime video','hdtv','TV','tv'],
  'nature_and_views': ['Beach view','Beachfront','Lake access','Mountain view','Ski-in/Ski-out','Waterfront'],
  'amenities':['Elevator','Gym','gym','Exercise equipment','Bed linens','Coffee maker','Espresso machine','Dishwasher','Dryer','Washer'],
  'luxury':['Hot tub','Jetted tub','hot tub','Sauna','Pool','pool','Balcony','Patio','BBQ grill','Fire pit','Propane barbeque'],
  'child_friendly':['Family/kid friendly','Children','children'],
  'outdoor_space':['Garden','Outdoor','Sun loungers','Terrace'],
  'internet':['Internet','Pocket wifi','Wifi', 'wi fi', 'wi-fi','Wi fi', 'WI FI','WIFI', 'wi  fi'],
  'long_term_stays':['Long term stays allowed'],
  'pets_allowed':['Pets','pet','Cat(s)','Dog(s)'],
  'private_entrance': ['Private entrance'],
  'secure':['Safe','Security system'],
  'self_check_in':['Self check-in'],
  'smoking_allowed':['Smoking allowed'],
  'event_suitable':['Suitable for events'],
  'la': ['LA','los angeles','Los Angeles'],
  'covid':['COVID','COVID 19', 'COVID-19','Covid','covid-19','covid 19','cleaning','cleaned','safety','concerns','clean','disinfect','protection'],
  'location':['Santa Monica','santa monica','beverly hills','Beverly hills','Beverly', 'beverly-hills','marina del ray','universal studio','universal studios','Universal Studio','Universal Studios','beverly  hills','Marina del ray','Marina Del Ray'],
  }




**Working with listing data during covid phase** 

In [None]:
df1= covid.copy()

In [None]:
for k in reg_clean:
  df1['description'] = df1['description'].str.replace(r'\b(' + r'|'.join(reg_clean[k]) + r')\b\s*', str(k+' '),regex = True)
  # print(k)

In [None]:
# Applying textacy

df1["description"] = df1.description.\
  apply(urls).\
  apply(hashtags).\
  apply(numbers).\
  apply(currency_symbols).\
  apply(emojis).\
  apply(emails)
# df.Review_Text

In [None]:
df1.nunique()

Unnamed: 0               106596
id                        22171
name                      24467
description               25550
host_id                   12323
host_about                 9104
calendar_last_scraped        17
my_scraped                    6
label                         1
dtype: int64

In [None]:
covid_tf_idf, feature_names = vectorizer(df1)
covid_tf_idf.shape



(106596, 1053)

In [None]:
# Doing a similarity check

## Did this before as well, and whatever the findings were (for covid related terms, we put them in the regex_clean function)

similarity_matrix = pd.DataFrame(cosine_similarity(covid_tf_idf.T.values), 
             columns=feature_names, index=feature_names)

# unstack matrix into table
similarity_table = similarity_matrix.rename_axis(None).rename_axis(None, axis=1).stack().reset_index()

# rename columns
similarity_table.columns = ["word1", "word2", "similarity"]
similarity_table.shape



(1108809, 3)

In [None]:
similarity_table = similarity_table[similarity_table["similarity"] < 0.99]
similarity_table.shape

(1107756, 3)

In [None]:
similarity_table.sort_values(by="similarity", ascending=False).drop_duplicates(
    subset="similarity", keep="first").head(10)

Unnamed: 0,word1,word2,similarity
820508,sac,cul,0.952604
373341,foam,memory,0.937455
247166,del,rey,0.934803
883488,solo,adventurers,0.922108
362212,fi,wi,0.915662
1064837,walking,distance,0.909724
691467,pans,pots,0.908186
920127,steel,stainless,0.907602
665250,oaks,sherman,0.874134
597815,marina,rey,0.861106


In [None]:
# Now the terms like 'cleaning','cleaned','safety','concerns','clean','disinfect','protection' were replaced with covid, they are not here, 
## rest of the similar terms are different in context hence not replacing them

## we did the above thing so as to group together more relevant terms to covid. 
## This was done beacause the number of rows was not that much as compared to the original dataset

temp = similarity_table[similarity_table['word1'].str.contains('covid|pandemic|Covid|COVID|COVID 19')]
temp.sort_values(by="similarity", ascending=False).drop_duplicates(
    subset="similarity", keep="first").head(5)

Unnamed: 0,word1,word2,similarity
227942,covid,kitchen,0.295244
227840,covid,good,0.290599
228221,covid,room,0.290113
227857,covid,guests,0.276237
228156,covid,private,0.276163


In [None]:
## Topic Modelling on the most recent descriptions on the basis of listing id in the covid era

recent_desc= df1[df1.groupby(['id'])['calendar_last_scraped'].transform(max) == df1['calendar_last_scraped']]

covid_recent_tf_idf, feature_recent_covid = vectorizer(recent_desc,ng=(2,2))



In [None]:


nmf = NMF(n_components=15)
W = nmf.fit_transform(covid_recent_tf_idf)
H = nmf.components_
print(f"Original shape of X sports is {covid_recent_tf_idf.shape}")
print(f"Decomposed W sports matrix is {W.shape}")
print(f"Decomposed H sports matrix is {H.shape}")



Original shape of X sports is (22171, 518)
Decomposed W sports matrix is (22171, 15)
Decomposed H sports matrix is (15, 518)


In [None]:
print(f"Airbnb during the covid era description topics:\n\n")
get_top_tf_idf_tokens_for_topic(H, covid_recent_tf_idf.columns.tolist(), num_top_tokens=12)

Airbnb during the covid era description topics:


TOPIC 0

walking distance (12.3%)

coffee shops (4.0%)

restaurants bars (3.7%)

shops restaurants (3.2%)

good restaurants (2.4%)

grocery stores (2.3%)

los feliz (1.7%)

distance restaurants (1.6%)

restaurants coffee (1.6%)

bars restaurants (1.5%)

griffith park (1.4%)

distance good (1.4%)

TOPIC 1

solo adventurers (9.9%)

business travelers (9.2%)

place good (9.1%)

adventurers business (8.9%)

couples solo (8.6%)

good couples (8.5%)

ll love (7.3%)

love place (6.9%)

place close (6.6%)

travelers families (3.8%)

families kids (3.5%)

high ceilings (0.8%)

TOPIC 2

_number_ bedroom (13.7%)

bedroom _number_ (11.9%)

_number_ bath (8.0%)

_number_ bathroom (6.6%)

bed _number_ (2.4%)

_number_ bed (2.2%)

bathroom apartment (2.0%)

bedroom apartment (1.9%)

spacious _number_ (1.8%)

bath apartment (1.4%)

_number_ _number_ (1.3%)

beautiful _number_ (1.1%)

TOPIC 3

hidden airbnb (20.3%)



**Subsetting only on covid related descriptions**

In [None]:
temp = covid[covid['description'].str.contains('covid|pandemic|covid 19|Covid|COVID|COVID 19|safety|ensure|cleaned|cleaning|paramount|concerns|comfort|hulu|netflix')]


In [None]:
temp.nunique()

Unnamed: 0               30509
id                        6683
name                      7439
description               7793
host_id                   4041
host_about                3247
calendar_last_scraped       17
my_scraped                   6
label                        1
dtype: int64

In [None]:
temp.shape

(30509, 9)

In [None]:
for k in reg_clean:
  temp['description'] = temp['description'].str.replace(r'\b(' + r'|'.join(reg_clean[k]) + r')\b\s*', str(k+' '),regex = True)
  # print(k)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Most recent description of respective listing

recent_desc_only_covid= temp[temp.groupby(['id'])['calendar_last_scraped'].transform(max) == temp['calendar_last_scraped']]

In [None]:
recent_desc_only_covid.shape

(6683, 9)

In [None]:
recent_desc_only_covid["description"] = recent_desc_only_covid.description.\
  apply(urls).\
  apply(hashtags).\
  apply(numbers).\
  apply(currency_symbols).\
  apply(emojis).\
  apply(emails)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
covid_only_tf_idf, feature_only_covid = vectorizer(recent_desc_only_covid,ng=(1,1))



In [85]:
# Tried various n grams from here on

nmf = NMF(n_components=10)
W = nmf.fit_transform(covid_only_tf_idf)
H = nmf.components_
print(f"Original shape of X sports is {covid_only_tf_idf.shape}")
print(f"Decomposed W sports matrix is {W.shape}")
print(f"Decomposed H sports matrix is {H.shape}")

print(f"Airbnb during the covid related description topics:\n\n")
get_top_tf_idf_tokens_for_topic(H, covid_only_tf_idf.columns.tolist(), 10)



Original shape of X sports is (6683, 1195)
Decomposed W sports matrix is (6683, 10)
Decomposed H sports matrix is (10, 1195)
Airbnb during the covid related description topics:


TOPIC 0

walking (1.0%)

distance (1.0%)

hollywood (1.0%)

close (1.0%)

restaurants (1.0%)

downtown (0.9%)

la (0.9%)

location (0.9%)

away (0.8%)

neighborhood (0.8%)

TOPIC 1

high_end_electronics (1.2%)

queen (1.2%)

coffee (1.1%)

bed (1.1%)

microwave (1.0%)

size (0.9%)

dryer (0.9%)

towels (0.9%)

internet (0.9%)

equipped (0.9%)

TOPIC 2

charged (1.8%)

tcl (1.8%)

lost (1.8%)

dolby (1.8%)

checked (1.8%)

laws (1.8%)

load (1.8%)

sqft (1.8%)

allows (1.7%)

1st (1.7%)

TOPIC 3

house (1.0%)

home (0.9%)

private (0.7%)

dining (0.7%)

living (0.7%)

large (0.7%)

outdoor (0.7%)

room (0.7%)

beautiful (0.7%)

backyard (0.6%)

TOPIC 4

review (1.8%)

inquiry (1.8%)

understanding (1.8%)

rules (1.8%)

courtesy (1.8%)

camera (1.8%)

interested (1.

In [None]:
covid_only_tf_idf_2, feature_only_covid_2 = vectorizer(recent_desc_only_covid,ng=(2,2))



In [86]:
nmf = NMF(n_components=10)
W = nmf.fit_transform(covid_only_tf_idf_2)
H = nmf.components_
print(f"Original shape of X sports is {covid_only_tf_idf_2.shape}")
print(f"Decomposed W sports matrix is {W.shape}")
print(f"Decomposed H sports matrix is {H.shape}")


print(f"Airbnb description topics for only covid related descriptions:\n\n")
get_top_tf_idf_tokens_for_topic(H, covid_only_tf_idf_2.columns.tolist(), 10)



Original shape of X sports is (6683, 683)
Decomposed W sports matrix is (6683, 10)
Decomposed H sports matrix is (10, 683)
Airbnb description topics for only covid related descriptions:


TOPIC 0

size bed (2.6%)

living room (2.6%)

queen size (2.6%)

comfortable queen (1.3%)

fully equipped (1.2%)

dining room (1.2%)

private room (1.1%)

queen bed (1.0%)

equipped kitchen (1.0%)

kitchen living (1.0%)

TOPIC 1

walking distance (2.1%)

_number_ minutes (1.8%)

location hills (1.6%)

_number_ minute (1.3%)

_number_ _number_ (1.3%)

venice beach (1.2%)

centrally located (1.2%)

street parking (1.0%)

minute walk (0.9%)

minutes away (0.9%)

TOPIC 2

chinese theater (3.7%)

additional guest (3.7%)

beautifully renovated (3.7%)

professionally managed (3.7%)

_cur__number_ fee (3.7%)

guests staying (3.7%)

check _number_ (3.7%)

_cur__number_ _cur__number_ (3.7%)

theater _number_ (3.7%)

min hollywood (3.6%)

TOPIC 3

walk fame (3.6%)

hollywood walk 

In [None]:
# Tried running the follwoing, due top memory limitation it didn't run
# for i in range(2, 5):

#     vectorizer = TfidfVectorizer(ngram_range=(i,i))

#     X = vectorizer.fit_transform(recent_desc_only_covid.description)
#     terms = vectorizer.get_feature_names()
#     tf_idf = pd.DataFrame(X.toarray().transpose(), index=terms)

#     tf_idf = tf_idf.sum(axis=1)
#     score = pd.DataFrame(tf_idf, columns=["score"])
#     score.sort_values(by="score", ascending=False, inplace=True)
#     print("These are the 5 most common n-grams of size %d"%i)
#     print("{}\n".format(score.head(5)))

In [None]:
covid_only_tf_idf_5, feature_only_covid_5 = vectorizer(recent_desc_only_covid,ng=(1,5))



In [87]:
nmf = NMF(n_components=10)
W = nmf.fit_transform(covid_only_tf_idf_5)
H = nmf.components_
print(f"Original shape of X sports is {covid_only_tf_idf_5.shape}")
print(f"Decomposed W sports matrix is {W.shape}")
print(f"Decomposed H sports matrix is {H.shape}")

print(f"Airbnb description topics for only covid related descriptions::\n\n")
get_top_tf_idf_tokens_for_topic(H, covid_only_tf_idf_5.columns.tolist(), 10)



Original shape of X sports is (6683, 2029)
Decomposed W sports matrix is (6683, 10)
Decomposed H sports matrix is (10, 2029)
Airbnb description topics for only covid related descriptions::


TOPIC 0

queen (1.0%)

bed (0.9%)

size (0.9%)

queen size (0.7%)

size bed (0.7%)

bathroom (0.6%)

high_end_electronics (0.6%)

microwave (0.6%)

queen size bed (0.6%)

towels (0.6%)

TOPIC 1

walking (0.7%)

distance (0.7%)

walking distance (0.6%)

restaurants (0.6%)

location (0.6%)

la (0.6%)

downtown (0.6%)

close (0.6%)

hollywood (0.6%)

minutes (0.5%)

TOPIC 2

check _number_ _number_ (1.2%)

_cur__number_ fee (1.2%)

tcl (1.2%)

professionally managed (1.2%)

charged (1.2%)

guests staying (1.2%)

beautifully renovated (1.2%)

dolby theater (1.2%)

additional guest (1.2%)

lost (1.2%)

TOPIC 3

review house (1.2%)

review house rules (1.2%)

house rules booking (1.2%)

rules booking (1.2%)

mins drive (1.2%)

_number_ mins drive (1.2%)

essentials pr



**Topic Modelling on Pre covid data**

In [None]:
pre_covid.nunique()

Unnamed: 0               659057
id                        66188
name                      93080
description              119423
host_id                   33445
host_about                30617
calendar_last_scraped        65
my_scraped                   32
label                         1
dtype: int64

In [None]:
pre_covid.shape

(659057, 9)

In [None]:
# Most recent description of respective listing

recent_desc_pre_covid= pre_covid[pre_covid.groupby(['id'])['calendar_last_scraped'].transform(max) == pre_covid['calendar_last_scraped']]

In [None]:
recent_desc_pre_covid.shape

(66188, 9)

In [None]:
for k in reg_clean:
  recent_desc_pre_covid['description'] = recent_desc_pre_covid['description'].str.replace(r'\b(' + r'|'.join(reg_clean[k]) + r')\b\s*', str(k+' '),regex = True)
  # print(k)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
recent_desc_pre_covid["description"] = recent_desc_pre_covid.description.\
  apply(urls).\
  apply(hashtags).\
  apply(numbers).\
  apply(currency_symbols).\
  apply(emojis).\
  apply(emails)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
pre_covid_tf_idf, feature_pre_covid = vectorizer(recent_desc_pre_covid,ng=(2,2))



In [88]:
# Gonna try multiple n grams

nmf = NMF(n_components=10)
W = nmf.fit_transform(pre_covid_tf_idf)
H = nmf.components_
print(f"Original shape of X sports is {pre_covid_tf_idf.shape}")
print(f"Decomposed W sports matrix is {W.shape}")
print(f"Decomposed H sports matrix is {H.shape}")


print(f"Airbnb before the covid era description topics:\n\n")
get_top_tf_idf_tokens_for_topic(H, pre_covid_tf_idf.columns.tolist(), 10)



Original shape of X sports is (66188, 505)
Decomposed W sports matrix is (66188, 10)
Decomposed H sports matrix is (10, 505)
Airbnb before the covid era description topics:


TOPIC 0

walking distance (10.0%)

restaurants bars (3.1%)

coffee shops (2.7%)

shops restaurants (2.2%)

good restaurants (2.0%)

grocery stores (1.6%)

los feliz (1.6%)

bars restaurants (1.5%)

echo park (1.5%)

west hollywood (1.4%)

TOPIC 1

solo adventurers (9.4%)

place good (9.3%)

business travelers (8.9%)

good couples (8.6%)

adventurers business (8.6%)

couples solo (8.4%)

ll love (7.9%)

love place (7.7%)

place close (7.6%)

travelers families (4.0%)

TOPIC 2

living room (9.5%)

room kitchen (3.6%)

kitchen living (3.5%)

dining room (3.5%)

private room (3.1%)

private bathroom (2.5%)

room dining (2.3%)

private bedroom (2.3%)

bedroom bathroom (1.7%)

access kitchen (1.6%)

TOPIC 3

walk fame (9.1%)

hollywood walk (6.9%)

heart hollywood (5.4%)

hollywood sign

In [None]:
pre_covid_tf_idf_5, feature_pre_covid_5 = vectorizer(recent_desc_pre_covid,ng=(1,5))



In [89]:
nmf = NMF(n_components=10)
W = nmf.fit_transform(pre_covid_tf_idf_5)
H = nmf.components_
print(f"Original shape of X sports is {pre_covid_tf_idf_5.shape}")
print(f"Decomposed W sports matrix is {W.shape}")
print(f"Decomposed H sports matrix is {H.shape}")


print(f"Airbnb before the covid era description topics:\n\n")
get_top_tf_idf_tokens_for_topic(H, pre_covid_tf_idf_5.columns.tolist(), 10)



Original shape of X sports is (66188, 1588)
Decomposed W sports matrix is (66188, 10)
Decomposed H sports matrix is (10, 1588)
Airbnb before the covid era description topics:


TOPIC 0

home (0.8%)

open (0.7%)

bedrooms (0.7%)

dining (0.7%)

large (0.7%)

house (0.6%)

living (0.6%)

outdoor (0.6%)

views (0.6%)

beautiful (0.6%)

TOPIC 1

solo adventurers (2.7%)

adventurers (2.7%)

solo (2.6%)

place good (2.6%)

business travelers (2.5%)

adventurers business (2.5%)

solo adventurers business (2.5%)

adventurers business travelers (2.5%)

good couples (2.5%)

solo adventurers business travelers (2.5%)

TOPIC 2

queen (1.2%)

high_end_electronics (1.1%)

bed (1.1%)

size (1.1%)

size bed (0.8%)

queen size (0.8%)

internet (0.8%)

microwave (0.7%)

fully (0.7%)

dryer (0.7%)

TOPIC 3

walking distance (2.3%)

distance (2.3%)

walking (2.3%)

restaurants (1.8%)

shops (1.4%)

bars (1.4%)

restaurants bars (0.8%)

park (0.8%)

stores (0.8%)

gr