-
Notifications
You must be signed in to change notification settings - Fork 0
/
bag_of_words_analysis.R
77 lines (66 loc) · 2.92 KB
/
bag_of_words_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# The processed Airbnb text data can now be used to extract insights,
# for example by finding the dominant words per aggregation category.
# Here the aggregation category is the neighbourhood in Antwerp.
# 1. Top 10 review tokens for each of the top 5 neighbourhoods ----
# Rank neighbourhoods by their number of distinct listings and keep the
# five with the most listings (top_n() keeps ties on the cut-off count).
top_5_neighborhood <- all_data_df %>%
  select(listing_id, neighbourhood_cleansed) %>%
  # One row per (listing, neighbourhood) pair so listings are counted once.
  distinct() %>%
  group_by(neighbourhood_cleansed) %>%
  summarise(total = n()) %>%
  arrange(desc(total)) %>%
  # Name the ranking column explicitly instead of relying on top_n()'s
  # "last column" default, which also emits a "Selecting by total" message.
  top_n(5, total)
# Unique (listing_id, neighbourhood) pairs restricted to the top
# neighbourhoods; joined onto the token tables further down.
top_5_neighborhood_listing <- all_data_df %>%
  filter(
    neighbourhood_cleansed %in% top_5_neighborhood$neighbourhood_cleansed
  ) %>%
  select(listing_id, neighbourhood_cleansed) %>%
  unique()
# Total count of every review token per top neighbourhood, sorted by
# descending count.
# NOTE(review): the join uses all shared columns (presumably listing_id -
# confirm). right_join() keeps listings without any token, which yields
# rows whose word and total are NA.
neighbourhood_tokens_comments <- right_join(
  tokens_all_comments_tf_idf,
  top_5_neighborhood_listing
) %>%
  group_by(neighbourhood_cleansed, word) %>%
  summarise(total = sum(n)) %>%
  arrange(desc(total))
# Print the 10 highest-count review tokens for each top neighbourhood.
# seq_len() skips the loop when top_5_neighborhood is empty, whereas
# 1:nrow() would iterate over c(1, 0) in that case.
for (neighb in seq_len(nrow(top_5_neighborhood))) {
  neighb_name <- top_5_neighborhood$neighbourhood_cleansed[neighb]
  print(paste0("For neighbourhood: ", neighb_name))
  toprint <- neighbourhood_tokens_comments %>%
    ungroup() %>%
    filter(neighbourhood_cleansed == neighb_name) %>%
    # top_n() keeps ties, so more than 10 rows can be returned.
    top_n(10, total) %>%
    select(-total) %>%
    # The table is already sorted by descending total, so the row
    # position doubles as the rank.
    mutate(rank = row_number())
  print(toprint)
}
# 2. Top 10 description tokens for each of the top 5 neighbourhoods ----
# Total count of every description token per top neighbourhood, sorted by
# descending count.
# NOTE(review): the join uses all shared columns (presumably listing_id -
# confirm). right_join() keeps listings without any token, which yields
# rows whose word and total are NA.
neighbourhood_tokens_description <- right_join(
  tokens_all_description_tf_idf,
  top_5_neighborhood_listing
) %>%
  group_by(neighbourhood_cleansed, word) %>%
  summarise(total = sum(n)) %>%
  arrange(desc(total))
# Print the 10 highest-count description tokens for each top neighbourhood.
# seq_len() skips the loop when top_5_neighborhood is empty, whereas
# 1:nrow() would iterate over c(1, 0) in that case.
for (neighb in seq_len(nrow(top_5_neighborhood))) {
  neighb_name <- top_5_neighborhood$neighbourhood_cleansed[neighb]
  print(paste0("For neighbourhood: ", neighb_name))
  toprint <- neighbourhood_tokens_description %>%
    ungroup() %>%
    filter(neighbourhood_cleansed == neighb_name) %>%
    # top_n() keeps ties, so more than 10 rows can be returned.
    top_n(10, total) %>%
    select(-total) %>%
    # The table is already sorted by descending total, so the row
    # position doubles as the rank.
    mutate(rank = row_number())
  print(toprint)
}
# 3. Top 10 transport tokens for each of the top 5 neighbourhoods ----
# Total count of every transport (transit) token per top neighbourhood,
# sorted by descending count.
# NOTE(review): the join uses all shared columns (presumably listing_id -
# confirm). right_join() keeps listings without any token, which yields
# rows whose word and total are NA.
neighbourhood_tokens_transport <- right_join(
  tokens_transport_tf_idf,
  top_5_neighborhood_listing
) %>%
  group_by(neighbourhood_cleansed, word) %>%
  summarise(total = sum(n)) %>%
  arrange(desc(total))
# Print the 10 highest-count transport tokens for each top neighbourhood.
# seq_len() skips the loop when top_5_neighborhood is empty, whereas
# 1:nrow() would iterate over c(1, 0) in that case.
for (neighb in seq_len(nrow(top_5_neighborhood))) {
  neighb_name <- top_5_neighborhood$neighbourhood_cleansed[neighb]
  print(paste0("For neighbourhood: ", neighb_name))
  toprint <- neighbourhood_tokens_transport %>%
    ungroup() %>%
    filter(neighbourhood_cleansed == neighb_name) %>%
    # top_n() keeps ties, so more than 10 rows can be returned.
    top_n(10, total) %>%
    select(-total) %>%
    # The table is already sorted by descending total, so the row
    # position doubles as the rank.
    mutate(rank = row_number())
  print(toprint)
}