Permalink
Find file Copy path
89 lines (59 sloc) 1.84 KB
title weight draft
Similarity between Twitter users
10
false
# library() stops with an error if a package is missing; require() would only
# warn and return FALSE, deferring the failure to the first function call.
library(quanteda)
library(readtext)

Import tweets from a JSON (.json) file. twitter.json is located in the data directory of this tutorial package.

# Load the tweets with readtext(), flagging the JSON as Twitter-sourced.
twitter_data <- readtext(
  "content/data/twitter.json",
  source = "twitter"
)

Construct a corpus of Tweets.

# Build a corpus object from the imported tweets.
tweet_corp <- corpus(x = twitter_data)

Construct a document-feature matrix, removing hashtags, user mentions and links.

# Tokenise explicitly before building the dfm: handing a corpus (together with
# `remove` and tokenisation options) straight to dfm() is deprecated in recent
# quanteda releases.
tweet_dfm <- tweet_corp %>%
  tokens(remove_punct = TRUE, remove_url = TRUE) %>%
  dfm() %>%
  # Drop link fragments, the retweet marker "rt", hashtags and user mentions.
  dfm_remove(c("*.tt", "*.uk", "*.com", "rt", "#*", "@*")) %>%
  # Drop English stopwords.
  dfm_remove(stopwords("en"))

# Number of documents (tweets) in the matrix.
ndoc(tweet_dfm)
## [1] 7504
# The ten most frequent features (n = 10 is the default).
topfeatures(tweet_dfm, n = 10)
##          vote conservatives        labour         today         share 
##          1817           929           676           666           647 
##       britain          find        fairer        voting      tomorrow 
##           625           613           571           559           548

Group documents by username, so that each document combines all tweets posted by one user.

# Collapse the tweet-level matrix to one document per screen name.
user_dfm <- dfm_group(
  tweet_dfm,
  groups = docvars(tweet_dfm, field = "screen_name")
)
# Number of distinct users after grouping.
ndoc(user_dfm)
## [1] 5061

Remove rare features (occurring fewer than 10 times) and short features (a single character), then convert counts to proportions using dfm_weight().

prop_user_dfm <- user_dfm %>%
  # Keep only features that are at least two characters long.
  dfm_select(min_nchar = 2) %>%
  # Keep only features that occur at least 10 times overall.
  dfm_trim(min_termfreq = 10) %>%
  # Turn each user's counts into within-document proportions.
  dfm_weight(scheme = "prop")

Calculate user-user (dis)similarity as pairwise distances using textstat_dist(), then cluster the users hierarchically.

# Pairwise distances between the users' feature-proportion profiles.
# NOTE(review): in quanteda >= 3.0 textstat_dist() is provided by the
# quanteda.textstats package - confirm which version this tutorial targets.
user_dist <- textstat_dist(prop_user_dfm)
# hclust() needs a base "dist" object. as.dist() converts the textstat_dist()
# result when it is not one already, and leaves a real "dist" unchanged.
user_clust <- hclust(as.dist(user_dist))
# Draw the dendrogram without leaf labels (there are thousands of users).
plot(user_clust, labels = FALSE)