In [9]:
library(tidytext)
library(stringr)
library(tidyverse)
library(tibble)
library(ggplot2)
library(lubridate)
library(SnowballC)
library(RSQLite)

In [2]:
# Show the first five lines of the sorted CSV using the shell's `head`;
# intern = TRUE returns the lines as a character vector.
system(
  command = "head -n 5 ../../data/eth_1aug15_1aug18/0_sorted.csv",
  intern = TRUE
)

In [3]:
# Show the last five lines of the sorted CSV using the shell's `tail`;
# intern = TRUE returns the lines as a character vector.
system(
  command = "tail -n 5 ../../data/eth_1aug15_1aug18/0_sorted.csv",
  intern = TRUE
)

In [11]:
# Open a connection to the SQLite database file.
# NOTE(review): `con` is not referenced by the cells visible here; confirm it
# is used (and closed with dbDisconnect) further down.
con <- dbConnect(drv = SQLite(), dbname = "../../data/twitter.db")

In [None]:
# Load the first 1,484,777 data rows of the sorted tweet CSV.
# - `nrows` is spelled out in full; the previous `nrow =` only worked through
#   partial argument matching.
# - stringsAsFactors = FALSE keeps text columns as character vectors on
#   R < 4.0 as well, which the string processing downstream expects.
tweets_df <- read.csv(
  "../../data/eth_1aug15_1aug18/0_sorted.csv",
  nrows = 1484777,
  stringsAsFactors = FALSE
)
head(tweets_df)

In [None]:
# HTML entities that are stripped from the tweet text before tokenising.
remove_reg <- "&amp;|&lt;|&gt;"
# NOTE(review): unnest_reg is not referenced by the pipeline below; it is kept
# in case other cells rely on it.
unnest_reg  <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"

# Build a one-token-per-row table from the tweets:
#   1. drop rows whose text starts with "RT",
#   2. strip the HTML entities matched by remove_reg,
#   3. tokenise the text into the `word` column,
#   4. drop stop words (with and without apostrophes) and tokens that contain
#      no lowercase letter,
#   5. stem the remaining tokens.
# Stop words are removed BEFORE stemming: the stemmer rewrites some stop words
# (e.g. "because" -> "becaus", "very" -> "veri"), and those stems would no
# longer match the stop-word list if the filter ran afterwards.
# NOTE(review): token = "tweets" is deprecated in recent tidytext releases;
# confirm the installed version still supports it.
tidy_tweets <- tweets_df %>%
  filter(!str_detect(Tweet.Text, "^RT")) %>%
  mutate(text = str_remove_all(Tweet.Text, remove_reg)) %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(
    !word %in% stop_words$word,
    !word %in% str_remove_all(stop_words$word, "'"),
    str_detect(word, "[a-z]")
  ) %>%
  mutate(word = wordStem(word, language = "english"))

In [None]:
# Parse the DateTime column into Date values using a year-month-day format,
# then display the structure of the resulting table.
tidy_tweets$DateTime <- as.Date(tidy_tweets$DateTime, format = "%Y-%m-%d")
str(tidy_tweets)

In [None]:
# Summarise the parsed DateTime column.
summary(tidy_tweets[["DateTime"]])

In [None]:
# Daily volume of the tokenised data, drawn as a line with points.
# tidy_tweets holds one row per word token (it is the output of
# unnest_tokens), so the per-day row count is a token count, not a tweet
# count; the axis label says so.
tidy_tweets %>%
  count(DateTime) %>%
  ggplot(aes(x = DateTime, y = n)) +
  geom_line() +
  geom_point() +
  ylab("No of word tokens")


Calculate the frequency of each word within each week

In [None]:
# Per-week word frequencies.
# For every (week, word) pair: n = occurrences of the word in that week,
# total = all tokens in that week, freq = n / total.
# The weekly total is derived from the counts in the same pass, replacing a
# second aggregation of tidy_tweets and a join with an implicit key.
# Rows are sorted by n (descending) and the result stays grouped by week.
# NOTE(review): week() returns the week of the year, so the same week number
# from different years is pooled together; confirm that is intended.
tidy_tweets_freq <- tidy_tweets %>%
  mutate(week = week(DateTime)) %>%
  count(week, word, sort = TRUE) %>%
  group_by(week) %>%
  mutate(total = sum(n), freq = n / total) %>%
  select(week, word, freq, n, total)

In [None]:
# Keep the five most frequent words of each week (ties on freq are kept by
# top_n), then show the result ordered by week.
# Plain reassignment is used instead of `%<>%`: that operator belongs to
# magrittr, which none of the library() calls in this notebook attach.
tidy_tweets_freq <- tidy_tweets_freq %>%
  arrange(desc(n)) %>%
  group_by(week) %>%
  top_n(n = 5, wt = freq)
tidy_tweets_freq %>% arrange(week)

Let's convert weeks to columns (dimensions), keeping one row per word

In [None]:
# Reshape to wide form: one column per week holding that week's freq values,
# with 0 filled in where a word has no entry for a week. The n and total
# columns are dropped first.
tidy_tweets_plot <- spread(
  select(tidy_tweets_freq, -n, -total),
  key = week,
  value = freq,
  fill = 0
)

In [None]:
# Write the wide per-week frequency table to an RDS file, then print it.
saveRDS(object = tidy_tweets_plot, file = "tidy_tweets_per_week_eth.rds")
tidy_tweets_plot

In [None]:
# Scatter the word frequencies of week 29 against week 30, labelling each
# point with its word.
# The week columns have non-syntactic names, so they must be referenced with
# backticks; quoted strings ('29', '30') would be mapped as constants and
# every point would land on the same position.
# NOTE(review): assumes columns `29` and `30` exist in tidy_tweets_plot.
# NOTE(review): top_n() without `wt` ranks by the last column of the table.
tidy_tweets_plot %>%
  top_n(n = 1000) %>%
  ggplot(aes(x = `29`, y = `30`)) +
  geom_jitter() +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 0)