In [1]:
library(janeaustenr)
library(dplyr)
library(sparklyr)



Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
# Set memory allocation for whole local Spark instance
Sys.setenv("SPARK_MEM" = "8G")

# Set driver and executor memory allocations
config <- spark_config()
config$spark.driver.memory <- "4G"
config$spark.executor.memory <- "1G"
config$`sparklyr.shell.driver-memory` <- "4G"
config$`sparklyr.shell.executor-memory` <- "4G"
config$`spark.yarn.executor.memoryOverhead` <- "1g"

sc <- spark_connect(master = "local", config = config)

In [3]:
lines_tbl <- sdf_copy_to(sc,
                         austen_books()[c(1:30),],
                         name = "lines_tbl",
                         overwrite = TRUE)

In [4]:
lines_tbl

# Source:   table<lines_tbl> [?? x 2]
# Database: spark_connection
   text                  book               
   <chr>                 <chr>              
 1 SENSE AND SENSIBILITY Sense & Sensibility
 2 ""                    Sense & Sensibility
 3 by Jane Austen        Sense & Sensibility
 4 ""                    Sense & Sensibility
 5 (1811)                Sense & Sensibility
 6 ""                    Sense & Sensibility
 7 ""                    Sense & Sensibility
 8 ""                    Sense & Sensibility
 9 ""                    Sense & Sensibility
10 CHAPTER 1             Sense & Sensibility
# ... with more rows

In [5]:
# transform the data in a tidy form
lines_tbl_tidy <- lines_tbl %>%
  ft_tokenizer(input_col = "text",
               output_col = "word_list") %>%
  ft_stop_words_remover(input_col = "word_list",
                        output_col = "wo_stop_words") %>%
  mutate(text = explode(wo_stop_words)) %>%
  filter(text != "") %>%
  select(text, book)

In [12]:
head(lines_tbl_tidy)

# Source:   lazy query [?? x 2]
# Database: spark_connection
  text        book               
  <chr>       <chr>              
1 sense       Sense & Sensibility
2 sensibility Sense & Sensibility
3 jane        Sense & Sensibility
4 austen      Sense & Sensibility
5 (1811)      Sense & Sensibility
6 chapter     Sense & Sensibility

In [38]:
lda_model <- lines_tbl_tidy %>% ft_tokenizer(input_col = "text", output_col = "tokens") %>%
  ft_count_vectorizer("tokens", "features") %>%
  ml_lda(k = 8)

In [39]:
# vocabulary and topics
tidy(lda_model)

ERROR: Error: No tidy method for objects of class ml_lda_model


In [17]:
austen_books <- austen_books()
books_tbl <- sdf_copy_to(sc, austen_books, overwrite = TRUE)
first_tbl <- books_tbl %>% filter(nchar(text) > 0) %>% head(100) 

In [18]:
head(books_tbl)

# Source:   lazy query [?? x 2]
# Database: spark_connection
  text                  book               
  <chr>                 <chr>              
1 SENSE AND SENSIBILITY Sense & Sensibility
2 ""                    Sense & Sensibility
3 by Jane Austen        Sense & Sensibility
4 ""                    Sense & Sensibility
5 (1811)                Sense & Sensibility
6 ""                    Sense & Sensibility

In [19]:
features_tbl <- first_tbl %>% ft_tokenizer(input_col = "text", output_col = "tokens") %>%
  ft_count_vectorizer("tokens", "features") 

In [40]:
features_tbl

# Source:   table<sparklyr_tmp_c904a0019e1> [?? x 4]
# Database: spark_connection
   text                                       book          tokens    features 
   <chr>                                      <chr>         <list>    <list>   
 1 SENSE AND SENSIBILITY                      Sense & Sens… <list [3… <dbl [46…
 2 by Jane Austen                             Sense & Sens… <list [3… <dbl [46…
 3 (1811)                                     Sense & Sens… <list [1… <dbl [46…
 4 CHAPTER 1                                  Sense & Sens… <list [2… <dbl [46…
 5 The family of Dashwood had long been sett… Sense & Sens… <list [1… <dbl [46…
 6 was large, and their residence was at Nor… Sense & Sens… <list [1… <dbl [46…
 7 their property, where, for many generatio… Sense & Sens… <list [1… <dbl [46…
 8 respectable a manner as to engage the gen… Sense & Sens… <list [1… <dbl [46…
 9 surrounding acquaintance.  The late owner… Sense & Sens… <list [1… <dbl [46…
10 man, who lived to a very advanced a

In [35]:
features_tbl %>% select('features')

# Source:   lazy query [?? x 1]
# Database: spark_connection
   features   
   <list>     
 1 <dbl [463]>
 2 <dbl [463]>
 3 <dbl [463]>
 4 <dbl [463]>
 5 <dbl [463]>
 6 <dbl [463]>
 7 <dbl [463]>
 8 <dbl [463]>
 9 <dbl [463]>
10 <dbl [463]>
# ... with more rows

In [36]:
features_tbl %>% select('tokens')

# Source:   lazy query [?? x 1]
# Database: spark_connection
   tokens     
   <list>     
 1 <list [3]> 
 2 <list [3]> 
 3 <list [1]> 
 4 <list [2]> 
 5 <list [13]>
 6 <list [13]>
 7 <list [11]>
 8 <list [12]>
 9 <list [12]>
10 <list [15]>
# ... with more rows

In [29]:
model <-  features_tbl %>% ml_lda(features_col = "features", k = 4)

In [25]:
ml_describe_topics(model, max_terms_per_topic = 4)

# Source:   table<sparklyr_tmp_c906ef32ee5> [?? x 3]
# Database: spark_connection
  topic termIndices termWeights
  <int> <list>      <list>     
1     0 <list [4]>  <list [4]> 
2     1 <list [4]>  <list [4]> 
3     2 <list [4]>  <list [4]> 
4     3 <list [4]>  <list [4]> 

In [26]:
ml_topics_matrix(model)

0,1,2,3
25.428380,0.6507849,1.7032633,0.6392484
27.074551,1.0265282,0.7131571,0.8225259
31.374783,0.6767817,0.5746773,1.0124210
17.947976,0.6334923,1.0516655,1.0166032
19.419317,0.5864663,0.7987232,0.6895287
9.118162,0.6843088,4.0242542,0.6988961
7.505302,1.6644411,1.6320646,0.9787012
10.537501,1.0506819,0.9983140,0.7116161
12.488327,0.7134782,2.0064575,0.7665760
10.163410,0.9749987,0.7729269,0.6839072


In [31]:
dim(ml_topics_matrix(model))

In [None]:
features_tbl <- first_tbl %>% ft_tokenizer(input_col = "text", output_col = "tokens") %>%
  ft_count_vectorizer("tokens", "features") %>%
  ml_lda(features_col = "features", k = 4)