-
Notifications
You must be signed in to change notification settings - Fork 0
/
RcmdrMarkdown.Rmd
73 lines (61 loc) · 2.38 KB
/
RcmdrMarkdown.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
---
title: "Replace with Main Title"
author: "Your Name"
date: "2019-05-10 13:40:02"
---
```{r echo=FALSE, message=FALSE}
# include this code chunk as-is to set options
knitr::opts_chunk$set(comment=NA, prompt=TRUE)
library(Rcmdr)
library(car)
library(RcmdrMisc)
```
```{r echo=FALSE}
# include this code chunk as-is to enable 3D graphs
library(rgl)
knitr::knit_hooks$set(webgl = hook_webgl)
```
```{r}
# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
"Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
```
rm(corpus, corpusVars)
corpus <- VCorpus(DirSource("d:/Users/Felipe/Desktop/livro", encoding="ISO-8859-1"),
readerControl=list(language="pt"))
corpusVars <- data.frame(var1=factor(rep("", length(corpus))), row.names=names(corpus))
activeDataSet("corpusVars")
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x) gsub("\\p{P}|\\p{S}|\\p{Z}|\\p{C}", " ", x,
perl=TRUE)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE, wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm), "Occurrences"=col_sums(dtm),
"Stopword"=ifelse(colnames(dtm) %in% stopwords("pt"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("pt")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "pt"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <- c(lowercase=TRUE,
punctuation=TRUE, digits=TRUE, stopwords=TRUE, stemming=FALSE, customStemming=FALSE, twitter=FALSE,
removeHashtags=NA, removeNames=NA)
# corpus
# dtm
freqTerms <- frequentTerms(dtm, NULL, 100)
attr(freqTerms, "title") <- "Most frequent terms by Document"
freqTerms
dict <- termsDictionary(dtm, "occurrences")
attr(dict, "title") <- "Terms dictionary sorted by number of occurrences"
dict
dict2=subset(dict, dict$Stopword != "Stopword")
dict2=as.data.frame(dict2[1])
dict2$words=rownames(dict2)