RcmdrMarkdown.Rmd

---
title: "Replace with Main Title"
author: "Your Name"
date: "2019-05-10 13:40:02"
---


```{r echo=FALSE, message=FALSE}
# include this code chunk as-is to set options
# comment=NA prints chunk output without the default "##" line prefix, and
# prompt=TRUE shows the R prompt in front of echoed code.
knitr::opts_chunk$set(comment=NA, prompt=TRUE)
# The packages are attached in this order; a package attached later masks
# same-named objects from one attached earlier, so keep the sequence as is.
library(Rcmdr)
library(car)
library(RcmdrMisc)
```


```{r echo=FALSE}
# include this code chunk as-is to enable 3D graphs
library(rgl)
# Register a knitr chunk hook under the name "webgl". hook_webgl is not
# defined in this file; it is resolved through the search path, presumably
# from the rgl package attached on the line above.
# NOTE(review): chunks presumably opt in with the chunk option webgl=TRUE --
# confirm against the rgl documentation.
knitr::knit_hooks$set(webgl = hook_webgl)
```


```{r}
# Prefer fixed to scientific notation
options(scipen = 5)

# Print numbers with two significant digits
options(digits = 2)
options(R2HTML.format.digits = 2)

# Set a nice color palette for plots.
# The index vector below reaches element 9, so all nine "Set1" colours must be
# requested: asking for only eight left the last palette entry NA. The first
# two colours are swapped and entry 6 is skipped. The same reordered palette
# is used for both symbols and fills, so it is computed once; local() keeps
# the helper variable out of the workspace.
local({
  set1_palette <- RColorBrewer::brewer.pal(9, "Set1")[c(2:1, 3:5, 7:9)]
  lattice.options(default.theme = latticeExtra::custom.theme(
    symbol = set1_palette,
    fill = set1_palette,
    region = RColorBrewer::brewer.pal(n = 11, name = "Spectral")
  ))
})
```
# Import a folder of text files as a corpus, build a document-term matrix
# without stop words, and derive term tables from it.

# Folder holding the source documents and the language code used for reading
# them and for the stop-word list. Defined once here instead of being
# repeated inline.
corpus_dir <- "d:/Users/Felipe/Desktop/livro"
corpus_lang <- "pt"

# Drop the objects left by a previous run. Restricting the list to names that
# actually exist avoids the "object not found" warnings a plain rm() gives in
# a fresh session.
rm(list = intersect(c("corpus", "corpusVars"), ls()))

corpus <- VCorpus(
  DirSource(corpus_dir, encoding = "ISO-8859-1"),
  readerControl = list(language = corpus_lang)
)

# One row per document (row names are the document names) with a single
# empty placeholder factor, registered as the active data set.
corpusVars <- data.frame(
  var1 = factor(rep("", length(corpus))),
  row.names = names(corpus)
)
activeDataSet("corpusVars")
setCorpusVariables()

# Clean a copy of the corpus so that `corpus` itself keeps the original text.
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
# Replace every character in the Unicode categories P (punctuation),
# S (symbols), Z (separators) and C (control/other) with a space.
dtmCorpus <- tm_map(
  dtmCorpus,
  content_transformer(function(x) {
    gsub("\\p{P}|\\p{S}|\\p{Z}|\\p{C}", " ", x, perl = TRUE)
  })
)
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)

# tolower=FALSE because the text was already lower-cased above;
# wordLengths=c(2, Inf) sets a minimum term length of two with no maximum.
dtm <- DocumentTermMatrix(
  dtmCorpus,
  control = list(tolower = FALSE, wordLengths = c(2, Inf))
)
rm(dtmCorpus)

# Flag stop words once; the mask is used both to label terms in the
# dictionary and to drop those columns from the matrix.
is_stopword <- colnames(dtm) %in% stopwords(corpus_lang)
dictionary <- data.frame(
  row.names = colnames(dtm),
  "Occurrences" = col_sums(dtm),
  "Stopword" = ifelse(is_stopword, "Stopword", ""),
  stringsAsFactors = FALSE
)
dtm <- dtm[, !is_stopword]
# The dictionary is built before the filtering above, so it still lists the
# stop words that were removed from the matrix.
attr(dtm, "dictionary") <- dictionary
rm(dictionary, is_stopword)

# Record the language and the processing steps on both the corpus and the
# matrix.
meta(corpus, type = "corpus", tag = "language") <-
  attr(dtm, "language") <- corpus_lang
meta(corpus, type = "corpus", tag = "processing") <-
  attr(dtm, "processing") <- c(
    lowercase = TRUE, punctuation = TRUE, digits = TRUE, stopwords = TRUE,
    stemming = FALSE, customStemming = FALSE, twitter = FALSE,
    removeHashtags = NA, removeNames = NA
  )
# corpus
# dtm

# NOTE(review): the arguments NULL and 100 are presumably "no grouping
# variable" and "number of terms to keep" -- confirm against the
# frequentTerms() documentation.
freqTerms <- frequentTerms(dtm, NULL, 100)
attr(freqTerms, "title") <- "Most frequent terms by Document"
freqTerms

dict <- termsDictionary(dtm, "occurrences")
attr(dict, "title") <- "Terms dictionary sorted by number of occurrences"
dict

# Keep the rows not flagged as stop words, reduce the table to its first
# column, and copy the row names into an ordinary `words` column.
dict2 <- subset(dict, dict$Stopword != "Stopword")
dict2 <- as.data.frame(dict2[1])
dict2$words <- rownames(dict2)