diff --git a/NEWS b/NEWS index 67a82e8..4a58aa4 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,8 @@ -stopwords v0.9 (Release date: TBC) +stopwords v0.9.9000 +============== +* Added Gujarati to misc stopwords. + +stopwords v0.9 (Release date: 2017-12-14) ============== Changes: diff --git a/R/data.r b/R/data.r index 9e5445c..f18a9f3 100644 --- a/R/data.r +++ b/R/data.r @@ -33,12 +33,18 @@ #' @section Usage: #' \code{stopwords(language, source = "misc")} #' @source -#' The Arabic stopwords come from \url{https://sites.google.com/site/kevinbouge/stopwords-lists}. +#' The Arabic stopwords come from +#' \url{https://sites.google.com/site/kevinbouge/stopwords-lists}. #' -#' The Catalan stopwords come from \url{http://latel.upf.edu/morgana/altres/pub/ca_stop.htm}. +#' The Catalan stopwords come from +#' \url{http://latel.upf.edu/morgana/altres/pub/ca_stop.htm}. #' #' The Greek stopwords were supplied by Carsten Schwemmer (see -#' \url{https://github.com/kbenoit/quanteda/issues/282}). +#' \url{https://github.com/quanteda/quanteda/issues/282}). +#' +#' The Gujarati stopwords are taken from +#' \url{https://github.com/gujarati-ir/Gujarati-Stop-Words} and modified by +#' Chandrakant Bhogayata. #' #' The Chinese stopwords are taken from the #' \href{http://www.baiduguide.com/baidu-stopwords/}{Baidu stopword list}. diff --git a/README.Rmd b/README.Rmd index 6691102..b05196f 100644 --- a/README.Rmd +++ b/README.Rmd @@ -18,7 +18,7 @@ knitr::opts_chunk$set( [![Downloads](https://cranlogs.r-pkg.org/badges/stopwords)](https://CRAN.R-project.org/package=stopwords) [![Total Downloads](https://cranlogs.r-pkg.org/badges/grand-total/stopwords?color=orange)](https://CRAN.R-project.org/package=stopwords) -R package providing "one-stop shopping" (or should that be "one-shop stopping"?) for stopword lists in R, for multiple languages and sources. No longer should text analysis or NLP packages bake in their own stopword lists or functions, since this package can accomodate them all, and is easily extended. +R package providing "one-stop shopping" (or should that be "one-shop stopping"?) for stopword lists in R, for multiple languages and sources. No longer should text analysis or NLP packages bake in their own stopword lists or functions, since this package can accommodate them all, and is easily extended. Created by [David Muhr](https://github.com/davnn), and extended in cooperation with [Kenneth Benoit](https://github.com/kbenoit) and [Kohei Watanabe](https://github.com/koheiw). @@ -41,7 +41,7 @@ head(stopwords::stopwords("de", source = "snowball"), 20) head(stopwords::stopwords("de", source = "stopwords-iso"), 20) ``` -For compability with the former `quanteda::stopwords()`: +For compatibility with the former `quanteda::stopwords()`: ```{r} head(stopwords::stopwords("german"), 20) @@ -86,6 +86,7 @@ The following languages are currently available: | Galician | gl | ✔ | | | | | German | de | ✔ | ✔ | | | | Greek | el | ✔ | | | ✔ | +| Gujarati | gu | | | | ✔ | | Hausa | ha | ✔ | | | | | Hebrew | he | ✔ | | | | | Hindi | hi | ✔ | | | | diff --git a/README.md b/README.md index af6555e..475ae31 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,8 @@ Downloads](https://cranlogs.r-pkg.org/badges/grand-total/stopwords?color=orange) R package providing “one-stop shopping” (or should that be “one-shop stopping”?) for stopword lists in R, for multiple languages and sources. No longer should text analysis or NLP packages bake in their own -stopword lists or functions, since this package can accomodate them all, -and is easily extended. +stopword lists or functions, since this package can accommodate them +all, and is easily extended. Created by [David Muhr](https://github.com/davnn), and extended in cooperation with [Kenneth Benoit](https://github.com/kbenoit) and [Kohei @@ -47,7 +47,7 @@ head(stopwords::stopwords("de", source = "stopwords-iso"), 20) ## [16] "allerdings" "alles" "allgemeinen" "als" "also" ``` -For compability with the former `quanteda::stopwords()`: +For compatibility with the former `quanteda::stopwords()`: ``` r head(stopwords::stopwords("german"), 20) @@ -104,6 +104,7 @@ available: | Galician | gl | ✔ | | | | | German | de | ✔ | ✔ | | | | Greek | el | ✔ | | | ✔ | +| Gujarati | gu | | | | ✔ | | Hausa | ha | ✔ | | | | | Hebrew | he | ✔ | | | | | Hindi | hi | ✔ | | | | diff --git a/data-raw/other/add-gujarati.R b/data-raw/other/add-gujarati.R new file mode 100644 index 0000000..5c79ac0 --- /dev/null +++ b/data-raw/other/add-gujarati.R @@ -0,0 +1,8 @@ +# read Gujarati stopwords +stopwords_gujarati <- readLines("data-raw/other/gujarati-stopwords.txt", + encoding = "UTF-8") + +# add Gujarati to misc source +data_stopwords_misc$gu <- stopwords_gujarati +data_stopwords_misc <- data_stopwords_misc[order(names(data_stopwords_misc))] +usethis::use_data(data_stopwords_misc, overwrite = TRUE) diff --git a/data-raw/other/gujarati-stopwords.txt b/data-raw/other/gujarati-stopwords.txt new file mode 100644 index 0000000..dd10cc2 --- /dev/null +++ b/data-raw/other/gujarati-stopwords.txt @@ -0,0 +1,210 @@ +અથવા +અને +અમને +અમારું +અમે +અહીં +આ +આગળ +આથી +આનું +આને +આપણને +આપણું +આપણે +આપી +આવી +આવે +ઉપર +ઊંચે +ઊભું +એ +એક +એના +એનાં +એની +એનું +એને +એનો +એમ +એવા +એવાં +એવી +એવું +એવો +ઓછું +અંગે +અંદર +કઈ +કયું +કયો +કરવું +કરતાં +કરી +કરીએ +કરું +કરે +કરેલું +કર્યા +કર્યાં +કર્યું +કર્યો +કંઈક +કાંઈ +કે +કેટલું +કેમ +કેવી +કેવું +કોઈ +કોઈક +કોણ +કોણે +કોને +ક્યારે +ક્યાં +ખૂબ +ગઈ +ગયા +ગયાં +ગયું +ગયો +ઘણું +છ +છતાં +છીએ +છું +છે +છેક +છો +જ +જાય +જી +જે +જેટલું +જેને +જેમ +જેવી +જેવું +જેવો +જો +જોઈએ +જ્યારે +જ્યાં +ઝાઝું +તને +તમને +તમારું +તમે +તારાથી +તારામાં +તારું +તું +તે +તેઓ +તેથી +તેણે +તેના +તેની +તેનું +તેને +તેમ +તેમનું +તેમને +તેવી +તેવું +તેં +તો +ત્યારે +ત્યાં +થઈ +થઈએ +થતા +થતાં +થતી +થતું +થતો +થયા +થયાં +થયું +થયો +થયેલું +થવું +થાઉં +થાઓ +થાય +થોડું +દરેક +ન +નથી +નહિ +નહીં +નં. +ના +નીચે +ને +પછી +પણ +પર +પરંતુ +પહેલાં +પાછળ +પાસે +પોતાનું +પ્રત્યેક +ફક્ત +ફરી +ફરીથી +બધા +બધું +બની +બહાર +બહુ +બંને +બાદ +બે +મને +મા +માટે +માત્ર +મારું +મૂકવું +મૂકી +મૂક્યા +મૂક્યાં +મૂક્યું +મેં +રહી +રહે +રહેવું +રહ્યા +રહ્યાં +રહ્યો +રીતે +રૂ. +લેતા +લેતું +લેવા +વગેરે +વધુ +શકે +શા +શું +સરખું +સામે +સુધી +હતા +હતાં +હતી +હતું +હશે +હશો +હવે +હા +હું +હો +હોઈ +હોઈશ +હોઈશું +હોય +હોવા diff --git a/data/data_stopwords_misc.rda b/data/data_stopwords_misc.rda index d1a850d..4d65a8a 100644 Binary files a/data/data_stopwords_misc.rda and b/data/data_stopwords_misc.rda differ diff --git a/man/data_stopwords_misc.Rd b/man/data_stopwords_misc.Rd index 3609da7..2998f4a 100644 --- a/man/data_stopwords_misc.Rd +++ b/man/data_stopwords_misc.Rd @@ -4,14 +4,20 @@ \name{data_stopwords_misc} \alias{data_stopwords_misc} \title{miscellaneous stopword lists} -\format{An object of class \code{list} of length 4.} +\format{An object of class \code{list} of length 5.} \source{ -The Arabic stopwords come from \url{https://sites.google.com/site/kevinbouge/stopwords-lists}. +The Arabic stopwords come from +\url{https://sites.google.com/site/kevinbouge/stopwords-lists}. -The Catalan stopwords come from \url{http://latel.upf.edu/morgana/altres/pub/ca_stop.htm}. +The Catalan stopwords come from +\url{http://latel.upf.edu/morgana/altres/pub/ca_stop.htm}. The Greek stopwords were supplied by Carsten Schwemmer (see -\url{https://github.com/kbenoit/quanteda/issues/282}). +\url{https://github.com/quanteda/quanteda/issues/282}). + +The Gujarati stopwords are taken from +\url{https://github.com/gujarati-ir/Gujarati-Stop-Words} and modified by +Chandrakant Bhogayata. The Chinese stopwords are taken from the \href{http://www.baiduguide.com/baidu-stopwords/}{Baidu stopword list}.