---
title: "Example: Chinese text analysis"
author: "Haiyan Wang"
output:
html_document:
toc: true
---
```{r, echo = FALSE}
knitr::opts_chunk$set(
collapse = FALSE,
comment = "##"
)
```
```{r, message=FALSE, warning=FALSE}
library("quanteda")
```
# Download corpus
Download the corpus constructed from the *Report on the Work of the Government*, published by the Premier of the State Council between 1954 and 2017. You can download the corpus using the **quanteda.corpora** package.
```{r, message=FALSE,eval=FALSE}
# install the corpus package from GitHub (only needed once)
remotes::install_github("quanteda/quanteda.corpora")
```
```{r}
library("quanteda.corpora")
corp <- quanteda.corpora::download(url = "https://www.dropbox.com/s/37ojd5knz1qeyul/data_corpus_chinesegovreport.rds?dl=1")
```
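Before tokenizing, it can help to confirm that the corpus loaded as expected. The short check below is an illustrative addition; it only assumes that the corpus carries a `Year` document variable, which is used later in this example.

```{r}
# number of documents and a glimpse of the document variables
ndoc(corp)
head(docvars(corp))
```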
# Tokenization
```{r}
# Chinese stopwords
ch_stop <- stopwords("zh", source = "misc")
# tokenize
toks_ch <- corp |>
    tokens(remove_punct = TRUE) |>
    tokens_remove(pattern = ch_stop)
# Construct a dfm
dfmat_ch <- dfm(toks_ch)
# Get most frequent features
topfeatures(dfmat_ch)
```
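To see how a particular term is used, you can take a keyword-in-context view of the cleaned tokens. This is an illustrative addition; the keyword 改革 ("reform") is an arbitrary choice, not part of the original example.

```{r}
# keyword-in-context: show up to five occurrences of 改革 ("reform")
head(kwic(toks_ch, pattern = "改革"), 5)
```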
# Analysis
## Word cloud
```{r, fig.height = 5, fig.width = 5, fig.align = "center"}
# load the plotting package
library("quanteda.textplots")
set.seed(100)
# plot a word cloud; on macOS, a Chinese-capable font is set explicitly
textplot_wordcloud(dfmat_ch, min_count = 500, random_order = FALSE,
                   rotation = .25, max_words = 100,
                   min_size = 0.5, max_size = 2.8,
                   font = if (Sys.info()["sysname"] == "Darwin") "SimHei" else NULL,
                   color = RColorBrewer::brewer.pal(8, "Dark2"))
```
## Feature co-occurrence matrix
```{r}
# fcm within the window size of 5
corp_ch17 <- corpus_subset(corp, Year == "2017")
toks_ch17 <- corp_ch17 |>
    tokens(remove_punct = TRUE) |>
    tokens_remove(pattern = ch_stop)
fcmat_ch <- fcm(toks_ch17, context = "window", window = 5)
```
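One way to use the feature co-occurrence matrix is to plot a semantic network of the most frequent features with **quanteda.textplots** (loaded above). The sketch below is an illustrative addition; the choice of 30 features and the `min_freq` value are arbitrary.

```{r, fig.height = 6, fig.width = 6, fig.align = "center"}
# select the 30 most frequent features and plot their co-occurrence network
feat <- names(topfeatures(fcmat_ch, 30))
fcm_select(fcmat_ch, pattern = feat) |>
    textplot_network(min_freq = 0.8)
```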
## Unsupervised document scaling
In this example, we run a Wordfish model to show how to apply an unsupervised document scaling method to Chinese texts.
```{r fig2, fig.height = 4, fig.width = 6, fig.align = "center"}
library("quanteda.textmodels")
tmod_wf <- textmodel_wordfish(dfmat_ch)
# the report was not delivered in some years; keep only the years
# that correspond to documents in the corpus
y <- 1954:2017
y <- y[y <= 1964 | y >= 1975]
y <- y[!y %in% c(1963, 1961, 1962, 1976, 1977)]
plot(y, tmod_wf$theta, xlab = "Year", ylab = "Position")
```
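Besides document positions, the Wordfish model also estimates feature (word) positions. As an illustrative addition, these can be plotted with `textplot_scale1d()` from **quanteda.textplots** (loaded above); note that Chinese labels may need a suitable font on some systems.

```{r, fig.height = 5, fig.width = 6, fig.align = "center"}
# estimated word positions (beta) against word fixed effects (psi)
textplot_scale1d(tmod_wf, margin = "features")
```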
## Collocations
```{r}
library("quanteda.textstats")
# bigrams across the whole dataset
tstat_col_ch <- textstat_collocations(toks_ch, size = 2, min_count = 20)
knitr::kable(head(tstat_col_ch, 10))
# bigrams in 2017 report
tstat_col_ch17 <- textstat_collocations(toks_ch17, size = 2)
knitr::kable(head(tstat_col_ch17, 10))
```
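The detected collocations can also be fed back into tokenization: `tokens_compound()` joins them into single multi-word tokens. The sketch below is an illustrative addition; the z > 3 cut-off is an arbitrary threshold, not part of the original example.

```{r}
# compound statistically significant bigrams into single tokens
toks_ch_comp <- tokens_compound(toks_ch, pattern = tstat_col_ch[tstat_col_ch$z > 3, ])
# most frequent features after compounding
topfeatures(dfm(toks_ch_comp))
```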