In [None]:
#| hide
import os

In [None]:
#| hide
# Base directories for the example data used by the recipes on this page.
# The home directory is resolved with os.path.expanduser so the paths remain valid
# when the HOME environment variable is unset (os.environ.get("HOME") would then
# return None and produce the literal path "None/data/").
# The trailing slashes are intentional: other cells append file names directly to these strings.
source_path = f'{os.path.expanduser("~")}/data/'
save_path = f'{os.path.expanduser("~")}/data/conc-test-corpora/'

# Quick Conc Recipes

> Below are code snippets for common tasks in Conc.
- toc: false
- page-layout: full

The [Get started with Conc](https://geoffford.nz/conc/tutorials/start.html) tutorial (in progress) steps through Conc functionality in detail. This page provides simple code recipes for common Conc tasks. See the [Conc API Reference](https://geoffford.nz/conc/api) for information on available methods, functions and parameters.

::: {.callout-note collapse="true"}
##### Minimal import to use Conc functionality

In [None]:
#| eval: false
from conc.corpus import Corpus # for building/saving, loading, and working with a corpus
from conc.conc import Conc # for running reporting on your corpus 

:::

::: {.callout-note collapse="true"}
##### Build a Conc corpus from text files (or a compressed archive of text files)

In [None]:
#| eval: false
# name and description passed to the Corpus constructor
name = 'Garden Party Corpus'
description = 'A corpus of short stories from The Garden Party: and Other Stories by Katherine Mansfield. Texts downloaded from Project Gutenberg https://gutenberg.org/ and are in the public domain. The text files contain the short story without the title. https://github.com/ucdh/scraping-garden-party'

# archive of text files to build from, located under source_path
source_file = 'garden-party-corpus.zip'
archive_path = f'{source_path}{source_file}'

# build the corpus from the archive, saving it under save_path, then display its summary
corpus = Corpus(name=name, description=description)
corpus = corpus.build_from_files(source_path=archive_path, save_path=save_path)
corpus.summary()

Corpus Summary,Corpus Summary
Attribute,Value
Name,Garden Party Corpus
Description,A corpus of short stories from The Garden Party: and Other Stories by Katherine Mansfield. Texts downloaded from Project Gutenberg https://gutenberg.org/ and are in the public domain. The text files contain the short story without the title. https://github.com/ucdh/scraping-garden-party
Date Created,2025-06-14 23:14:31
Conc Version,0.1.3
Corpus Path,/home/geoff/data/conc-test-corpora//garden-party.corpus
Document Count,15
Token Count,74664
Word Token Count,63311
Unique Tokens,5410
Unique Word Tokens,5398


:::

::: {.callout-note collapse="true"}
##### Load a Conc corpus

In [None]:
#| eval: false
# location of the previously saved corpus under save_path
corpus_path = f'{save_path}garden-party.corpus'
corpus = Corpus().load(corpus_path=corpus_path)

:::

::: {.callout-note collapse="true"}
##### Prepare to report on your corpus

In [None]:
#| eval: false
# create the Conc object used to run reports on the corpus built or loaded above
conc = Conc(corpus=corpus)

:::

::: {.callout-note collapse="true"}
##### Frequency report

In [None]:
#| eval: false
# run the frequency report for the corpus and display the resulting table
conc.frequencies().display()

Frequencies,Frequencies,Frequencies,Frequencies
"Frequencies of word tokens, Garden Party Corpus","Frequencies of word tokens, Garden Party Corpus","Frequencies of word tokens, Garden Party Corpus","Frequencies of word tokens, Garden Party Corpus"
Rank,Token,Frequency,Normalized Frequency
1,the,2911,459.79
2,and,1798,283.99
3,“,1615,255.09
4,”,1614,254.93
5,a,1407,222.24
6,to,1376,217.34
7,she,1171,184.96
8,was,1102,174.06
9,it,1021,161.27
10,her,937,148.00


:::

::: {.callout-note collapse="true"}
##### Ngram frequencies report

In [None]:
#| eval: false
# run the ngram frequency report for ngrams of length 2 and display the resulting table
conc.ngram_frequencies(ngram_length = 2).display()

Ngram Frequencies,Ngram Frequencies,Ngram Frequencies,Ngram Frequencies
Garden Party Corpus,Garden Party Corpus,Garden Party Corpus,Garden Party Corpus
Rank,Ngram,Frequency,Normalized Frequency
1,” said,328,51.81
2,” “,326,51.49
3,it was,247,39.01
4,in the,214,33.80
5,“ i,197,31.12
6,on the,183,28.90
7,of the,156,24.64
8,” she,156,24.64
9,to the,139,21.96
10,at the,133,21.01


:::

::: {.callout-note collapse="true"}
##### Keywords report

In [None]:
#| eval: false
# load a second corpus to compare against and register it as the reference corpus
reference_corpus = Corpus().load(corpus_path=f'{save_path}brown.corpus')
conc.set_reference_corpus(reference_corpus)

# display the keywords report for the target corpus against the reference corpus
# NOTE(review): the two min_document_frequency arguments presumably require a token to occur in
# at least 5 documents of the target and reference corpus respectively - confirm in the API reference
conc.keywords(min_document_frequency = 5, min_document_frequency_reference = 5).display()

Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords,Keywords
"Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus","Target corpus: Garden Party Corpus, Reference corpus: Brown Corpus"
Rank,Token,Frequency,Frequency Reference,Normalized Frequency,Normalized Frequency Reference,Relative Risk,Log Ratio,Log Likelihood
1,bye,25,7,3.95,0.07,55.29,5.79,107.37
2,velvet,14,5,2.21,0.05,43.35,5.44,57.19
3,shone,13,5,2.05,0.05,40.25,5.33,52.21
4,queer,15,6,2.37,0.06,38.70,5.27,59.69
5,gloves,17,7,2.69,0.07,37.60,5.23,67.18
6,cried,59,26,9.32,0.27,35.13,5.13,229.24
7,faintly,14,7,2.21,0.07,30.96,4.95,52.61
8,darling,36,18,5.69,0.18,30.96,4.95,135.27
9,oh,149,93,23.53,0.95,24.80,4.63,524.30
10,handkerchief,14,9,2.21,0.09,24.08,4.59,48.80


:::

::: {.callout-note collapse="true"}
##### Collocates report

In [None]:
#| eval: false
# run the collocates report for the token 'could' and display the resulting table
conc.collocates('could').display()

"Collocates of ""could""","Collocates of ""could""","Collocates of ""could""","Collocates of ""could""","Collocates of ""could""","Collocates of ""could"""
Garden Party Corpus,Garden Party Corpus,Garden Party Corpus,Garden Party Corpus,Garden Party Corpus,Garden Party Corpus
Rank,Token,Collocate Frequency,Frequency,Logdice,Log Likelihood
1,n’t,108,522,12.25,233.49
2,have,33,240,11.24,47.15
3,she,94,1171,11.13,59.84
4,he,55,718,10.93,31.84
5,they,35,398,10.89,26.42
6,it,70,1021,10.87,31.56
7,be,26,251,10.86,25.59
8,help,12,27,10.71,45.92
9,what,23,270,10.63,16.39
10,not,18,229,10.40,10.99


:::

::: {.callout-note collapse="true"}
##### Concordance

In [None]:
#| eval: false
# display concordance lines for 'could' with 10 context tokens either side of the node
# order = '1R2R3R' appears to sort lines by the tokens to the right of the node (1st, then 2nd, then 3rd) - confirm in the API reference
conc.concordance('could', order = '1R2R3R', context_length = 10).display()

"Concordance for ""could""","Concordance for ""could""","Concordance for ""could""","Concordance for ""could"""
"Garden Party Corpus, Context tokens: 10, Order: 1R2R3R","Garden Party Corpus, Context tokens: 10, Order: 1R2R3R","Garden Party Corpus, Context tokens: 10, Order: 1R2R3R","Garden Party Corpus, Context tokens: 10, Order: 1R2R3R"
Document Id,Left,Node,Right
4,herself and got as close to the sea as she,could,", and sung something , something she had made up"
11,I sat up and called out as loud as I,could,", “ _ I do want to go on a"
13,’ll not be a minute . ” And before he,could,answer she was gone . He had half a mind
4,until I ’ve had something . Do you think we,could,ask Kate for two cups of hot water ? ”
2,", and there will be no time to explain what",could,be explained so simply .... But to - night it
1,Here ’s this huge house and garden . Surely you,could,be happy in — in — appreciating it for a
12,"away from the Listening Ear . Good Heavens , what",could,be more tragic than that lament ! Every note was
1,"even a successful , established , big paying concern —",could,be played with . A man had either to put
8,play . It was exactly like a play . Who,could,believe the sky at the back was n’t painted ?
13,. Was her luggage ready ? In that case they,could,cut off sharp with her cabin luggage and let the


:::

::: {.callout-note collapse="true"}
##### Concordance plot

In [None]:
#| eval: false
# produce a concordance plot for 'could'; the call is left as the last expression of the cell
# so that whatever it returns or renders is shown as the cell output
conc.concordance_plot('could')

:::

::: {.callout-note collapse="true"}
##### Work with Conc report data using other Python libraries (e.g. Pandas)

You are not restricted to Conc for your analysis. Conc report data can be exported to other formats. For example, although Conc uses [Polars](https://pola.rs/) internally for its efficiency, you can convert report results into a [Pandas](https://pandas.pydata.org/) dataframe, which is flexible and interoperable with many Python libraries for data analysis. Here is an example ...

In [None]:
#| eval: false
import pandas as pd # Conc does not require Pandas - so you will need to install it with "pip install pandas"

In [None]:
#| eval: false
# some specific tokens to restrict the results below to
pronouns = ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'mine', 'yours',
            'his', 'hers', 'its', 'ours', 'theirs']

# retrieve keyword report for these pronouns, which has normalized frequencies for each corpus
# the next line returns the result as a Polars dataframe, which is then converted to a Pandas dataframe
# NOTE(review): page_size is not passed here; per the original note, page_size = 0 returns all rows -
# confirm against the API reference and add it if the results are truncated
df = conc.keywords(restrict_tokens = pronouns).to_frame().to_pandas()

# now go for it - do the pandas stuff you are familiar with ...
# e.g. show a table of results ordered by frequency descending
df.sort_values(by='frequency', ascending=False)[['token', 'normalized_frequency', 'normalized_frequency_reference']]

Unnamed: 0,token,normalized_frequency,normalized_frequency_reference
0,she,184.95996,21.01732
7,it,161.267394,71.326254
2,her,147.999558,29.454856
6,i,113.566363,44.585285
12,he,113.408412,69.091889
5,you,101.404179,33.311432
15,his,69.498191,66.551446
9,they,62.864273,28.995739
10,them,33.169591,18.232015
14,him,30.168533,26.720564


:::