# Corpus: Chadwyck-Healey poetry collections

## Loading corpus from source

In [None]:
import sys
sys.path.append('../')
from generative_formalism import *

# Report whether the local Chadwyck-Healey corpus files are configured and present on disk.
# The truthiness guard short-circuits so os.path.exists() is never called on an unset path.
print(f"{'✓' if PATH_CHADWYCK_HEALEY_TXT and os.path.exists(PATH_CHADWYCK_HEALEY_TXT) else 'X'} Chadwyck-Healey corpus path: {PATH_CHADWYCK_HEALEY_TXT}")
print(f"{'✓' if PATH_CHADWYCK_HEALEY_METADATA and os.path.exists(PATH_CHADWYCK_HEALEY_METADATA) else 'X'} Chadwyck-Healey metadata path: {PATH_CHADWYCK_HEALEY_METADATA}")

# Download if necessary? Only reports whether the download URLs are set; nothing is fetched here.
# (The original tested each URL against itself, `X and X`, which is equivalent to a plain truthiness test.)
print(f"{'✓' if URL_CHADWYCK_HEALEY_METADATA else 'X'} Metadata file URL set in environment (.env or shell)")
print(f"{'✓' if URL_CHADWYCK_HEALEY_TXT else 'X'} Corpus text file URL set in environment (.env or shell)")

In [None]:
# Load the Chadwyck-Healey metadata table, then print its summary breakdowns.
printm('### Loading corpus metadata')
df_meta = get_chadwyck_corpus_metadata()
describe_corpus(df_meta)

In [None]:
# Load the corpus itself; the bare name on the last line lets Jupyter display the result.
printm('### Loading corpus text files')
df_corpus = get_chadwyck_corpus()
df_corpus

## Sampling corpus by period

In [None]:
# Load the by-period sample as used in the paper and summarise it.
printm('### Loading period sample in paper')
df_smpl_by_period_in_paper = get_chadwyck_corpus_sampled_by_period_as_in_paper()
# Sanity check: this sample is expected to hold exactly 8000 rows.
assert len(df_smpl_by_period_in_paper) == 8000
describe_corpus(df_smpl_by_period_in_paper)

In [9]:
# Obtain the replicated by-period sample and summarise it.
printm('### Replicating period sample')
df_smpl_by_period_replicated = get_chadwyck_corpus_sampled_by_period_as_replicated()
# Sanity check: the replication must have the same size as the paper's sample (8000 rows).
assert len(df_smpl_by_period_replicated) == 8000
describe_corpus(df_smpl_by_period_replicated)

### Replicating period sample

* Loading period sample from /Users/rj416/github/generative-formalism/data/corpus_sample_by_period.replicated.csv.gz


----

#### Subcorpus breakdown

subcorpus
English Poetry              5549
American Poetry             1797
Modern Poetry                290
African-American Poetry      219
The Faber Poetry Library     145
Name: count, dtype: int64



----

#### Historical period breakdown (from metadata)

period_meta
1900-1999 Twentieth-Century                    2453
1700-1749 Early Eighteenth-Century              998
1800-1834 Early Nineteenth-Century              735
1835-1869 Mid Nineteenth-Century                666
1603-1660 Jacobean and Caroline                 658
1750-1799 Later Eighteenth-Century              616
1870-1899 Later Nineteenth-Century              565
1660-1700 Restoration                           499
1550-1900 Miscellanies and Collections           69
1500-1700 Emblems, Epigrams, Formal Satires      50
1500-1580 Tudor                                  20
1880-1901 Late Victorian                          4
1860-1880 Mid-Victorian                           2
Name: count, dtype: int64



----

#### Historical period breakdown (from author birth year)

period
1900-1950    1000
1800-1850    1000
1950-2000    1000
1750-1800    1000
1850-1900    1000
1600-1650    1000
1700-1750    1000
1650-1700    1000
Name: count, dtype: int64



----

#### Historical period + subcorpus breakdown

                                    count
period    subcorpus                      
1600-1650 American Poetry              44
          English Poetry              956
1650-1700 American Poetry              14
          English Poetry              986
1700-1750 American Poetry              20
          English Poetry              980
1750-1800 African-American Poetry       5
          American Poetry             192
          English Poetry              803
1800-1850 African-American Poetry       6
          American Poetry             331
          English Poetry              663
1850-1900 African-American Poetry      53
          American Poetry             304
          English Poetry              597
          Modern Poetry                24
          The Faber Poetry Library     22
1900-1950 African-American Poetry      46
          American Poetry             574
          English Poetry              208
          Modern Poetry               135
          The Faber Poetry Library

----

#### Author birth year distribution

author_dob
1600 ------- [ 1699   | 1799 |   1899 ] -------- 1974



----

#### Number of lines in poems

num_lines
10 ------- [ 16   | 26 |   42 ] -------- 100



----

#### Annotated rhyme distribution

rhyme
y      5534
n       102
y n       4
Name: count, dtype: int64



----

#### Metadata

Unnamed: 0_level_0,id,period_meta,subcorpus,author,author_dob,title,year,num_lines,volume,line,rhyme,genre,period,txt
id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,c20-american/am30024/Z300356635,1900-1999 Twentieth-Century,American Poetry,"Clark, Tom, 1941-",1941.0,Burst Phase,1971,12,,I dreamed I was in a green telephone booth,,,1900-1950,I dreamed I was in a green telephone booth\nre...
1,american/am1041/Z200182991,1835-1869 Mid Nineteenth-Century,American Poetry,"Embury, Emma C. (Emma Catherine), 1806-1863",1806.0,"SONNET TO WILLIAM CULLEN BRYANT, WRITTEN IMME...",1836,14,The poems of Mrs. Emma Catherine Embury (1869),"&indent;I owe an hour of intellectual life,",y,Sonnet,1800-1850,"My thanks are thine, most gifted one; to thee\..."
3,english/myersern/Z300449085,1870-1899 Later Nineteenth-Century,English Poetry,"Myers, Ernest, 1844-1921",1844.0,ITALIA UNA,1874,28,Gathered Poems (1904),"And fit for winter flame,",y,,1800-1850,"""What though the branch be broken\nAnd fit for..."
7,c20-english/fa21301/Z200610201,1900-1999 Twentieth-Century,English Poetry,"Maxwell, Glyn, 1962-",1962.0,CAP D'AIL,1992,100,,The chap on the next promontory began,,,1950-2000,The chap on the next promontory began\nand fin...
9,english/bowlesca/Z200285205,1800-1834 Early Nineteenth-Century,English Poetry,"Southey, Caroline Bowles, 1786-1854",1786.0,SONNET.—1818. [Autumnal leaves and flowerets! ...,1816,14,The Poetical Works (1867),&indent;Pale sickly children of the waning year!,y,Sonnet,1750-1800,Autumnal leaves and flowerets! lingering last ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133723,c20-african-american/da20060/Z200329283,1900-1999 Twentieth-Century,African-American Poetry,"McClane, Kenneth A., 1951-",1951.0,The Music of Hunger,1981,22,,How delicately the blossoms fall,,,1950-2000,How delicately the blossoms fall\nOver the fen...
133789,english/somervil/Z200490130,1700-1749 Early Eighteenth-Century,English Poetry,"Somervile, William, 1675-1742",1675.0,From Martial. Epig. 47.,1705,37,Occasional Poems [etc.] (1727),"To live at Ease, and stem the Tide of Fate;",y,,1650-1700,Wou'd you (my Friend) find out the true Receip...
133982,english/packrich/Z200454940,1700-1749 Early Eighteenth-Century,English Poetry,"Pack, Richardson, 1682-1728",1682.0,"STANZAS, OCCASIONED By what happened at Aberd...",1712,12,Poetical Remains (1738),"&indent;When the dear Cause of all his Smart,",y,,1650-1700,How oddly is a Lover fated!\n When the dear...
133988,c20-african-american/da20060/Z200329253,1900-1999 Twentieth-Century,African-American Poetry,"McClane, Kenneth A., 1951-",1951.0,At November's Turn,1981,32,,Now I am worrying,,,1950-2000,Now I am worrying\nthat the last maple might\n...


## Sampling corpus by rhyme

In [10]:
# Load the by-rhyme sample as used in the paper and summarise it.
printm('### Loading rhyme sample in paper')
df_smpl_by_rhyme_in_paper = get_chadwyck_corpus_sampled_by_rhyme_as_in_paper()
# Sanity check: this sample is expected to hold exactly 2000 rows.
assert len(df_smpl_by_rhyme_in_paper) == 2000
describe_corpus(df_smpl_by_rhyme_in_paper)

### Loading rhyme sample in paper

----

#### Subcorpus breakdown

subcorpus
English Poetry             1401
American Poetry             582
African-American Poetry      17
Name: count, dtype: int64



----

#### Historical period breakdown (from metadata)

period_meta
1835-1869 Mid Nineteenth-Century               636
1870-1899 Later Nineteenth-Century             371
1800-1834 Early Nineteenth-Century             298
1750-1799 Later Eighteenth-Century              94
1700-1749 Early Eighteenth-Century              88
1603-1660 Jacobean and Caroline                 41
1660-1700 Restoration                           37
1900-1999 Twentieth-Century                     36
1550-1900 Miscellanies and Collections          29
1500-1700 Emblems, Epigrams, Formal Satires      2
1880-1901 Late Victorian                         1
1500-1580 Tudor                                  1
1860-1880 Mid-Victorian                          1
Name: count, dtype: int64



----

#### Historical period breakdown (from author birth year)

period
1800-1850    956
1750-1800    449
1850-1900    348
1700-1750    132
1600-1650     60
1650-1700     55
Name: count, dtype: int64



----

#### Historical period + subcorpus breakdown

                                   count
period    subcorpus                     
1600-1650 American Poetry              3
          English Poetry              57
1650-1700 English Poetry              55
1700-1750 American Poetry              3
          English Poetry             129
1750-1800 African-American Poetry      3
          American Poetry            126
          English Poetry             320
1800-1850 African-American Poetry      1
          American Poetry            371
          English Poetry             584
1850-1900 African-American Poetry     13
          American Poetry             79
          English Poetry             256



----

#### Author birth year distribution

author_dob
1600 ------- [ 1791   | 1813 |   1841 ] -------- 1891



----

#### Number of lines in poems

num_lines
10 ------- [ 18   | 30 |   48 ] -------- 100



----

#### Annotated rhyme distribution

rhyme
y    1000
n    1000
Name: count, dtype: int64



----

#### Metadata

Unnamed: 0_level_0,id,period_meta,subcorpus,author,author_dob,title,year,num_lines,volume,line,rhyme,genre,period,txt
id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,english-ed2/ep2438/Z300661875,,English Poetry,"Price, Herbert, b. 1858",1858.0,THE FORSAKEN GARDEN,1888,35,Poems and Sonnets by Herbert Price (1914),"In the garden we loved that is now a waste,",y,,1850-1900,"Ah! sweet were the days, and the nights and th..."
1,english/pennecu1/Z200459978,1660-1700 Restoration,English Poetry,"Pennecuik, Alexander, 1652-1722",1652.0,THE CITY AND COUNTRY MOUSE.,1682,50,The Works (1815),"&indent;Met with a city mouse, right smooth an...",y,,1650-1700,"A country mouse, upon a winter's day,\n Met..."
2,english/wattsisa/Z300523040,1750-1799 Later Eighteenth-Century,English Poetry,"Watts, Isaac, 1674-1748",1674.0,SONG 11. Heaven and Hell.,1704,16,The Works (1810),&indent;A heav'n of joy and love;,y,Lyric,1650-1700,There is beyond the sky\n A heaven of joy a...
3,english/hardytho/Z200137433,1870-1899 Later Nineteenth-Century,English Poetry,"Hardy, Thomas, 1840-1928",1840.0,WHEN DEAD,1870,16,,&indent;&indent;I am under the bough;,y,,1800-1850,It will be much better when\n I am unde...
4,english/fawkesfr/Z300372956,1750-1799 Later Eighteenth-Century,English Poetry,"Fawkes, Francis, 1720-1777",1720.0,"III. ON A WORTHY FRIEND, Who was accomplished...",1750,10,Original Poems and Translations (1761),"Thou friendly, candid, virtuous mind, farewel!",y,,1700-1750,"Oh born in liberal studies to excel,\nThou fri..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219124,english-ed2/ep2525/Z200668962,,English Poetry,"Armstrong, Edmund John, 1841-1865",1841.0,WOMAN'S SORROW.,1871,14,The Poetical Works of Edmund J. Armstrong. Edi...,"Tost by a tempest, and ere long in calm",n,,1800-1850,The sorrow of a man is Like the sea\nTost by a...
219130,english-ed2/ep2316/Z200654162,,English Poetry,"Collins, Mortimer, 1827-1876",1827.0,A CAVALIER BALLAD.,1857,37,Idyls and Rhymes. By Mortimer Collins (1855),"Who is gone, in his glory and his sorrow, to the",n,,1800-1850,O alas and alas for the King we could not save...
219174,english/colersam/Z300317124,1800-1834 Early Nineteenth-Century,English Poetry,"Coleridge, Samuel Taylor, 1772-1834",1772.0,TO THE REV. GEORGE COLERIDGE OF OTTERY ST. MA...,1802,77,The Complete Poetical Works (1912),Notus in fratres animi paterni.,n,,1750-1800,"A blessed lot hath he, who having passed\nHis ..."
219178,american/am1066/Z200187826,1835-1869 Mid Nineteenth-Century,American Poetry,"Whitman, Walt, 1819-1892",1819.0,KOSMOS.,1849,10,Leaves of grass (1860–61),"Who is the amplitude of the earth, and the coa...",n,,1800-1850,"Who includes diversity, and is Nature,\nWho is..."


In [11]:
# Obtain the replicated by-rhyme sample and summarise it.
printm('### Replicating rhyme sample')
df_smpl_by_rhyme_replicated = get_chadwyck_corpus_sampled_by_rhyme_as_replicated()
# Sanity check: the replication must have the same size as the paper's sample (2000 rows).
assert len(df_smpl_by_rhyme_replicated) == 2000
describe_corpus(df_smpl_by_rhyme_replicated)

### Replicating rhyme sample

* Loading rhyme sample from /Users/rj416/github/generative-formalism/data/corpus_sample_by_rhyme.replicated.csv.gz


----

#### Subcorpus breakdown

subcorpus
English Poetry             1349
American Poetry             635
African-American Poetry      16
Name: count, dtype: int64



----

#### Historical period breakdown (from metadata)

period_meta
1835-1869 Mid Nineteenth-Century               645
1870-1899 Later Nineteenth-Century             398
1800-1834 Early Nineteenth-Century             291
1750-1799 Later Eighteenth-Century              95
1700-1749 Early Eighteenth-Century              80
1603-1660 Jacobean and Caroline                 43
1900-1999 Twentieth-Century                     37
1660-1700 Restoration                           32
1550-1900 Miscellanies and Collections          25
1860-1880 Mid-Victorian                          4
1500-1700 Emblems, Epigrams, Formal Satires      3
1500-1580 Tudor                                  1
Name: count, dtype: int64



----

#### Historical period breakdown (from author birth year)

period
1800-1850    984
1750-1800    441
1850-1900    319
1700-1750    119
1600-1650     65
1650-1700     64
1900-1950      8
Name: count, dtype: int64



----

#### Historical period + subcorpus breakdown

                                   count
period    subcorpus                     
1600-1650 American Poetry              4
          English Poetry              61
1650-1700 English Poetry              64
1700-1750 American Poetry              2
          English Poetry             117
1750-1800 African-American Poetry      1
          American Poetry            127
          English Poetry             313
1800-1850 African-American Poetry      6
          American Poetry            432
          English Poetry             546
1850-1900 African-American Poetry      9
          American Poetry             70
          English Poetry             240
1900-1950 English Poetry               8



----

#### Author birth year distribution

author_dob
1600 ------- [ 1791   | 1814 |   1840 ] -------- 1923



----

#### Number of lines in poems

num_lines
10 ------- [ 16   | 28 |   48 ] -------- 100



----

#### Annotated rhyme distribution

rhyme
y    1000
n    1000
Name: count, dtype: int64



----

#### Metadata

Unnamed: 0_level_0,id,period_meta,subcorpus,author,author_dob,title,year,num_lines,volume,line,rhyme,genre,period,txt
id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
15,american/am1183/Z300192343,1900-1999 Twentieth-Century,American Poetry,"Robinson, Edwin Arlington, 1869-1935",1869.0,THREE QUATRAINS,1899,12,Collected poems (1937),&indent;Will poets mock it with crowned words ...,y,,1850-1900,As long as Fame's imperious music rings\n W...
16,american/am0613/Z200167632,1835-1869 Mid Nineteenth-Century,American Poetry,"Gallagher, William D. (William Davis), 1808-1894",1808.0,OUR EARLY DAYS.,1838,58,"[Poems, in] Selections from the poetical liter...","We turn on Life's bewildering track,",y,,1800-1850,Our EARLY DAYS! -- How often back\nWe turn on ...
16,english/callanan/Z300294777,1800-1834 Early Nineteenth-Century,English Poetry,"Callanan, Jeremiah Joseph, 1795-1829",1795.0,"“SI JE DE PERDS, JE SUIS PERDU.”",1825,40,The Poems (1861),Shine on thou bright beacon,y,,1750-1800,Shine on thou bright beacon\n Unclouded and...
19,english/wattsisa/Z400522946,1750-1799 Later Eighteenth-Century,English Poetry,"Watts, Isaac, 1674-1748",1674.0,HYMN 135. (L. M.) Types and Prophecies of Christ.,1704,16,The Works (1810),Behold the great Messiah come!,y,Lyric,1650-1700,Behold the woman's promised seed!\nBehold the ...
25,english/mooreedw/Z200444546,1700-1749 Early Eighteenth-Century,English Poetry,"Moore, Edward, 1712-1757",1712.0,SONG the Eighth.,1742,24,"Poems, Fables and Plays (1756)","I always have boasted, and seek not to hide;",y,,1700-1750,"That Jenny's my friend, my delight, and my pri..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233059,english/newmanfr/Z200449912,1835-1869 Mid Nineteenth-Century,English Poetry,"Newman, Francis William, 1805-1897",1805.0,Epilogus.,1835,33,Theism (1858),We praise thee in thy sanctity.,n,,1800-1850,"We praise thee in thy power, O God!\nWe praise..."
233439,american/am0338/Z200157116,1870-1899 Later Nineteenth-Century,American Poetry,"Wilcox, Ella Wheeler, 1850-1919",1850.0,THOUGHTS ON LEAVING JAPAN,1880,30,The worlds and I [1918],"A changing medley of insistent sounds,",n,,1850-1900,"A changing medley of insistent sounds,\nLike b..."
233475,english/landorwa/Z200413028,1800-1834 Early Nineteenth-Century,English Poetry,"Landor, Walter Savage, 1775-1864",1775.0,LORD DUDLEY STUART,1805,22,To Elizabeth Barrett Browning (1917),By the grave's coldness palsied is the hand,n,Elegy,1750-1800,By the grave's coldness palsied is the hand\nO...
233932,american/am1066/Z300185691,1835-1869 Mid Nineteenth-Century,American Poetry,"Whitman, Walt, 1819-1892",1819.0,RESPONDEZ!,1849,68,Leaves of grass (1872),(The war is completed—the price is paid—the ti...,n,,1800-1850,Respondez! Respondez!\n(The war is completed -...


## Sampling corpus by period/subcorpus

In [12]:
# Load the period-by-subcorpus sample as used in the paper and summarise it.
printm('### Loading period/subcorpus sample in paper')
df_smpl_by_period_subcorpus_in_paper = get_chadwyck_corpus_sampled_by_period_subcorpus_as_in_paper()
# Sanity check: only a lower bound is asserted here (more than 8000 rows), not an exact size.
assert len(df_smpl_by_period_subcorpus_in_paper) > 8000
describe_corpus(df_smpl_by_period_subcorpus_in_paper)

### Loading period/subcorpus sample in paper

----

#### Subcorpus breakdown

subcorpus
English Poetry              8000
American Poetry             5775
African-American Poetry     3649
Modern Poetry               2810
The Faber Poetry Library    2484
Name: count, dtype: int64



----

#### Historical period breakdown (from metadata)

period_meta
1900-1999 Twentieth-Century                    12630
1835-1869 Mid Nineteenth-Century                2042
1870-1899 Later Nineteenth-Century              1962
1800-1834 Early Nineteenth-Century              1172
1700-1749 Early Eighteenth-Century              1117
1750-1799 Later Eighteenth-Century              1074
1603-1660 Jacobean and Caroline                  951
1660-1700 Restoration                            566
1550-1900 Miscellanies and Collections           158
1500-1700 Emblems, Epigrams, Formal Satires       51
1500-1580 Tudor                                   22
1880-1901 Late Victorian                           5
1837-1860 Early Victorian                          1
1860-1880 Mid-Victorian                            1
1500-1700 Songbooks                                1
Name: count, dtype: int64



----

#### Historical period breakdown (from author birth year)

period
1900-1950    5000
1850-1900    4672
1950-2000    4436
1800-1850    2544
1750-1800    2286
1600-1650    1361
1700-1750    1345
1650-1700    1074
Name: count, dtype: int64



----

#### Historical period + subcorpus breakdown

                                    count
period    subcorpus                      
1600-1650 American Poetry             361
          English Poetry             1000
1650-1700 American Poetry              74
          English Poetry             1000
1700-1750 African-American Poetry       3
          American Poetry             340
          English Poetry             1000
          The Faber Poetry Library      2
1750-1800 African-American Poetry     284
          American Poetry            1000
          English Poetry             1000
          The Faber Poetry Library      2
1800-1850 African-American Poetry     542
          American Poetry            1000
          English Poetry             1000
          Modern Poetry                 1
          The Faber Poetry Library      1
1850-1900 African-American Poetry    1000
          American Poetry            1000
          English Poetry             1000
          Modern Poetry               809
          The Faber Poetry Library

----

#### Author birth year distribution

author_dob
1600 ------- [ 1795   | 1886 |   1940 ] -------- 1974



----

#### Number of lines in poems

num_lines
10 ------- [ 16   | 25 |   40 ] -------- 100



----

#### Annotated rhyme distribution

rhyme
y      10336
n        253
y n        8
Name: count, dtype: int64



----

#### Metadata

Unnamed: 0_level_0,id,period_meta,subcorpus,author,author_dob,title,year,num_lines,volume,line,rhyme,genre,period,txt
id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,c20-english/ep20152/Z200586158,1900-1999 Twentieth-Century,English Poetry,"Rosenberg, Isaac, 1890-1918",1890.0,‘I KNOW YOU GOLDEN’,1920,12,,I know you golden,,,1850-1900,I know you golden\nAs summer and pale\nAs the ...
3,english/kerpeter/Z300410015,1660-1700 Restoration,English Poetry,"Ker, Patrick, fl. 1691",1691.0,On the Memory of a Married Maid.,1721,16,Flosculum Poeticum (1684),A Marrie'd&hyphen;Virgin to remain.,y,,1650-1700,"Within this Coffin here does lie,\nA Pattern o..."
7,american/am1258/Z200196105,1835-1869 Mid Nineteenth-Century,American Poetry,"Emerson, Ralph Waldo, 1803-1882",1803.0,SEPTEMBER,1833,16,Poems [1904],"&indent;Of a gusty Autumn day,",y,,1800-1850,In the turbulent beauty\n Of a gusty Autumn...
8,english/gilfilla/Z400379001,1800-1834 Early Nineteenth-Century,English Poetry,"Gilfillan, Robert, 1798-1850",1798.0,NORWEGIAN SMUGGLER'S SONG.,1828,36,Poems and Songs (1851),"&indent;The storm is loud and high,",y,,1750-1800,"Awake, you midnight mariners!\n The storm i..."
18,english/wattwill/Z300523577,1800-1834 Early Nineteenth-Century,English Poetry,"Watt, William, 1793-1859",1793.0,BAB AT THE BOWSTER.,1823,40,Poems and Songs (1860),Wi' touslet hair and drowsy een?,y,Ballad,1750-1800,"Lassie, whare were you yestreen,\nWi' touslet ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999109,faber/fa0401/Z300557474,1900-1999 Twentieth-Century,The Faber Poetry Library,"Boyle, Charles, 1951-",1951.0,(i) Underground,1981,18,,A woman sleeping on the underground:,,,1950-2000,"A woman sleeping on the underground:\nneat, As..."
999377,c20-african-american/da20076/Z300330273,1900-1999 Twentieth-Century,African-American Poetry,"Weaver, Michael S., 1951-",1951.0,Duke Ellington and His Mistress Make Love,1981,34,,I draw the sheets written with life,,,1950-2000,I draw the sheets written with life\naround me...
999379,african-american/hortonge/Z200399812,1835-1869 Mid Nineteenth-Century,African-American Poetry,"Horton, George Moses, 1798?-ca.1880",1798.0,THE POWERS OF LOVE.,1828,35,Naked Genius (1865),It lifts the poor man from his cell,y,,1750-1800,It lifts the poor man from his cell\n To fo...
999421,c20-african-american/da22011/Z300262800,1900-1999 Twentieth-Century,African-American Poetry,"Jackson, Angela, 1951-",1951.0,"george, after all, means farmer",1981,39,,he carried a tomato plant &,,,1950-2000,he carried a tomato plant &\nwatermelon\nacros...


In [13]:
# Obtain the replicated period-by-subcorpus sample and summarise it.
printm('### Replicating period/subcorpus sample')
df_smpl_by_period_subcorpus_replicated = get_chadwyck_corpus_sampled_by_period_subcorpus_as_replicated()
# Sanity check: only a lower bound is asserted here (more than 8000 rows), not an exact size.
assert len(df_smpl_by_period_subcorpus_replicated) > 8000
describe_corpus(df_smpl_by_period_subcorpus_replicated)

### Replicating period/subcorpus sample

* Generating period subcorpus sample


##### Loading Chadwyck-Healey corpus (metadata + txt)

* Loading corpus from memory


#### Sampling corpus by ['period', 'subcorpus'] (min 10, max 1000)

period     subcorpus               
1600-1650  American Poetry              361
           English Poetry              1000
1650-1700  American Poetry               74
           English Poetry              1000
1700-1750  American Poetry              340
           English Poetry              1000
1750-1800  African-American Poetry      284
           American Poetry             1000
           English Poetry              1000
1800-1850  African-American Poetry      542
           American Poetry             1000
           English Poetry              1000
1850-1900  African-American Poetry     1000
           American Poetry             1000
           English Poetry              1000
           Modern Poetry                809
           The Faber Poetry Library     863
1900-1950  African-American Poetry     1000
           American Poetry             1000
           English Poetry              1000
           Modern Poetry               1000
           The Faber Poetry Library    1

----

#### Subcorpus breakdown

subcorpus
English Poetry              8000
American Poetry             5775
African-American Poetry     3646
Modern Poetry               2809
The Faber Poetry Library    2479
Name: count, dtype: int64



----

#### Historical period breakdown (from metadata)

period_meta
1900-1999 Twentieth-Century                    12638
1835-1869 Mid Nineteenth-Century                2060
1870-1899 Later Nineteenth-Century              1953
1800-1834 Early Nineteenth-Century              1142
1750-1799 Later Eighteenth-Century              1118
1700-1749 Early Eighteenth-Century              1049
                                                 986
1603-1660 Jacobean and Caroline                  920
1660-1700 Restoration                            587
1550-1900 Miscellanies and Collections           180
1500-1700 Emblems, Epigrams, Formal Satires       58
1500-1580 Tudor                                   13
1860-1880 Mid-Victorian                            3
1880-1901 Late Victorian                           1
1500-1700 Songbooks                                1
Name: count, dtype: int64



----

#### Historical period breakdown (from author birth year)

period
1900-1950    5000
1850-1900    4672
1950-2000    4436
1800-1850    2542
1750-1800    2284
1600-1650    1361
1700-1750    1340
1650-1700    1074
Name: count, dtype: int64



----

#### Historical period + subcorpus breakdown

                                    count
period    subcorpus                      
1600-1650 American Poetry             361
          English Poetry             1000
1650-1700 American Poetry              74
          English Poetry             1000
1700-1750 American Poetry             340
          English Poetry             1000
1750-1800 African-American Poetry     284
          American Poetry            1000
          English Poetry             1000
1800-1850 African-American Poetry     542
          American Poetry            1000
          English Poetry             1000
1850-1900 African-American Poetry    1000
          American Poetry            1000
          English Poetry             1000
          Modern Poetry               809
          The Faber Poetry Library    863
1900-1950 African-American Poetry    1000
          American Poetry            1000
          English Poetry             1000
          Modern Poetry              1000
          The Faber Poetry Library

----

#### Author birth year distribution

author_dob
1600 ------- [ 1795   | 1886 |   1940 ] -------- 1974



----

#### Number of lines in poems

num_lines
10 ------- [ 16   | 25 |   40 ] -------- 100



----

#### Annotated rhyme distribution

rhyme
       12128
y      10274
n        291
y n       16
Name: count, dtype: int64



----

#### Metadata

Unnamed: 0_level_0,id,period_meta,subcorpus,author,author_dob,title,year,num_lines,volume,line,rhyme,genre,period,txt
id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6,c20-english/ep20106/Z200598332,1900-1999 Twentieth-Century,English Poetry,"Ní Chuilleanáin, Eiléan, 1942–",1942.0,Survivors,1972,51,,Where the loose wheel swings at the stern,,,1900-1950,Where the lose wheel swings at the stern\nOf N...
7,c20-english/ep20151/Z200600821,1900-1999 Twentieth-Century,English Poetry,"Thomas, R. S. (Ronald Stuart), 1913-",1913.0,Newts,1943,20,,In a pool,,,1900-1950,"In a pool\non the mountain\nnewts live, semi-\..."
15,american/am1183/Z300192343,1900-1999 Twentieth-Century,American Poetry,"Robinson, Edwin Arlington, 1869-1935",1869.0,THREE QUATRAINS,1899,12,Collected poems (1937),&indent;Will poets mock it with crowned words ...,y,,1850-1900,As long as Fame's imperious music rings\n W...
16,american/am0613/Z200167632,1835-1869 Mid Nineteenth-Century,American Poetry,"Gallagher, William D. (William Davis), 1808-1894",1808.0,OUR EARLY DAYS.,1838,58,"[Poems, in] Selections from the poetical liter...","We turn on Life's bewildering track,",y,,1800-1850,Our EARLY DAYS! -- How often back\nWe turn on ...
16,english/callanan/Z300294777,1800-1834 Early Nineteenth-Century,English Poetry,"Callanan, Jeremiah Joseph, 1795-1829",1795.0,"“SI JE DE PERDS, JE SUIS PERDU.”",1825,40,The Poems (1861),Shine on thou bright beacon,y,,1750-1800,Shine on thou bright beacon\n Unclouded and...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999566,faber/fa20301/Z300297127,1900-1999 Twentieth-Century,The Faber Poetry Library,"Joyce, James, 1882-1941",1882.0,44 Post Ulixem Scriptum (Air: Molly Brannigan),1912,25,,"Man dear, did you never hear of buxom Molly Bl...",,,1850-1900,"Man dear, did you never hear of buxom Molly Bl..."
999865,african-american/hortonge/Z200399915,1835-1869 Mid Nineteenth-Century,African-American Poetry,"Horton, George Moses, 1798?-ca.1880",1798.0,NEW FASHIONS.,1828,54,Naked Genius (1865),"There was a time when death was terror,",y,,1750-1800,"There was a time when death was terror,\n S..."
999874,faber/fa1201/Z300559934,1900-1999 Twentieth-Century,The Faber Poetry Library,"Hofmann, Michael, 1957-",1957.0,Fucking,1987,12,,"A zero sum game, our extravagant happiness,",,,1950-2000,"A zero sum game, our extravagant happiness,\nm..."
999878,american/am1218/Z200193650,1750-1799 Later Eighteenth-Century,American Poetry,"Hopkinson, Francis, 1737-1791",1737.0,"CHARITY, A POEM. Delivered by the author at a...",1767,74,The miscellaneous essays and occasional writin...,"Once more with joy, we swell th' advent'rous lay.",y,Ode,1700-1750,"To grace the pomp of this auspicious day,\nOnc..."


In [18]:
# Final tests for inequality between replicated and paper-originating samples.
# Each check fails if the replicated sample's index is identical to the paper's;
# the messages identify which sample pair tripped the assertion.
assert not df_smpl_by_period_in_paper.index.equals(df_smpl_by_period_replicated.index), \
    "Replicated period sample has the same index as the paper's period sample"
assert not df_smpl_by_period_subcorpus_in_paper.index.equals(df_smpl_by_period_subcorpus_replicated.index), \
    "Replicated period/subcorpus sample has the same index as the paper's period/subcorpus sample"
assert not df_smpl_by_rhyme_in_paper.index.equals(df_smpl_by_rhyme_replicated.index), \
    "Replicated rhyme sample has the same index as the paper's rhyme sample"