# MarkMark 

In [1]:
import lltk
import plotnine as p9

In [2]:
# load corpus
C=lltk.load('MarkMark')

In [3]:
# get some basic info
C.info()

[MarkMark]
id: markmark
desc: Mark Algee-Hewitt's and Mark McGurl's 20th Century Corpus
link: --
downloadable: metadata,freqs,txt


## Install

### From pre-compiled zips

Only metadata and 1-gram counts are made available via download.

In [4]:
C.download(parts=['metadata','freqs','txt'], force=False)  # change force to True to redownload

### Compile from sources

Compile metadata and 1-gram counts from ARTFL website. Does not work unless you have institutional access to ARTFL.

In [5]:
# C.compile(force=False)  # set force to True to overwrite existing meta/data

## Metadata

In [6]:
# C.fix_metadata()

In [7]:
C.metadata

Unnamed: 0,id,author,title,year,author_id,dob,dod,gender,genre,genre_confirmed,name_first,name_last,name_middle,name_title,nation,notes,num_words,ocr_accuracy,source,corpus
0,"Caldwell,_Erskine.Tobacco_Road","Caldwell, Erskine",Tobacco Road,1932,,1903,1987,M,Fiction,,Erskine,Caldwell,,,American,,69662,0.818840,,MarkMark
1,"Sinclair,_Upton.The_Jungle","Sinclair, Upton",The Jungle,1906,,1878,1968,M,Fiction,,Upton,Sinclair,Beall,,American,,156031,0.822785,,MarkMark
2,"Hemingway,_Ernest.In_Our_Time","Hemingway, Ernest Miller",In Our Time,1925,,1899,1961,M,Fiction,,Ernest,Hemingway,Miller,,American,,43243,0.812039,,MarkMark
3,"Hemingway,_Ernest.A_Farewell_to_Arms","Hemingway, Ernest Miller",A Farewell to Arms,1929,,1899,1961,M,Fiction,,Ernest,Hemingway,Miller,,American,,103460,0.746056,,MarkMark
4,"Hemingway,_Ernest.For_Whom_the_Bell_Tolls","Hemingway, Ernest Miller",For Whom the Bell Tolls,1940,,1899,1961,M,Fiction,,Ernest,Hemingway,Miller,,American,,191216,0.769099,,MarkMark
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,"Sukenick,_Ronald.Up","Sukenick, Ronald",Up,1968,,1932,2004,M,Fiction,,Ronald,Sukenick,,,American,,114213,0.777302,,MarkMark
351,"Bennett,_Arnold.The_Old_Wives'_Tale","Bennett, Enoch Arnold",The Old Wives' Tale,1908,,1867,1931,M,Fiction,,Enoch,Bennett,Arnold,,English,,222095,0.782219,,MarkMark
352,"Robbins,_Tom.Skinny_Legs_and_All","Robbins, Tom",Skinny Legs and All,1990,,1936,,M,Fiction,,Tom,Robbins,,,American,,179211,0.795755,,MarkMark
353,"Donleavy,_J.P..The_Ginger_Man","Donleavy, James Patrick",The Ginger Man,1955,,1926,,M,Fiction,,James,Donleavy,Patrick,,Irish American,,116047,0.780417,,MarkMark


In [8]:
# Min to max years
C.metadata.year.min(), C.metadata.year.max()

(1881, 2011)

In [9]:
# Distribution of texts by year
# C.plot_distro('year')

## Preprocess

### Document-Term Matrix (DTM)

In [10]:
# C.mfw(n=1000)

In [11]:
# Build a document term matrix with the top n words (defaults to 25000)
C.preprocess_dtm(num_proc=4)

Gathering frequencies [x4]: 100%|██████████| 355/355 [00:01<00:00, 179.74it/s]


In [12]:
# Load dtm with top n words (defaults to 25000)
C.dtm()

Unnamed: 0_level_0,the,and,to,of,a,he,i,in,was,it,...,distracts,gymnast,testifies,allegories,chancellors,anatomically,disseminating,tamales,personals,reddish-purple
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Caldwell,_Erskine.Tobacco_Road",3447,1896,2231,1112,1029,1234,966,877,936,954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Sinclair,_Upton.The_Jungle",8927,7275,4185,4358,4154,3321,607,2510,3055,2351,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Hemingway,_Ernest.In_Our_Time",3118,1346,802,741,823,1054,425,663,652,624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Hemingway,_Ernest.A_Farewell_to_Arms",6198,3166,1899,1130,1821,912,3859,1343,1447,1573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Hemingway,_Ernest.For_Whom_the_Bell_Tolls",11260,6456,3551,3666,2792,3926,2927,2421,2287,3228,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Sukenick,_Ronald.Up",4350,1968,2437,2172,2965,1395,3306,1430,963,1649,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Bennett,_Arnold.The_Old_Wives'_Tale",12775,6690,5605,6618,5162,2947,1927,3893,4397,2614,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Robbins,_Tom.Skinny_Legs_and_All",8677,3713,3799,3909,3871,1341,1147,2637,2240,2090,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
"Donleavy,_J.P..The_Ginger_Man",5353,3763,2460,2309,2822,484,3784,1732,579,1441,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
# !rm -r /home/ryan/lltk_data/corpora/artfl/data

In [14]:
C.dtm(tf=True)

Unnamed: 0_level_0,the,and,to,of,a,he,i,in,was,it,...,distracts,gymnast,testifies,allegories,chancellors,anatomically,disseminating,tamales,personals,reddish-purple
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Caldwell,_Erskine.Tobacco_Road",0.001234,0.001259,0.001738,0.000910,0.000886,0.001392,0.001183,0.001095,0.001300,0.001431,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Sinclair,_Upton.The_Jungle",0.003197,0.004829,0.003259,0.003568,0.003577,0.003747,0.000743,0.003134,0.004242,0.003527,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Hemingway,_Ernest.In_Our_Time",0.001116,0.000894,0.000625,0.000607,0.000709,0.001189,0.000520,0.000828,0.000905,0.000936,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Hemingway,_Ernest.A_Farewell_to_Arms",0.002219,0.002102,0.001479,0.000925,0.001568,0.001029,0.004725,0.001677,0.002009,0.002360,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Hemingway,_Ernest.For_Whom_the_Bell_Tolls",0.004032,0.004286,0.002766,0.003002,0.002404,0.004430,0.003584,0.003023,0.003176,0.004843,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Sukenick,_Ronald.Up",0.001558,0.001306,0.001898,0.001778,0.002553,0.001574,0.004048,0.001786,0.001337,0.002474,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Bennett,_Arnold.The_Old_Wives'_Tale",0.004574,0.004441,0.004365,0.005419,0.004445,0.003325,0.002359,0.004861,0.006106,0.003921,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Robbins,_Tom.Skinny_Legs_and_All",0.003107,0.002465,0.002959,0.003201,0.003333,0.001513,0.001404,0.003293,0.003111,0.003135,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.125,0.166667
"Donleavy,_J.P..The_Ginger_Man",0.001917,0.002498,0.001916,0.001891,0.002430,0.000546,0.004633,0.002163,0.000804,0.002162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.000000


In [15]:
dtm_tfidf = C.dtm(tf=True)
dtm_tfidf

Unnamed: 0_level_0,the,and,to,of,a,he,i,in,was,it,...,distracts,gymnast,testifies,allegories,chancellors,anatomically,disseminating,tamales,personals,reddish-purple
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Caldwell,_Erskine.Tobacco_Road",0.001234,0.001259,0.001738,0.000910,0.000886,0.001392,0.001183,0.001095,0.001300,0.001431,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Sinclair,_Upton.The_Jungle",0.003197,0.004829,0.003259,0.003568,0.003577,0.003747,0.000743,0.003134,0.004242,0.003527,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Hemingway,_Ernest.In_Our_Time",0.001116,0.000894,0.000625,0.000607,0.000709,0.001189,0.000520,0.000828,0.000905,0.000936,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Hemingway,_Ernest.A_Farewell_to_Arms",0.002219,0.002102,0.001479,0.000925,0.001568,0.001029,0.004725,0.001677,0.002009,0.002360,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Hemingway,_Ernest.For_Whom_the_Bell_Tolls",0.004032,0.004286,0.002766,0.003002,0.002404,0.004430,0.003584,0.003023,0.003176,0.004843,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Sukenick,_Ronald.Up",0.001558,0.001306,0.001898,0.001778,0.002553,0.001574,0.004048,0.001786,0.001337,0.002474,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Bennett,_Arnold.The_Old_Wives'_Tale",0.004574,0.004441,0.004365,0.005419,0.004445,0.003325,0.002359,0.004861,0.006106,0.003921,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000000
"Robbins,_Tom.Skinny_Legs_and_All",0.003107,0.002465,0.002959,0.003201,0.003333,0.001513,0.001404,0.003293,0.003111,0.003135,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.125,0.166667
"Donleavy,_J.P..The_Ginger_Man",0.001917,0.002498,0.001916,0.001891,0.002430,0.000546,0.004633,0.002163,0.000804,0.002162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.000000


In [16]:
dtm_tfidf.loc['Hemingway,_Ernest.A_Farewell_to_Arms'].sort_values(ascending=False).head(25)

barkley       0.735849
rinaldi       0.478049
gage          0.436364
ferguson      0.313433
catherine     0.196146
simmons       0.183099
sweet.        0.156863
lire          0.155340
milan         0.144330
rucksack      0.112903
cognac        0.108247
sim           0.108108
barman        0.107143
italians      0.096886
lake.         0.090909
austrian      0.089552
ambulances    0.086022
darling.      0.085714
dugout        0.078740
sergeants     0.076923
officer.      0.076923
attack.       0.075472
rowed         0.073913
mia           0.073171
probably.     0.073171
Name: Hemingway,_Ernest.A_Farewell_to_Arms, dtype: float64