In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
import glob
import pandas as pd

# View Wikipedia Data

### Load data

In [3]:
def get_wiki_urls(data_dir):
    url_list = glob.glob(os.path.join(data_dir, '*'))
    return dict(zip(range(len(url_list)), url_list))

def get_wikitext(df, by):
    df = df[df['tag']=='p']
    return list(map(lambda x: ' '.join(x[1]['data']), df.groupby(by)))

In [4]:
wiki_urls = get_wiki_urls('data/wiki')

In [5]:
wiki_urls

{0: 'data/wiki/Instrumentalism.csv',
 1: 'data/wiki/Baal.csv',
 2: 'data/wiki/Agnosticism.csv',
 3: 'data/wiki/Fly.csv'}

In [6]:
wiki = pd.read_csv(wiki_urls[0])

In [7]:
wiki.head(5)

Unnamed: 0,nword,data,tag,section,subsection,paragraph
0,1,Instrumentalism,h1,0,0,0
1,36,"In philosophy of science and in epistemology, ...",p,0,0,1
2,21,"According to instrumentalists, a successful sc...",p,0,0,2
3,40,Scientific theory is merely a tool whereby hum...,p,0,0,2
4,11,Instrumentalism is a perspective originally in...,p,0,0,2


### Text by paragraph

In [8]:
get_wikitext(wiki, 'paragraph')[:3]

['In philosophy of science and in epistemology, instrumentalism is a methodological view that ideas are useful instruments, and that the worth of an idea is based on how effective it is in explaining and predicting phenomena.',
 "According to instrumentalists, a successful scientific theory reveals nothing known either true or false about nature's unobservable objects, properties or processes. Scientific theory is merely a tool whereby humans predict observations in a particular domain of nature by formulating laws, which state or summarize regularities, while theories themselves do not reveal supposedly hidden aspects of nature that somehow explain these laws. Instrumentalism is a perspective originally introduced by Pierre Duhem in 1906.",
 "Rejecting scientific realism's ambitions to uncover metaphysical truth about nature, instrumentalism is usually categorized as an antirealism, although its mere lack of commitment to scientific theory's realism can be termed nonrealism. Instrumen

### Text by section

In [9]:
get_wikitext(wiki, 'subsection')[:3]

["In philosophy of science and in epistemology, instrumentalism is a methodological view that ideas are useful instruments, and that the worth of an idea is based on how effective it is in explaining and predicting phenomena. According to instrumentalists, a successful scientific theory reveals nothing known either true or false about nature's unobservable objects, properties or processes. Scientific theory is merely a tool whereby humans predict observations in a particular domain of nature by formulating laws, which state or summarize regularities, while theories themselves do not reveal supposedly hidden aspects of nature that somehow explain these laws. Instrumentalism is a perspective originally introduced by Pierre Duhem in 1906. Rejecting scientific realism's ambitions to uncover metaphysical truth about nature, instrumentalism is usually categorized as an antirealism, although its mere lack of commitment to scientific theory's realism can be termed nonrealism. Instrumentalism m

# View TM Data

### TM data by lines

In [10]:
tm_lines = pd.read_csv('data/tm/rib/lines.tsv', sep='\t').fillna('')

In [11]:
tm_lines.head(5)

Unnamed: 0,prefix,c0,content,size,face,marker,line,page,x0,y0,...,spacing,chunk,group,type,section,subsection,subsubsection,title,section_tag,image_url
0,,TM 5-1940-328-10,TM 5-1940-328-10,24.0,1,-1,0,1,30.81896,9.038821,...,-99.0,0,0,0,0,0,0,,0,
1,,OPERATOR MANUAL,OPERATOR MANUAL,14.0,1,-1,1,1,35.302621,17.640046,...,5.2,1,0,0,0,0,0,START,0,
2,,FOR,FOR,14.0,1,-1,2,1,44.642815,20.821866,...,1.2,2,0,0,0,0,0,,0,
3,,RIGID INFLATABLE BOAT,RIGID INFLATABLE BOAT (RIB),18.0,1,-1,3,1,25.986118,24.010614,...,1.2,3,1,1,0,0,0,,0,
4,,P/N NSW8MTR-OPEN-001,P/N NSW8MTR-OPEN-001,18.0,1,-1,4,1,30.310952,26.737884,...,0.2,3,1,1,0,0,0,,0,


### TM data by "chunks"

In [12]:
tm_chunks = pd.read_csv('data/tm/rib/chunks.tsv', sep='\t').fillna('')

In [13]:
tm_chunks.head(5)

Unnamed: 0,prefix,content,marker,group,section,subsection,subsubsection,title,section_tag,image_url,context_mask,chunk_summary,group_summary,section_summary
0,,TM 5-1940-328-10,-1,0,0,0,0,,0,,1,TM 5-1940-328-10,,COPYRIGHT RELEASE - the contractor has obtaine...
1,,OPERATOR MANUAL,-1,0,0,0,0,START,0,,0,OPERATOR MANUAL,,
2,,FOR,-1,0,0,0,0,,0,,1,FOR,,
3,,RIGID INFLATABLE BOAT (RIB) P/N NSW8MTR-OPEN-0...,-1,1,0,0,0,,0,,1,RIGID INFLATABLE BOAT (RIB) P/N NSW8MTR-OPEN-0...,,
4,,DISTRIBUTION STATEMENT C - Distribution author...,-1,2,0,0,0,,0,,0,DISTRIBUTION STATEMENT C - Distribution author...,DISTRIBUTION STATEMENT C - Distribution author...,
