In [5]:
import sys
sys.path.append('..')
from osp import *

In [6]:
# Corpus metadata; its index supplies the document ids that later cells pass to count_tokens().
df = get_corpus_metadata()

In [7]:
# Sanity check: run dehyphenate on the first NLTK-tokenized sentence of a string that contains blank lines.
dehyphenate(nltk.sent_tokenize("Hello\n\n\nworld!")[0])

'Hello\n\n\nworld!'

In [8]:
# Take the first item yielded by the corpus iterator and unpack it into an id and its text.
# NOTE(review): binding `id` at notebook scope shadows the builtin id().
id,txt = next(iter_corpus_txt())

In [9]:
# Eyeball the raw text of the document fetched from iter_corpus_txt().
print(txt)

It is one of the relatively few deliberate jokes in the corpus, and its occurrence here is not without significance.
Aristotle in these chapters is arguing against those who believe in the existence of the void, or vacuum, or empty space; he says, 'even if we consider it on its own merits the so-called vacuum will be found to be really vacuous.
He seems to refuse to take the hypothesis of the void at all seriously.
Jack Macintosh made very useful comments on that occasion.
That paper lay asleep in a drawer for several years until Kant's remarks on density (in the Anticipations of Perception) sent me back to it.
Andrew Lugg has read several approximations to this final version, and has given me careful and generous comments on them.
It seems to me more likely that the motives are reasons, reasons having to do with physics, and probably reasons lying deep in his physical system.
I expect the problem lies in our imaginative difficulty in entering his viewpoint from our own, in shedding th

In [10]:
def tokenize(txt):
    """Split text into lowercase, whitespace-delimited tokens with edges trimmed.

    Each token has its leading and trailing non-alphabetic characters removed;
    non-alphabetic characters inside a token (e.g. the apostrophe in "don't")
    are kept. Tokens that end up empty are dropped.

    Args:
        txt: The text to tokenize.

    Returns:
        A list of cleaned token strings, in order of appearance.
    """
    words = []
    for raw in txt.lower().split():
        # The strip set is every non-letter character that occurs in this token.
        junk = "".join(ch for ch in raw if not ch.isalpha())
        core = raw.strip(junk)
        if core:
            words.append(core)
    return words

# HashStash-backed store of per-document token counts, keyed by document id.
STASH_COUNTS = HashStash('osp_counts')


def count_tokens(id, force=False):
    """Return the token counts for one document, using STASH_COUNTS as a cache.

    Args:
        id: Document id; used both as the cache key and as the argument to
            get_corpus_txt().
        force: When true, skip the cache lookup, recount, and overwrite the
            stored entry.

    Returns:
        A plain dict mapping each token (as produced by tokenize()) to its
        number of occurrences in the document.
    """
    if force or id not in STASH_COUNTS:
        counts = dict(Counter(tokenize(get_corpus_txt(id))))
        STASH_COUNTS[id] = counts
        return counts
    return STASH_COUNTS[id]
    


In [11]:
# Reference word list (tab-separated), re-indexed by its 'word' column.
# NOTE(review): read_csv's default NA parsing turns literal entries such as
# 'null' or 'nan' into NaN — confirm the word list is unaffected.
word_df = pd.read_csv('../data/raw/worddb.byu.txt', sep='\t')
word_df = word_df.set_index('word')
# Vocabulary used to filter corpus tokens in later cells.
ok_words = set(word_df.index)
len(ok_words)

86402

In [12]:
# Make sure every document in the metadata index has an entry in STASH_COUNTS;
# count_tokens() returns already-stored entries without recounting.
for id in tqdm(df.index):
    count_tokens(id)

100%|██████████| 58126/58126 [00:24<00:00, 2351.91it/s]


In [13]:
# !pip install bounter

In [14]:
# from bounter import bounter
# b = bounter(size_mb=16384, need_iteration=False, log_counting=8)

In [15]:
# Corpus-wide totals: one tally over everything, plus one per sub-corpus
# (ids starting with 'phil/' versus all other ids).
allcounts, philcounts, litcounts = Counter(), Counter(), Counter()
for k,d in tqdm(STASH_COUNTS.items(), total=len(STASH_COUNTS)):
    # Keep only tokens that appear in the reference word list.
    d = {w:c for w,c in d.items() if w in ok_words}
    allcounts.update(d)
    (philcounts if k.startswith('phil/') else litcounts).update(d)


100%|██████████| 58126/58126 [00:40<00:00, 1422.25it/s]


In [16]:
# Attach corpus counts to the word table by looking each index word up in the
# tallies built above (the table displayed further down shows 0.0 rather than
# NaN for words that never occur).
word_df['count'] = word_df.index.map(allcounts)
word_df['count_phil'] = word_df.index.map(philcounts)
word_df['count_lit'] = word_df.index.map(litcounts)

# Denominators: total occurrences of word-list words, overall and per sub-corpus.
sumcount = word_df['count'].sum()
sumphil = word_df['count_phil'].sum()
sumlit = word_df['count_lit'].sum()

# Frequency per million tokens, relative to each denominator.
word_df['fpm'] = word_df['count'] / sumcount * 1_000_000
word_df['fpm_phil'] = word_df['count_phil'] / sumphil * 1_000_000
word_df['fpm_lit'] = word_df['count_lit'] / sumlit * 1_000_000

# Philosophy-vs-literature contrast as a difference and as a ratio.
# The ratio is NaN where both frequencies are 0 (see the table output below);
# presumably inf where only fpm_lit is 0 — verify before ranking on it.
word_df['fpm_diff'] = word_df['fpm_phil'] - word_df['fpm_lit']
word_df['fpm_div'] = word_df['fpm_phil'] / word_df['fpm_lit']

In [17]:
# Number of words that are keys in both the philosophy and literature tallies.
len(philcounts.keys() & litcounts.keys())

59796

In [29]:
# Coarse word class = first character of the POS tag. The vectorized .str[0]
# yields NaN for a missing or empty tag instead of raising mid-loop.
word_df['pos0'] = word_df['pos'].str[0]
# Flag content words: coarse classes n, v, j, r (presumably noun, verb,
# adjective, adverb — confirm against the tag set). pos0 is already a single
# character, so it is tested directly; NaN rows come out False.
word_df['content_word'] = word_df['pos0'].isin({'n','v','j','r'})

In [30]:
# Non-content words seen more than once per million in both sub-corpora,
# ordered by the phil/lit frequency ratio, smallest first.
(
    word_df
    .query('fpm_phil > 1 & fpm_lit > 1 & content_word==False')
    [['fpm','fpm_phil','fpm_lit','fpm_diff','fpm_div','pos']]
    .sort_values('fpm_div', ascending=True)
    .head(25)
)

Unnamed: 0_level_0,fpm,fpm_phil,fpm_lit,fpm_diff,fpm_div,pos
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
thy,46.199326,3.65114,127.796152,-124.145013,0.02857,appge
thee,28.442756,2.438765,78.311934,-75.873169,0.031142,pp
ye,24.49071,2.424749,66.807727,-64.382978,0.036294,pp
twelfth,10.405212,1.415605,27.64504,-26.229435,0.051206,md
fifteenth,8.613434,1.303478,22.632109,-21.328631,0.057594,md
sixteenth,16.918258,2.634988,44.310013,-41.675026,0.059467,md
thou,51.699027,8.402527,134.730931,-126.328405,0.062365,pp
ya,5.527337,1.177335,13.869558,-12.692223,0.084886,pp
eighteen,4.900906,1.205367,11.988029,-10.782662,0.100548,mc
fourteenth,8.848346,2.403725,21.207522,-18.803797,0.113343,md


In [44]:
n_features = 100
# Derived contrast columns: log10 of the phil/lit ratio, its magnitude, and the
# magnitude of the per-million difference (vectorized; same values as before).
word_df['fpm_div_log'] = np.log10(word_df['fpm_div'])
word_df['fpm_div_log_abs'] = word_df['fpm_div_log'].abs()
word_df['fpm_diff_abs'] = word_df['fpm_diff'].abs()

ok_pos = {'n','v','j','r'}
# Shuffle the rows before sorting (presumably to randomize the order of ties);
# the fixed seed keeps the selection identical across kernel restarts.
feature_selection = word_df.sample(frac=1, random_state=0)

# Keep content-word classes only, then take the n_features largest contrasts.
feature_selection = feature_selection.query('pos0 in @ok_pos')
feature_selection = feature_selection.sort_values('fpm_diff_abs', ascending=False).iloc[:n_features]

WORDS_TO_FEATURE = feature_selection.index.tolist()
# NOTE(review): the length filter runs after the top-n cut, so fewer than
# n_features words can remain — confirm that is intended.
WORDS_TO_FEATURE = [w for w in WORDS_TO_FEATURE if len(w)>3]
WORDS_TO_FEATURE


['theory',
 'will',
 'would',
 'case',
 'does',
 'true',
 'then',
 'literary',
 'poem',
 'knowledge',
 'being',
 'belief',
 'view',
 'were',
 'argument',
 'been',
 'have',
 'text',
 'moral',
 'properties',
 'english',
 'reason',
 'truth',
 'work',
 'cases',
 'possible',
 'account',
 'different',
 'problem',
 'poetry',
 'question',
 'love',
 'objects',
 'given',
 'literature',
 'sense',
 'principle',
 'beliefs',
 'certain',
 'must',
 'causal',
 'fact',
 'claim',
 'object',
 'history',
 'should',
 'science',
 'novel',
 'story',
 'narrative',
 'life',
 'philosophy',
 'play',
 'poet',
 'epistemic',
 'might',
 'logical',
 'most',
 'conditions',
 'think',
 'press',
 'relevant',
 'writing',
 'things',
 'theories',
 'reasons',
 'only',
 'proposition',
 'death',
 'other',
 'particular',
 'book',
 'logic',
 'physical',
 'just',
 'concept',
 'relation',
 'property',
 'system',
 'reading',
 'poems',
 'experience',
 'example',
 'value',
 'works',
 'suppose']

In [46]:
# Corpus count and POS tag for a handful of connectives.
word_df.loc[['hence','thus','therefore', 'because'], ['count','pos']]

Unnamed: 0_level_0,count,pos
word,Unnamed: 1_level_1,Unnamed: 2_level_1
hence,66887,rr
thus,224437,rr
therefore,113948,rr
because,269776,cs


In [50]:
# Display the word table with all derived columns (the rendering below elides middle rows and columns).
word_df

Unnamed: 0_level_0,US_or_UK,fpm_BNC,fpm_COCA,fpm_COHA_1800s,fpm_COHA_1900-49,fpm_COHA_1950-89,fpm_SOAP,fpm_bnc_acad,fpm_bnc_fic,fpm_bnc_mag,...,fpm,fpm_phil,fpm_lit,fpm_diff,fpm_div,content_word,fpm_div_log,fpm_div_log_abs,fpm_diff_abs,pos0
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,,59717.97,54124.71,65266.92,63479.96,59363.87,21403.42,73581.36,52467.51,58832.77,...,70157.163798,65449.558448,79185.178506,-13735.620059,0.826538,False,-0.082737,0.082737,13735.620059,a
and,,25808.34,26636.86,33417.48,28577.44,26260.07,17677.41,26888.33,26803.23,26337.68,...,24934.743682,22059.512988,30448.720124,-8389.207136,0.724481,False,-0.139973,0.139973,8389.207136,c
of,,30086.53,25782.79,37182.86,32184.17,27505.53,10067.32,45030.26,21465.10,26809.46,...,48616.624278,47353.473178,51039.033520,-3685.560341,0.927789,False,-0.032551,0.032551,3685.560341,i
a,,20853.49,22240.78,20346.25,21357.82,22557.53,15632.54,20755.41,22470.99,23270.21,...,24075.924461,24496.357873,23269.637880,1226.719993,1.052718,False,0.022312,0.022312,1226.719993,a
in,,18307.46,17306.20,17775.94,17335.58,17055.21,6702.35,24739.06,13318.05,16743.76,...,24198.050975,23137.594312,26231.742766,-3094.148454,0.882046,False,-0.054509,0.054509,3094.148454,i
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
extratropical,,0.00,0.03,0.00,0.01,0.00,0.00,0.00,0.00,0.00,...,0.000000,0.000000,0.000000,0.000000,,True,,,0.000000,j
spathe,,0.00,0.03,0.01,0.03,0.01,0.00,0.00,0.00,0.00,...,0.000000,0.000000,0.000000,0.000000,,True,,,0.000000,n
ovules,,0.09,0.03,0.02,0.05,0.11,0.00,0.33,0.00,0.00,...,0.000000,0.000000,0.000000,0.000000,,True,,,0.000000,n
gweilos,,0.00,0.03,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000000,0.000000,0.000000,0.000000,,True,,,0.000000,n


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tqdm import tqdm
from osp import get_corpus_metadata, HashStash

# 1. Load Metadata
df = get_corpus_metadata()
# Keep only rows whose 'discipline' is Philosophy or Literature.
df_filtered = df[df['discipline'].isin(['Philosophy', 'Literature'])]
# Document ids (the metadata index) and their class labels, in the same row order.
ids = df_filtered.index.tolist()
y = df_filtered['discipline'].values

# 2. Load Features (Token Counts)
# Assumes the 'osp_counts' stash already holds per-document token counts
# (as produced in GetFeatures.ipynb) — TODO confirm before running.
STASH_COUNTS = HashStash('osp_counts')
features = []

print("Loading features...")
for doc_id in tqdm(ids):
    # Retrieve counts from stash; an id with no stored entry falls back to an
    # empty dict, i.e. a document with no features — presumably rare; verify.
    counts = STASH_COUNTS.get(doc_id, {})
    features.append(counts)

# 3. Vectorize features
# Turn the list of {token: count} dicts into a sparse document-by-token matrix;
# `v` is kept so column indices can be mapped back to token names later.
print("Vectorizing features...")
v = DictVectorizer(sparse=True)
X = v.fit_transform(features)

# 4. Initialize LOO and Logistic Regression
loo = LeaveOneOut()
# liblinear is often good for smaller/sparse datasets.
# Give the solver room to converge: with max_iter=1 it stops after a single
# iteration and the resulting coefficients are essentially unfitted.
clf = LogisticRegression(max_iter=1000, solver='liblinear')

# 5. Run LOO Classification
# cross_val_predict with LOO is equivalent to the manual loop: one fit per
# document, each predicting only the document that was held out.
# NOTE(review): that means len(ids) separate fits; the recorded run of this
# cell was interrupted before finishing — consider a k-fold splitter if the
# runtime stays impractical.
print("Running Leave-One-Out Classification...")
y_pred = cross_val_predict(clf, X, y, cv=loo, n_jobs=-1)

# 6. Report Results
# Compare the true labels `y` with the cross-validated predictions `y_pred`.
print("\n--- Classifier Results (Phil vs Lit) ---")
print(f"Accuracy: {accuracy_score(y, y_pred):.4f}")
print("\nConfusion Matrix:")
# Cross-tabulation with true labels as rows ('Actual') and predictions as columns ('Predicted').
print(pd.crosstab(pd.Series(y, name='Actual'), pd.Series(y_pred, name='Predicted')))
print("\nClassification Report:")
print(classification_report(y, y_pred))

# Optional: Inspect top features
clf.fit(X, y) # Fit on all data to see overall coefficients
feature_names = v.get_feature_names_out()
coefs = clf.coef_[0]

# In a binary fit, coef_[0] scores clf.classes_[1]: the largest positive weights
# favour classes_[1] and the most negative ones favour classes_[0]. Class labels
# are stored sorted, so read the direction from classes_ rather than assuming it
# (assuming it previously attached each list to the wrong discipline).
ranked = np.argsort(coefs)
favour_class0 = ranked[:10]        # most negative weight first
favour_class1 = ranked[::-1][:10]  # most positive weight first
if clf.classes_[1] == 'Philosophy':
    top_phil, top_lit = favour_class1, favour_class0
else:
    top_phil, top_lit = favour_class0, favour_class1

# Both lists are printed strongest feature first.
print("\nTop features for Literature:")
for i in top_lit:
    print(f"{feature_names[i]}: {coefs[i]:.4f}")

print("\nTop features for Philosophy:")
for i in top_phil:
    print(f"{feature_names[i]}: {coefs[i]:.4f}")

Loading features...


100%|██████████| 58126/58126 [00:21<00:00, 2734.49it/s]


Vectorizing features...
Running Leave-One-Out Classification...


KeyboardInterrupt: 

In [3]:
# !pip install scikit-learn