In [1]:
import pandas as pd
import numpy as numpy
import re
import os

# Project Gutenberg - Nietzsche's English Text

In [11]:
# This function reads single sentences from a book file
def read_book(book_dir):
    book_name = book_dir[(book_dir.rfind('/')+1):-4]
    
    book = open(book_dir).read()
    df = pd.DataFrame({'text':[], 'book':[]})
    paragraphs = [t for t in book.split('\n\n') if len(t) > 40]
    
    for p in paragraphs:
        p = re.sub(r'\n', ' ', p)
        s = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', p)
        for sentence in s:
            if len(sentence) > 0:
                sentence = sentence.strip()
                sentence = re.sub(r'[ \t]{3,}', ' ', sentence)
                sentence = sentence.replace('--', '')
                sentence = re.sub(r'[_\"\“\”—]+', '', sentence)
                sentence = sentence.replace('.', '. ')
                df = df.append({'text':sentence, 'book':book_name}, ignore_index=True)
            
    return df

In [12]:
# Reading all files downloaded from Project Gutenberg
file_location = 'data/Project_Gutenberg/'
all_files = os.listdir(file_location)
txt_files = list(filter(lambda x: x[-4:] == '.txt', all_files))
len(txt_files)

19

In [13]:
# Storing all Nietzsche's work in one dataframe
books_df = pd.DataFrame({'text':[], 'book':[]})

for f in txt_files:
    f = file_location + f
    df = read_book(f)
    books_df = books_df.append(df, ignore_index=True)

In [14]:
# filtering out titles/sentences that are too short
books_df = books_df[books_df.text.apply(lambda x: True if len(x)>20 else False)]
books_df.reset_index(inplace=True, drop=True)

In [15]:
# Checking if there's weird punctuations
books_df.sample(10)

Unnamed: 0,text,book
25014,The neglect and the surrender of Life and of w...,will1-2
17397,"If I should, perhaps, become more garrulous th...",thoughts1
2274,In the art of spurring exhausted creatures bac...,contra_wagner
260,"Originally, and above all in the time of the m...",antichrist
18879,Why not a poet or orator?,thoughts2
10481,But so long as its influence lasts it should e...,human1
20897,"For the words, it is argued, are as much noble...",tragedy
6443,They are recognised by the fact that any such ...,eccehomo
23837,Just as the sharp demarcation of nations gener...,will1-2
12939,A KIND OF REST AND CONTEMPLATION. Beware lest ...,human2


In [16]:
# matching books to their published year
book_year = {'antichrist':1895, 'beyond':1886, 'contra_wagner':1889,
            'dawn':1881, 'early_greek':1871, 'eccehomo':1888, 'eduinst':1872, 'homer': 1869,
            'human1':1878, 'human2':1880, 'thoughts1': 1874, 'thoughts2':1876, 'twilight':1888, 
            'will1-2': 1901, 'will3-4': 1901, 'zarathustra': 1883, 'tragedy':1872,
            'joyful':1887, 'morals':1887}

books_df['year'] = books_df.book.apply(lambda x: book_year[x])

In [17]:
# Label: whether the book is published after Nietzsche's resignation from university
books_df['after_1879'] = books_df.year.apply(lambda x: 1 if x > 1879 else 0)

In [18]:
# Label: if the book is published by Nietzsche's sister
elisabeth_published = ['antichrist', 'eccehomo', 'will1-2', 'will3-4']
books_df['elisabeth_published'] = books_df.book.apply(lambda x: 1 if x in elisabeth_published else 0)

In [19]:
books_df.to_csv('data/nietzsche_gutenberg.csv', index=False)

In [20]:
books_df.shape

(34241, 5)

# Other Philosophers

In [22]:
# I'd also like to compare Nietzsche to his contemporaries. 
# A compilation of renowned Western philosopher can be found in this Kaggle dataset:
# https://www.kaggle.com/kouroshalizadeh/history-of-philosophy
phil_df = pd.read_csv('data/phil_nlp.csv')

In [30]:
phil_df.columns

Index(['title', 'author', 'school', 'sentence_spacy', 'sentence_str',
       'sentence_length', 'sentence_lowered', 'tokenized_txt',
       'lemmatized_str'],
      dtype='object')

In [32]:
# A view of schools and authors available in this dataset
phil_df.groupby(['school', 'author'])['sentence_str'].count()

school           author         
analytic         Kripke             12480
                 Lewis              13120
                 Moore               3668
                 Popper              4678
                 Quine               7373
                 Russell             5073
                 Wittgenstein        9035
aristotle        Aristotle          48779
capitalism       Keynes              3411
                 Ricardo             3090
                 Smith              11693
communism        Lenin               4469
                 Marx               13490
continental      Deleuze            12540
                 Derrida             6001
                 Foucault           15240
empiricism       Berkeley            2734
                 Hume                8312
                 Locke               8885
german_idealism  Fichte              5308
                 Hegel              22701
                 Kant               14128
phenomenology    Heidegger          15239
 

In [44]:
# Extract modern philosophers from the dataframe
exclude_list = ['aristotle', 'plato', 'stoicism']

modern_df = phil_df[phil_df.school.apply(lambda x: True if x not in exclude_list else False)]

modern_df.drop('sentence_spacy', axis=1, inplace=True)
modern_df.reset_index(inplace=True, drop=True)
modern_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,title,author,school,sentence_str,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
0,Second Treatise On Government,Locke,empiricism,"Reader, thou hast here the beginning and end o...",237,"reader, thou hast here the beginning and end o...","['reader', 'thou', 'hast', 'here', 'the', 'beg...","reader , thou hast here the beginning and end..."
1,Second Treatise On Government,Locke,empiricism,"These, which remain, I hope are sufficient to ...",488,"these, which remain, i hope are sufficient to ...","['these', 'which', 'remain', 'hope', 'are', 's...","these , which remain , -PRON- hope be suffici..."
2,Second Treatise On Government,Locke,empiricism,"If these papers have that evidence, I flatter ...",444,"if these papers have that evidence, i flatter ...","['if', 'these', 'papers', 'have', 'that', 'evi...","if these paper have that evidence , -PRON- fl..."
3,Second Treatise On Government,Locke,empiricism,"The king, and body of the nation, have since s...",721,"the king, and body of the nation, have since s...","['the', 'king', 'and', 'body', 'of', 'the', 'n...","the king , and body of the nation , have sinc..."
4,Second Treatise On Government,Locke,empiricism,If he think it not worth while to examine his ...,263,if he think it not worth while to examine his ...,"['if', 'he', 'think', 'it', 'not', 'worth', 'w...",if -PRON- think -PRON- not worth while to exa...


In [45]:
modern_df.to_csv('data/modern_phil.csv', index=False)