In [191]:
import warnings
import pdfminer
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import sklearn
import random
import string
import seaborn as sns
import math
from matplotlib import pyplot as plt
import re
import scipy

import requests
from bs4 import BeautifulSoup

from collections import Counter
import spacy

from nltk.corpus import gutenberg
import nltk

from chatterbot import ChatBot
from chatterbot.trainers import ListTrainer, ChatterBotCorpusTrainer
from chatterbot.conversation import Statement
import markovify

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

**Using pdfminer to convert book pdf to text**

In [45]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path):

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [46]:
%%time
path = '/Users/kristiancanler/Desktop/Heidegger-Martin-Being-and-Time-trans.-Macquarrie-Robinson-Blackwell-1962.pdf'
zus = convert_pdf_to_txt(path)

CPU times: user 7min 12s, sys: 3.54 s, total: 7min 16s
Wall time: 7min 29s


In [49]:
# Python code to create a file 
file = open('/Users/kristiancanler/Desktop/zus.txt','w') 
file.write(zus) 
file.close() 

**Cleaning text**

In [None]:
# Opening and reading text file
zus = open('/Users/kristiancanler/Desktop/zus.txt','r').read()

In [95]:
# utility function for standard text cleaning
def text_cleaner(text):
    # visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  better get rid of it now!
    text = re.sub(r'--',' ', text)
    text = re.sub(r'---',' ', text)
    text = re.sub(r"[\[].*?[\]]", "", text)
    text = re.sub(r"(, )+", " ", text)
    text = re.sub(r", +", "", text)
    text = re.sub(r"H.", "", text)
    text = re.sub(r"[\w\d]+�[\d\w]+", "", text)
    text = re.sub(r"\d", "", text)
    text = re.sub(r"\n", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text

# Cleaning with function
zus_cleaned = text_cleaner(zus)
zus_cleaned



In [6]:
# parse the cleaned novels. this can take a bit.
nlp = spacy.load('en_core_web_sm')

In [8]:
%%time
# tokenize persuasion
zus_doc = nlp(zus[:1000000])

CPU times: user 53.2 s, sys: 31.1 s, total: 1min 24s
Wall time: 1min 59s


In [9]:
%%time
# tokenize persuasion
zus_doc2 = nlp(zus[1000000:])

CPU times: user 48.1 s, sys: 23.6 s, total: 1min 11s
Wall time: 1min 32s


In [118]:
zus_tokens = [token for token in zus_doc]
zus_tokens.extend([token for token in zus_doc2])

In [50]:
# removing stopwords
zus_no_stops = [token for token in zus_doc
                if not token.is_stop]
zus2_no_stops = [token for token in zus_doc2
                 if not token.is_stop]

In [51]:
# combining token lists
zus_no_stops.extend(zus2_no_stops)

In [60]:
# utility function to calculate how frequently words appear in the text
def word_frequencies(text):
    
    # build a list of words
    # strip out punctuation
    words = []
    for token in text:
        if not token.is_punct:
            words.append(token.text)
            
    # build and return a Counter object containing word counts
    return Counter(words)

# instantiate our list of most common words.
word_freq = word_frequencies(zus_no_stops).most_common(10)
word_freq

[('\n\n', 43323),
 ('\n', 36369),
 (' ', 10712),
 ('H.', 4524),
 ('Dasein', 2563),
 ('world', 1398),
 ('1', 1344),
 ('way', 987),
 ('hand', 976),
 ('time', 935)]

In [121]:
# Using sentences with more than one character
zus_sentences = pd.Series([sent.text for sent in zus_doc.sents
                              if len(sent.text) > 4])
zus_sentences2 = pd.Series([sent.text for sent in zus_doc2.sents
                              if len(sent.text) > 4])

In [122]:
zus_sentences = pd.concat([zus_sentences, zus_sentences2])

index 9010 is the breakpoint between division one and division two.

In [162]:
zus_df = pd.DataFrame({'sentences': zus_sentences})
zus_df['divisionvar'] = np.nan
zus_df.iloc[:9009, 1] = 'one'
zus_df.iloc[9009:, 1] = 'two'

In [138]:
zus_df.shape

(22339, 2)

**Experimenting with Markov Chains**

In [84]:
# Generating transition probabilities with transition state size of 3 words
zus_generator = markovify.Text(zus_sentences, state_size = 3)

In [85]:
# three randomly generated sentences
for i in range(3):
    print(zus_generator.make_sentence())

print('\n')

# three randomly-generated sentences of no more than 100 characters
for i in range(3):
    print(zus_generator.make_short_sentence(100))

None
What is decisive out of the province and free them.
The possibility broken off lies in its existen to Dasein.


which is as a basic state-of-mind, belongs to Dasein's ontological man.
I is to On the other in the sense that it leaps ahead, is evidence enough.
is not the sort of thing is not to get it in the right way.


**Vectorizing text with tf-idf**

In [164]:
indexed_zus_df = zus_df.reset_index()

In [169]:
indexed_zus_df.columns

Index(['index', 'sentences', 'divisionvar'], dtype='object')

In [165]:
%%time
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=4,
    use_idf=True,
    norm='l2',
    smooth_idf=True)

# applying the vectorizer
X = vectorizer.fit_transform(zus_df.sentences)

tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
tfidf_data = pd.concat((tfidf_df, indexed_zus_df), axis=1)

CPU times: user 5.47 s, sys: 1.46 s, total: 6.93 s
Wall time: 7.08 s


**Exploring features**

In [182]:
tfidf_data.shape

(22339, 4768)

**Fitting logistic regression and random forest**

In [171]:
y = tfidf_data.divisionvar

X = np.array(tfidf_data.drop(['index', 'sentences', 'divisionvar'], 1))

# We split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [175]:
%%time
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

CPU times: user 106 µs, sys: 148 µs, total: 254 µs
Wall time: 305 µs


In [176]:
%%time
lr.fit(X_train, y_train)

CPU times: user 550 ms, sys: 527 ms, total: 1.08 s
Wall time: 1.22 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [177]:
%%time
rfc.fit(X_train, y_train)

CPU times: user 28.6 s, sys: 560 ms, total: 29.2 s
Wall time: 29.7 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [181]:
print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('Test set score:', lr.score(X_test, y_test))

print("\n----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('Test set score:', rfc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.8198503549274158
Test set score: 0.759773202029245

----------------------Random Forest Scores----------------------
Training set score: 0.9686001151115943
Test set score: 0.705162638018502


The scores for logisitic regression are closer, and the test set score is higher.