In [33]:
import pdfplumber
import csv
import nltk
nltk.download('punkt')

# USE THIS FUNCTION TO EXTRACT SENTENCES FROM A PDF AND SAVE THEM TO A CSV FILE
def extract_sentences_to_csv(pdf_path, csv_path, start_page=0, end_page=None):
    """Split the text of a PDF into sentences and append them to a CSV file.

    Every sentence is written as its own single-column row. The CSV is opened
    in append mode, so calling this again with the same ``csv_path`` adds
    more rows to the existing file rather than replacing it.

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV file the rows are appended to.
        start_page: Zero-based index of the first page to process.
        end_page: Zero-based index one past the last page to process.
            ``None`` means "through the last page"; values larger than the
            page count are clamped to it.
    """
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        # Test for None explicitly: `end_page or total_pages` would turn an
        # explicit end_page=0 (an empty range) into "all pages".
        if end_page is None:
            end_page = total_pages
        last_page = min(end_page, total_pages)

        with open(csv_path, mode='a', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)

            for page_number in range(start_page, last_page):
                text = pdf.pages[page_number].extract_text()
                if not text:
                    # Pages without extractable text contribute no rows.
                    continue
                # One CSV row per sentence found on the page.
                csv_writer.writerows(
                    [sentence.strip()] for sentence in nltk.sent_tokenize(text)
                )


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samdisorbo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
# Run the extraction: each sentence found in demos.pdf becomes a row in demos.csv
extract_sentences_to_csv(pdf_path='demos.pdf', csv_path='demos.csv')

In [1]:
# Load the extracted sentences (one per row, no header row) into a DataFrame.
# dtype=str together with keep_default_na=False keeps every row as literal
# text, so a sentence such as "NA" or "null" is not silently parsed as NaN.
import pandas as pd
df = pd.read_csv('demos.csv', header=None, dtype=str, keep_default_na=False)

In [3]:
# Clean each sentence: strip Roman numerals, lowercase the text, tokenize it into words, and lemmatize each word. (Stop-word, punctuation and non-alphabetic filtering are not applied.)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

#get words
from nltk.corpus import words

#cleaning function
def clean_paragraph(paragraph):
    """Normalize one sentence/paragraph of text.

    Steps, in order: remove upper-case Roman numerals, lowercase the text,
    tokenize it with ``word_tokenize``, lemmatize every token with
    ``WordNetLemmatizer``, and join the tokens back with single spaces.

    Stop words, punctuation and non-alphabetic characters are NOT removed.

    Args:
        paragraph: Text to clean. Non-string values (for example the float
            NaN pandas uses for a missing cell) and blank strings yield an
            empty string instead of raising.

    Returns:
        The cleaned text as one space-separated string.
    """
    # A missing cell arrives as float NaN, on which re.sub raises TypeError;
    # treat anything that is not usable text as "nothing to clean".
    if not isinstance(paragraph, str) or not paragraph.strip():
        return ""

    # Remove Roman numerals. This must happen before lowercasing because the
    # pattern only matches upper-case letters.
    # NOTE: a standalone upper-case "I" is a valid numeral and is removed too.
    paragraph = re.sub(r'\b(?=[MDCLXVI]+\b)M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\b', '', paragraph)

    # Lowercase, then split into word/punctuation tokens.
    tokens = word_tokenize(paragraph.lower())

    # Reduce each token to its lemma and rebuild a single string.
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)



In [4]:

# Tunable settings for this cleaning step.
HEADER_PHRASES = ['the orations of demosthenes', 'the first philippic']
MIN_WORDS = 5        # keep only rows with MORE than this many whitespace-separated tokens
ROWS_TO_SKIP = 700   # number of leading rows discarded after filtering

# NOTE: this cell rebinds `df` and renames column 0 to 'text', so it cannot be
# re-run on its own output; reload the CSV before running it again.

# Drop rows that consist only of one of the header phrases. The comparison is
# case-insensitive because the text has not been lowercased yet at this point.
is_header = df[0].str.strip().str.lower().isin(HEADER_PHRASES)
# Take an explicit copy so the column assignment below does not write into a
# filtered slice (which raises SettingWithCopyWarning).
df = df[~is_header].copy()

# Apply the clean_paragraph function to the column in df
df[0] = df[0].apply(clean_paragraph)

# Remove rows whose cleaned text has MIN_WORDS tokens or fewer
df = df[df[0].str.split().str.len().gt(MIN_WORDS)]

# Remove the first ROWS_TO_SKIP rows
df = df.iloc[ROWS_TO_SKIP:]

# Rename the column with the sentences to 'text'
df = df.rename(columns={0: 'text'})

In [128]:
from gensim.models import Word2Vec

# One list of whitespace-separated tokens per row of the 'text' column.
corpus = list(map(str.split, df['text']))

# Train a word2vec model on the tokenized sentences.
model = Word2Vec(
    sentences=corpus,
    vector_size=300,
    window=10,
    min_count=5,
    epochs=150,
    workers=4,
)

# Write the trained model to disk, then read it back from the same file.
model.save('demos.model')
model = Word2Vec.load('demos.model')


In [135]:

# Show the ten words the model ranks as most similar to 'war'
model.wv.most_similar('war', topn=10)

[('storm', 0.22965008020401),
 ('aggression', 0.22426503896713257),
 ('attempting', 0.21382057666778564),
 ('surely', 0.2056763619184494),
 ('mount', 0.20268258452415466),
 ('mountain', 0.1963820904493332),
 ('destroy', 0.1940629482269287),
 ('persuaded', 0.1903166025876999),
 ('prevail', 0.18888278305530548),
 ('longer', 0.1868620067834854)]

In [149]:
# Load the small English spaCy pipeline used for dependency parsing.
import spacy
nlp = spacy.load('en_core_web_sm')

# For every sentence, record the text of the LAST token carrying each
# dependency label of interest; '' means no token with that label was found.
subjects, actions, objects = [], [], []
for doc in nlp.pipe(df['text']):
    found = {'nsubj': '', 'ROOT': '', 'dobj': ''}
    for token in doc:
        if token.dep_ in found:
            found[token.dep_] = token.text
    subjects.append(found['nsubj'])
    actions.append(found['ROOT'])
    objects.append(found['dobj'])

# Attach all three columns in one step. Writing through
# `df['subject'].iloc[i] = ...` is chained assignment: pandas warns about it
# and, with Copy-on-Write enabled, the value never reaches `df`.
df = df.assign(subject=subjects, action=actions, object=objects)


In [152]:
# Split the data into training and test sets (15% test) and save them as CSV files.
from sklearn.model_selection import train_test_split

# Drop rows whose text contains one word or fewer (the filter counts words
# per row; it does not remove one-letter words from within a row).
df = df[df['text'].str.split().str.len().gt(1)]

# Fixed random_state so the same rows land in train.csv / test.csv on every run.
train, test = train_test_split(df, test_size=0.15, random_state=42)

train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)