# NLP Project analyzing works of Ancient Greek Poet Hesiod

Loading Text from Project Gutenberg

In [None]:
# import packages
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize


In [None]:
# Open File
# DO NOT RUN
f = open('hesoid.txt','r', encoding="utf8")
raw = f.read()
raw


In [3]:
# Tokenize 
tokens = nltk.word_tokenize(raw)
# Checking lenght
len(tokens)

100215

In [4]:
# List comprehension
tokens_without_sw = [word for word in tokens if not word in stopwords.words('english')]


In [5]:
# checking length to confirm stopwords were removed
len(tokens_without_sw)

62022

In [6]:
#Tokenize sentences
sentences = sent_tokenize(raw)
#sentences

PUT sentences in DF


In [7]:
from pandas import DataFrame
# Put tokenized sentences in dataframe

Hesiod = DataFrame (sentences,columns=['tokenized_sentences'])
Hesiod.sample(9)

Unnamed: 0,tokenized_sentences
765,There will be no favour for the man who keeps ...
578,"Γ Brussels, Bibl."
898,(ll 493-501) Pass by the smithy and its crowde...
3248,"17-19) Hail to you, lord!"
1343,The\nhot vapour lapped round the earthborn Tit...
583,"G Rome, Vatican, Regina 91 (16th cent.)."
3426,"Here, too, Achilles quarrels with Agamemnon\nb..."
1589,So she bare sons to the almighty Son of Cronos...
2180,"And he\nsays there were four of them, Argus, P..."


In [None]:
# new column for lenth of sentences
Hesiod["len"] = Hesiod['tokenized_sentences'].apply(len)

In [None]:
# Visulization of sentence lenght
import matplotlib as plt
import seaborn as sns

In [None]:
sns.distplot(Hesiod["len"], kde=False)

# Pre-Processing 
There are many feature engineering strategies for transforming text data into features. Some involve assigning each unique word-like term to a feature and counting the number of occurrences per training example. However, if we were to perform this strategy right now, we'd end up with an absurd number of features, a result of the myriad possible terms. The classifier would take too long to train and likely overfit. As a result, each NLP problem requires a tailored approach to determine which terms are relevant and meaningful, and this is where we begin our pre-processing.


can't get contractions installed

# Tokenization

Step 2: Tokenization

In this step, we construct the features. We will begin by breaking apart the corpus into a vocabulary of unique terms, and this is called tokanization.

We can tokenize individual terms and generate what's called a bag of words model. You may notice this model has a glaring pitfall: it fails to capture the innate structure of human language. We can also tokenize using nltk, which is the leading platform for building Python programs to work with human language data.

We will begin my installing and importing nltk, so we can use it!


In [8]:
from nltk.tokenize import word_tokenize

In [9]:
Hesiod['word_tokens'] = Hesiod['tokenized_sentences'].apply(word_tokenize)
Hesiod.head()

Unnamed: 0,tokenized_sentences,word_tokens
0,"\n\n\n\n\nHesiod, The Homeric Hymns, and Homer...","[Hesiod, ,, The, Homeric, Hymns, ,, and, Homer..."
1,_The Works and Days_\n II.,"[_The, Works, and, Days_, II, .]"
2,The Genealogical Poems\n Date of the Hesiodic ...,"[The, Genealogical, Poems, Date, of, the, Hesi..."
3,TO DIONYSUS\n II.,"[TO, DIONYSUS, II, .]"
4,TO DEMETER\n III.,"[TO, DEMETER, III, .]"


In [11]:
Hesiod['tokens_no_stopwords']= [word for word in tokens if not word in stopwords.words('english')]


ValueError: Length of values (62022) does not match length of index (3872)

In [12]:
# make lowercase
Hesiod['lower'] = Hesiod['word_tokens'].apply(lambda x: [word.lower() for word in x])
Hesiod.head()

Unnamed: 0,tokenized_sentences,word_tokens,lower
0,"\n\n\n\n\nHesiod, The Homeric Hymns, and Homer...","[Hesiod, ,, The, Homeric, Hymns, ,, and, Homer...","[hesiod, ,, the, homeric, hymns, ,, and, homer..."
1,_The Works and Days_\n II.,"[_The, Works, and, Days_, II, .]","[_the, works, and, days_, ii, .]"
2,The Genealogical Poems\n Date of the Hesiodic ...,"[The, Genealogical, Poems, Date, of, the, Hesi...","[the, genealogical, poems, date, of, the, hesi..."
3,TO DIONYSUS\n II.,"[TO, DIONYSUS, II, .]","[to, dionysus, ii, .]"
4,TO DEMETER\n III.,"[TO, DEMETER, III, .]","[to, demeter, iii, .]"


In [16]:
# take out punctionation
import string
punc = string.punctuation
Hesiod['no_punc'] = Hesiod['lower'].apply(lambda x: [word for word in x if word not in punc])
Hesiod.head()

Unnamed: 0,tokenized_sentences,word_tokens,lower,no_punc
0,"\n\n\n\n\nHesiod, The Homeric Hymns, and Homer...","[Hesiod, ,, The, Homeric, Hymns, ,, and, Homer...","[hesiod, ,, the, homeric, hymns, ,, and, homer...","[hesiod, the, homeric, hymns, and, homerica, b..."
1,_The Works and Days_\n II.,"[_The, Works, and, Days_, II, .]","[_the, works, and, days_, ii, .]","[_the, works, and, days_, ii]"
2,The Genealogical Poems\n Date of the Hesiodic ...,"[The, Genealogical, Poems, Date, of, the, Hesi...","[the, genealogical, poems, date, of, the, hesi...","[the, genealogical, poems, date, of, the, hesi..."
3,TO DIONYSUS\n II.,"[TO, DIONYSUS, II, .]","[to, dionysus, ii, .]","[to, dionysus, ii]"
4,TO DEMETER\n III.,"[TO, DEMETER, III, .]","[to, demeter, iii, .]","[to, demeter, iii]"


In [19]:
# take outstop words
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

Hesiod['stopwords_removed'] = Hesiod['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])
Hesiod.head()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lizba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,tokenized_sentences,word_tokens,lower,no_punc,stopwords_removed
0,"\n\n\n\n\nHesiod, The Homeric Hymns, and Homer...","[Hesiod, ,, The, Homeric, Hymns, ,, and, Homer...","[hesiod, ,, the, homeric, hymns, ,, and, homer...","[hesiod, the, homeric, hymns, and, homerica, b...","[hesiod, homeric, hymns, homerica, homer, hesi..."
1,_The Works and Days_\n II.,"[_The, Works, and, Days_, II, .]","[_the, works, and, days_, ii, .]","[_the, works, and, days_, ii]","[_the, works, days_, ii]"
2,The Genealogical Poems\n Date of the Hesiodic ...,"[The, Genealogical, Poems, Date, of, the, Hesi...","[the, genealogical, poems, date, of, the, hesi...","[the, genealogical, poems, date, of, the, hesi...","[genealogical, poems, date, hesiodic, poems, l..."
3,TO DIONYSUS\n II.,"[TO, DIONYSUS, II, .]","[to, dionysus, ii, .]","[to, dionysus, ii]","[dionysus, ii]"
4,TO DEMETER\n III.,"[TO, DEMETER, III, .]","[to, demeter, iii, .]","[to, demeter, iii]","[demeter, iii]"


In [None]:
url = 'http://www.gutenberg.org/files/348/348-0.txt'

In [None]:
response = request.urlopen(url)
text = response
text.words
#url = 'http://www.gutenberg.org/files/348/348-0.txt'