In [1]:
import pandas as pd
import nltk
import numpy as np
import pickle

# Getting the labeled vocabulary

In [2]:
labeled_words = pd.read_csv('nrc/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', delimiter="\t", header=None)
labeled_words.columns = ['word', 'emotion', 'label']
labeled_words['word'] = labeled_words['word'].fillna(value='unk')
labeled_words.head()

Unnamed: 0,word,emotion,label
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0


In [3]:
labeled_words = labeled_words.pivot(index='word', columns='emotion')
labeled_words

Unnamed: 0_level_0,label,label,label,label,label,label,label,label,label,label
emotion,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
word,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
aback,0,0,0,0,0,0,0,0,0,0
abacus,0,0,0,0,0,0,0,0,0,1
abandon,0,0,0,1,0,1,0,1,0,0
abandoned,1,0,0,1,0,1,0,1,0,0
abandonment,1,0,0,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...
zone,0,0,0,0,0,0,0,0,0,0
zoo,0,0,0,0,0,0,0,0,0,0
zoological,0,0,0,0,0,0,0,0,0,0
zoology,0,0,0,0,0,0,0,0,0,0


In [4]:
vocab = labeled_words.index.tolist()

In [5]:
with open('vocab.pkl', 'wb') as pkl_file:
    pickle.dump(vocab, pkl_file)
with open('labeled_words.pkl', 'wb') as pkl_file:
    pickle.dump(labeled_words, pkl_file)

# Paragraphs from Gutenberg Project books

## Starting with a single book

In [6]:
import glob
BOOKS_PATH = 'gutenberg_data/'
all_books = glob.glob(BOOKS_PATH + '*')
with open(all_books[0], 'r') as book:
#     book_sample = pd.DataFrame(book.readlines(), columns=['text'])
    book_sample = pd.read_csv(book, delimiter='\n \n', header=None, names=['text'])
book_sample

  book_sample = pd.read_csv(book, delimiter='\n \n', header=None, names=['text'])


Unnamed: 0,text
0,The Project Gutenberg EBook of The Confessions...
1,This eBook is for the use of anyone anywhere a...
2,almost no restrictions whatsoever. You may co...
3,re-use it under the terms of the Project Guten...
4,with this eBook or online at www.gutenberg.org
...,...
2536,https://www.gutenberg.org
2537,This Web site includes information about Proje...
2538,including how to make donations to the Project...
2539,"Archive Foundation, how to help produce our ne..."


In [7]:
paragraphs = []
par = []
with open(all_books[0], 'r') as paragraphs_file:
    for sentence in paragraphs_file: #one line until the end of file
        if sentence != "\n \n" and sentence != "\n" and sentence != "\n\n" and sentence != '':
            par.append(sentence)
        else:
            paragraphs.append(" ".join(par))
            par = []
        

In [8]:
def get_paragraphs(filename):
    paragraphs = []
    par = []
    with open(filename, 'r') as paragraphs_file:
        for sentence in paragraphs_file: #one line until the end of file
            if sentence != "\n \n" and sentence != "\n" and sentence != "\n\n" and sentence != '':
                par.append(sentence)
            else:
                paragraphs.append(" ".join(par))
                par = []
    return paragraphs

In [9]:
pd.DataFrame(paragraphs, columns=['text'])

Unnamed: 0,text
0,The Project Gutenberg EBook of The Confessions...
1,This eBook is for the use of anyone anywhere a...
2,
3,Title: The Confessions of a Poacher\n
4,Author: Anonymous\n
...,...
380,
381,Project Gutenberg-tm eBooks are often created ...
382,
383,Most people start at our Web site which has th...


Finding out when the book actually starts

In [10]:
START_STRING = 'START OF THIS PROJECT GUTENBERG EBOOK'
END_STRING = 'END OF THIS PROJECT GUTENBERG EBOOK'

In [11]:
start_idx = book_sample[book_sample['text'].str.contains(START_STRING)].index[0] + 1
book = book_sample.iloc[start_idx:].reset_index(drop=True)
end_idx = book[book['text'].str.contains(END_STRING)].index[0]
book = book[:end_idx].reset_index(drop=True)
book = book.drop_duplicates(ignore_index=True)

Removing newlines (getting paragraphs)

In [12]:
book.drop_duplicates().reset_index(drop=True).iloc[50].to_markdown()

'|      | 50                                  |\n|:-----|:------------------------------------|\n| text | 7. SALMON AND TROUT POACHING     90 |'

Get book id:

## Pipeline

In [13]:
text = []
book_ids = []
paragraph_ids = []
error = []
for book_file in all_books:
    paragraphs = get_paragraphs(book_file)
    book_df = pd.DataFrame(paragraphs, columns=['text'])
    book_id = int(book_file.split('/')[-1].replace('.txt', ''))
    try:
        start_idx = book_df[book_df['text'].str.contains(START_STRING)].index[0] + 1
        book = book_df.iloc[start_idx:].reset_index(drop=True)
        end_idx = book[book['text'].str.contains(END_STRING)].index[0]
        book = book[:end_idx].reset_index(drop=True)
        book = book.drop_duplicates(ignore_index=True)
        for i in range(len(book)):
            book_ids.append(int(book_id))
            paragraph_ids.append(f'{book_id}_{i}')
        text.append(book.values.ravel().tolist())
    except IndexError:
        error.append(book_id)


In [14]:
import numpy as np

In [15]:
dict_ = {
    'book_id': np.array(book_ids),
    'paragraph_id': np.array(paragraph_ids),
    'text': [item for sublist in text for item in sublist]
}

In [16]:
df = pd.DataFrame(dict_)

In [17]:
df.head()

Unnamed: 0,book_id,paragraph_id,text
0,36970,36970_0,
1,36970,36970_1,"Produced by David Edwards, Linda Hamilton and ..."
2,36970,36970_2,"""Poaching is one of the fine arts--how 'fine' ..."
3,36970,36970_3,[Illustration: THE SQUIRE'S KEEPER.]\n
4,36970,36970_4,The\n Confessions\n of a\n Poacher\n


In [18]:
df.shape

(203334, 3)

In [19]:
df.to_csv('gutenberg_data.csv')

# Emotion from paragraphs

## Preprocess paragraphs

In [20]:
vocab

['aback',
 'abacus',
 'abandon',
 'abandoned',
 'abandonment',
 'abate',
 'abatement',
 'abba',
 'abbot',
 'abbreviate',
 'abbreviation',
 'abdomen',
 'abdominal',
 'abduction',
 'aberrant',
 'aberration',
 'abeyance',
 'abhor',
 'abhorrent',
 'abide',
 'ability',
 'abject',
 'ablation',
 'ablaze',
 'abnormal',
 'aboard',
 'abode',
 'abolish',
 'abolition',
 'abominable',
 'abomination',
 'aboriginal',
 'abort',
 'abortion',
 'abortive',
 'abound',
 'abovementioned',
 'abrasion',
 'abroad',
 'abrogate',
 'abrupt',
 'abruptly',
 'abscess',
 'absence',
 'absent',
 'absentee',
 'absenteeism',
 'absinthe',
 'absolute',
 'absolution',
 'absorbed',
 'absorbent',
 'absorbing',
 'absorption',
 'abstain',
 'abstention',
 'abstinence',
 'abstract',
 'abstraction',
 'absurd',
 'absurdity',
 'abundance',
 'abundant',
 'abuse',
 'abutment',
 'aby',
 'abysmal',
 'abyss',
 'academic',
 'academy',
 'accede',
 'accelerate',
 'acceleration',
 'accent',
 'accentuate',
 'accept',
 'acceptable',
 'acceptan

In [21]:
from sklearn.feature_extraction.text import CountVectorizer


cv = CountVectorizer()
cv.fit(vocab)
with open('count_vectorizer.pkl', 'wb') as pkl_file:
    pickle.dump(cv, pkl_file)

In [22]:
df['text'] = df['text'].str.replace('\n', '')

In [23]:
X = df['text'].str.replace('[^\w\s]', '', regex=True).str.lower()

In [24]:
X = X.map(nltk.word_tokenize)
X

0                                                        []
1         [produced, by, david, edwards, linda, hamilton...
2         [poaching, is, one, of, the, fine, artshow, fi...
3                      [illustration, the, squires, keeper]
4                        [the, confessions, of, a, poacher]
                                ...                        
203329    [57, practical, bait, casting, by, larry, st, ...
203330    [all, outdoors, a, monthly, magazine, of, the,...
203331    [yachting, an, illustrated, monthly, magazine,...
203332    [outing, for, more, than, thirty, years, the, ...
203333    [end, of, the, project, gutenberg, ebook, of, ...
Name: text, Length: 203334, dtype: object

# Returning sentiment

In [25]:
def replace_if_not_in_vocab(lst_token):
    result = []
    for t in lst_token:
        try:
            idx = vocab.index(t)
            result.append(t)
        except ValueError:
            result.append('unk')
    return result

In [26]:
idx = 12345
test = " ".join(replace_if_not_in_vocab(X.iloc[idx]))
X_cv = cv.transform([test]).toarray()
counter = labeled_words.iloc[:, :10].values.T @ X_cv[0]
pred = labeled_words.columns[np.argmax(counter)][1]
print(f"Phrase:\n{df['text'].iloc[idx]}")
print(f"Prediction: {pred}")

Phrase:
“Oh, yes: often. But never on that day.”
Prediction: anger


Necessary
- Save CV and labeled words
Advanced
- If word not in vocab, check similar word using nltk/spacy
