In [85]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the dataset into a pandas DataFrame
data = pd.read_csv('data/data_w_subj.csv', index_col=0)
column_names = data.columns.tolist()
print(column_names)


['author_id', 'author_name', 'book_id', 'gutenbergbookid', 'title', 'text', 'text_lines', 'subjects']


In [86]:

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer to the text data
X = vectorizer.fit_transform(data['text'])

# Print the vocabulary and the matrix of word counts
print("Vocabulary: ", vectorizer.vocabulary_)
print("Matrix of word counts: ", X.toarray())


Matrix of word counts:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [87]:
vector = vectorizer.transform(data)
print(vector.shape)
print(type(vector))
print(vector.toarray())

(8, 87719)
<class 'scipy.sparse._csr.csr_matrix'>
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [88]:
vector.toarray()
df = pd.DataFrame(vector.todense())
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87709,87710,87711,87712,87713,87714,87715,87716,87717,87718
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
import nltk
import pandas as pd

# download the stopwords corpus if not already downloaded
nltk.download('stopwords')

# load the data from csv file
data = pd.read_csv('data/data_w_subj.csv')

# create an instance of the English stop words list
stop_words = nltk.corpus.stopwords.words('english')

# define a function to tokenize and remove stop words from a given text
def preprocess_text(text):
    # tokenize the text
    tokens = nltk.word_tokenize(text.lower())
    # remove the stop words
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

# apply the function to the 'text' column of the dataframe
data['text_tokenized'] = data['text'].apply(preprocess_text)
# save the preprocessed data to a new csv file
data.to_csv('data/preprocessed_data.csv', index=False)
data

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeffereyreng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0.1,Unnamed: 0,author_id,author_name,book_id,gutenbergbookid,title,text,text_lines,subjects,text_tokenized
0,0,5959,"Sharkey, John Michael",3670,33871,Old Friends Are the Best,"['', '', '', 'CHAPTER VIII.', '', 'JAMES TRIPL...",3693,"['Science fiction', ' Short stories']","[[, ``, ,, ``, ,, ``, ,, 'chapter, viii, ., ',..."
1,1,5959,"Sharkey, John Michael",3670,33871,Old Friends Are the Best,"['life;"" and her shoulders went up to her ears...",1643,"['Science fiction', ' Short stories']","[[, 'life, ;, '', shoulders, went, ears, --, f..."
2,2,5959,"Sharkey, John Michael",3670,33871,Old Friends Are the Best,"['various talents, and are come to make a dema...",4143,"['Science fiction', ' Short stories']","[[, 'various, talents, ,, come, make, demand, ..."
3,3,5959,"Sharkey, John Michael",3670,33871,Old Friends Are the Best,"['would be a lasting discredit,"" she continued...",3393,"['Science fiction', ' Short stories']","[[, 'would, lasting, discredit, ,, '', continu..."
4,4,5959,"Sharkey, John Michael",3670,33871,Old Friends Are the Best,['homely which he invented in our first chapte...,2693,"['Science fiction', ' Short stories']","[[, 'homely, invented, first, chapter, ,, proc..."
...,...,...,...,...,...,...,...,...,...,...
2439,2439,7150,"Blish, James",62429,59415,To Pay the Piper,"['Pneumococcus I in 2 instances, Pneumococcus ...",10479,"['Science fiction', ' Short stories', ' United...","[[, 'pneumococcus, 2, instances, ,, pneumococc..."
2440,2440,7150,"Blish, James",62429,59415,To Pay the Piper,[' pleurisy │ │ │ │ ...,12329,"['Science fiction', ' Short stories', ' United...","[[, ', pleurisy, │, │, │, │, │, │, │, │, ', ,,..."
2441,2441,7150,"Blish, James",62429,59415,To Pay the Piper,['influenzæ. Repeated throat cultures were not...,14679,"['Science fiction', ' Short stories', ' United...","[[, 'influenzæ, ., repeated, throat, cultures,..."
2442,2442,7150,"Blish, James",62429,59415,To Pay the Piper,"[' and Pneumonia',...",5129,"['Science fiction', ' Short stories', ' United...","[[, ', pneumonia, ', ,, ``, ,, 'the, methods, ..."


In [90]:
# load the preprocessed data from the csv file
data = pd.read_csv('data/preprocessed_data.csv')

# convert all words to lowercase
data['text_tokenized'] = data['text_tokenized'].apply(lambda tokens: [token.lower() for token in tokens])

# save the preprocessed data to a new csv file
data.to_csv('data/preprocessed_data.csv', index=False)
data

Unnamed: 0.1,Unnamed: 0,author_id,author_name,book_id,gutenbergbookid,title,text,text_lines,subjects,text_tokenized
0,0,5959,"Sharkey, John Michael",3670,33871,Old Friends Are the Best,"['', '', '', 'CHAPTER VIII.', '', 'JAMES TRIPL...",3693,"['Science fiction', ' Short stories']","[[, ', [, ', ,, , ', `, `, ', ,, , ', ,, ', ..."
1,1,5959,"Sharkey, John Michael",3670,33871,Old Friends Are the Best,"['life;"" and her shoulders went up to her ears...",1643,"['Science fiction', ' Short stories']","[[, ', [, ', ,, , "", ', l, i, f, e, "", ,, , ..."
2,2,5959,"Sharkey, John Michael",3670,33871,Old Friends Are the Best,"['various talents, and are come to make a dema...",4143,"['Science fiction', ' Short stories']","[[, ', [, ', ,, , "", ', v, a, r, i, o, u, s, ..."
3,3,5959,"Sharkey, John Michael",3670,33871,Old Friends Are the Best,"['would be a lasting discredit,"" she continued...",3393,"['Science fiction', ' Short stories']","[[, ', [, ', ,, , "", ', w, o, u, l, d, "", ,, ..."
4,4,5959,"Sharkey, John Michael",3670,33871,Old Friends Are the Best,['homely which he invented in our first chapte...,2693,"['Science fiction', ' Short stories']","[[, ', [, ', ,, , "", ', h, o, m, e, l, y, "", ..."
...,...,...,...,...,...,...,...,...,...,...
2439,2439,7150,"Blish, James",62429,59415,To Pay the Piper,"['Pneumococcus I in 2 instances, Pneumococcus ...",10479,"['Science fiction', ' Short stories', ' United...","[[, ', [, ', ,, , "", ', p, n, e, u, m, o, c, ..."
2440,2440,7150,"Blish, James",62429,59415,To Pay the Piper,[' pleurisy │ │ │ │ ...,12329,"['Science fiction', ' Short stories', ' United...","[[, ', [, ', ,, , "", ', "", ,, , ', p, l, e, ..."
2441,2441,7150,"Blish, James",62429,59415,To Pay the Piper,['influenzæ. Repeated throat cultures were not...,14679,"['Science fiction', ' Short stories', ' United...","[[, ', [, ', ,, , "", ', i, n, f, l, u, e, n, ..."
2442,2442,7150,"Blish, James",62429,59415,To Pay the Piper,"[' and Pneumonia',...",5129,"['Science fiction', ' Short stories', ' United...","[[, ', [, ', ,, , "", ', "", ,, , ', p, n, e, ..."


In [91]:
# import nltk
# import pandas as pd
# import string
#
# # download the stopwords corpus if not already downloaded
# nltk.download('stopwords')
#
# # load the data from csv file
# data = pd.read_csv('data/preprocessed_data.csv')
#
# # create an instance of the English stop words list
# stop_words = nltk.corpus.stopwords.words('english')
#
# # define a function to tokenize, remove stop words, remove punctuation, and convert to lowercase
# def preprocess_text(text):
#     # tokenize the text
#     tokens = nltk.word_tokenize(text)
#     # remove the stop words
#     tokens = [token for token in tokens if token not in stop_words]
#     # remove the punctuation
#     tokens = [token.translate(str.maketrans('', '', string.punctuation)) for token in tokens]
#     # convert to lowercase
#     tokens = [token.lower() for token in tokens]
#     return tokens
#
# # apply the function to the 'text' column of the dataframe
# data['text_tokenized'] = data['text'].apply(preprocess_text)
#
# # save the preprocessed data to a new csv file
# data.to_csv('data/preprocessed_data.csv', index=False)
# data

In [92]:
# import pandas as pd
# import torch
# from torch.utils.data import Dataset, DataLoader
# from torchtext.vocab import GloVe
#
# # load the preprocessed data from the csv file
# data = pd.read_csv('data/preprocessed_data.csv')
#
# # define a custom dataset to convert the tokenized text sequences to PyTorch tensors
# class TextDataset(Dataset):
#     def __init__(self, data, vocab, max_len):
#         self.data = data
#         self.vocab = vocab
#         self.max_len = max_len
#
#     def __len__(self):
#         return len(self.data)
#
#     def __getitem__(self, idx):
#         # get the tokenized text sequence for the current index
#         seq = self.data.iloc[idx]['text_tokenized']
#         # convert the sequence to a list of word indices
#         seq = [self.vocab.stoi.get(token, self.vocab.stoi['<unk>']) for token in seq]
#         # truncate or pad the sequence to the specified max length
#         if len(seq) > self.max_len:
#             seq = seq[:self.max_len]
#         else:
#             seq += [self.vocab.stoi['<pad>']] * (self.max_len - len(seq))
#         # convert the list of word indices to a PyTorch tensor
#         seq = torch.tensor(seq)
#         # return the padded tensor sequence and the corresponding subject
#         return seq, self.data.iloc[idx]['subject']
#
# # download the GloVe embeddings with 100 dimensions
# glove = GloVe(name='6B', dim=100)
# # create a vocabulary with the GloVe embeddings
# vocab = glove.get_vocab()
# # add special tokens for padding and unknown words
# vocab.add_specials({'<pad>': 0, '<unk>': len(vocab)})
# # set the maximum sequence length
# max_len = 50
#
# # create an instance of the custom dataset
# text_dataset = TextDataset(data, vocab, max_len)
#
# # define a dataloader to load the padded text sequences and subjects in batches
# batch_size = 32
# text_dataloader = DataLoader(text_dataset, batch_size=batch_size, shuffle=True)
#
# # iterate over the dataloader to get a batch of padded text sequences and subjects
# for batch_seq, batch_subj in text_dataloader:
#     print(batch_seq)
#     print(batch_seq.shape)
#     print(batch_subj)
#     print(batch_subj.shape)
#     break


In [93]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import GloVe, Vocab
from collections import Counter

# load the preprocessed data from the csv file
data = pd.read_csv('data/preprocessed_data.csv')

# define a custom dataset to convert the tokenized text sequences to PyTorch tensors
class TextDataset(Dataset):
    def __init__(self, data, vocab, max_len):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # get the tokenized text sequence for the current index
        seq = self.data.iloc[idx]['text_tokenized']
        # convert the sequence to a list of word indices
        seq = [self.vocab.stoi.get(token, self.vocab.stoi['<unk>']) for token in seq]
        # truncate or pad the sequence to the specified max length
        if len(seq) > self.max_len:
            seq = seq[:self.max_len]
        else:
            seq += [self.vocab.stoi['<pad>']] * (self.max_len - len(seq))
        # convert the list of word indices to a PyTorch tensor
        seq = torch.tensor(seq)
        # return the padded tensor sequence and the corresponding subject
        return seq, self.data.iloc[idx]['subject']

# download the GloVe embeddings with 100 dimensions
glove = GloVe(name='6B', dim=100)
# count the word frequencies from the GloVe embeddings
counter = Counter(glove.itos)
# create a vocabulary with the GloVe embeddings
vocab = Vocab(freqs=counter, specials=['<unk>', '<pad>'], max_size=100000, min_freq=5)
# set the vectors for the vocabulary using the GloVe embeddings
vocab.set_vectors(glove.stoi, glove.vectors, glove.dim)
# set the maximum sequence length
max_len = 50

# create an instance of the custom dataset
text_dataset = TextDataset(data, vocab, max_len)

# define a dataloader to load the padded text sequences and subjects in batches
batch_size = 32
text_dataloader = DataLoader(text_dataset, batch_size=batch_size, shuffle=True)

# iterate over the dataloader to get a batch of padded text sequences and subjects
for batch_seq, batch_subj in text_dataloader:
    print(batch_seq)
    print(batch_seq.shape)
    print(batch_subj)
    print(batch_subj.shape)
    break


TypeError: __init__() got an unexpected keyword argument 'freqs'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# load the preprocessed data from the csv file
data = pd.read_csv('data/preprocessed_data.csv')

# split the data into train and test sets with a 70:30 ratio
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# split the train data into train and validation sets with a 80:20 ratio
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# save the train, test, and validation data to separate csv files
train_data.to_csv('data/train_data.csv', index=False)
test_data.to_csv('data/test_data.csv', index=False)
val_data.to_csv('data/val_data.csv', index=False)


In [None]:
import pandas as pd
import gensim

# load the preprocessed data from the csv file
data = pd.read_csv('data/preprocessed_data.csv')

# train a word2vec model on the tokenized text sequences
sentences = data['text_tokenized'].tolist()
model = gensim.models.Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

# save the word2vec model to a file
model.save('word2vec.model')


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import GloVe, Vocab
from collections import Counter

# load the preprocessed data from the csv file
data = pd.read_csv('data/preprocessed_data.csv')

# define a custom dataset to convert the tokenized text sequences to PyTorch tensors
class TextDataset(Dataset):
    def __init__(self, data, vocab, max_len):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # get the tokenized text sequence for the current index
        seq = self.data.iloc[idx]['text_tokenized']
        # convert the sequence to a list of word indices
        seq = [self.vocab.stoi.get(token, self.vocab.stoi['<unk>']) for token in seq]
        # truncate or pad the sequence to the specified max length
        if len(seq) > self.max_len:
            seq = seq[:self.max_len]
        else:
            seq += [self.vocab.stoi['<pad>']] * (self.max_len - len(seq))
        # convert the list of word indices to a PyTorch tensor
        seq = torch.tensor(seq)
        # return the padded tensor sequence and the corresponding subject
        return seq, self.data.iloc[idx]['subject']

# download the GloVe embeddings with 100 dimensions
glove = GloVe(name='6B', dim=100)
# count the word frequencies from the GloVe embeddings
counter = Counter(glove.itos)
# create a vocabulary with the word frequencies from the dataset
vocab = Vocab(counter=counter, specials=['<unk>', '<pad>'], max_size=100000, min_freq=5)
# add the GloVe vocabulary to the dataset vocabulary
vocab.extend(glove.itos)
# set the vectors for the vocabulary using the GloVe embeddings
vocab.set_vectors(glove.stoi, glove.vectors, glove.dim)
# set the maximum sequence length
max_len = 50

# create an instance of the custom dataset
text_dataset = TextDataset(data, vocab, max_len)

# define a dataloader to load the padded text sequences and subjects in batches
batch_size = 32
text_dataloader = DataLoader(text_dataset, batch_size=batch_size, shuffle=True)

# iterate over the dataloader to get a batch of padded text sequences and subjects
for batch_seq, batch_subj in text_dataloader:
    print(batch_seq)
    print(batch_seq.shape)
    print(batch_subj)
    print(batch_subj.shape)
    break


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data into a Pandas DataFrame
data = pd.read_csv('data/data_w_subj.csv',index_col=0)

# Split the data into train, validation, and test sets
train, test_valid = train_test_split(data, test_size=0.4, random_state=42)
valid, test = train_test_split(test_valid, test_size=0.5, random_state=42)

# Save the splits into separate CSV files
train.to_csv('data/train.csv', index=False)
valid.to_csv('data/valid.csv', index=False)
test.to_csv('data/test.csv', index=False)


In [33]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Load the data into a Pandas DataFrame
data = pd.read_csv('data/train.csv')

# Tokenize the text
data['tokens'] = data['text'].apply(lambda x: word_tokenize(x.lower()))

# Save the tokenized data into a new CSV file
data.to_csv('data/train_tokenized.csv', index=False)

# Load the test data into a Pandas DataFrame
test_data = pd.read_csv('data/test.csv')

# Tokenize the test data
test_data['tokens'] = test_data['text'].apply(lambda x: word_tokenize(x.lower()))

# Save the tokenized test data into a new CSV file
test_data.to_csv('data/test_tokenized.csv', index=False)

# Load the validation data into a Pandas DataFrame
val_data = pd.read_csv('data/valid.csv')

# Tokenize the validation data
val_data['tokens'] = val_data['text'].apply(lambda x: word_tokenize(x.lower()))

# Save the tokenized validation data into a new CSV file
val_data.to_csv('data/valid_tokenized.csv', index=False)
test_data

Unnamed: 0,author_id,author_name,book_id,gutenbergbookid,title,text,text_lines,subjects,tokens
0,13126,"Moore, Clement Clarke",10476,68631,A visit from St. Nicholas,"['of his disappearance.', '', '""But this is no...",7513,"['Santa Claus -- Juvenile poetry', ' Christmas...","[[, 'of, his, disappearance, ., ', ,, ``, ,, '..."
1,16267,"Budge, E. A. Wallis (Ernest Alfred Wallis), Sir",57905,11277,Egyptian Ideas of the Future Life,['company I met a muleteer going into the town...,2079,"['Egypt -- Religion', ' Future life', ' Eschat...","[[, 'company, i, met, a, muleteer, going, into..."
2,8505,"Coleridge, Christabel Rose",42476,43150,The Constant Prince,"['handsome and cultured lady, sought in marria...",4607,"[""Children's stories"", ' Princes -- Juvenile f...","[[, 'handsome, and, cultured, lady, ,, sought,..."
3,19339,"Hood, Thomas",38605,59269,The Works of Thomas Hood; Vol. 01 (of 11)\r\nC...,"['hogy még egyszer eljön, de ha akkor sem ébre...",2471,"['English wit and humor', ' Humorous poetry, E...","[[, 'hogy, még, egyszer, eljön, ,, de, ha, akk..."
4,13126,"Moore, Clement Clarke",52922,17382,A Visit From Saint Nicholas,"['Kun Laila vasta iltapäivällä palasi kotia, e...",3337,"['Santa Claus -- Juvenile poetry', ' Christmas...","[[, 'kun, laila, vasta, iltapäivällä, palasi, ..."
...,...,...,...,...,...,...,...,...,...
484,11387,"Sanger, Margaret Higgins",58798,61303,Magnetation Methods of Birth Control,"['', 'Dolan leaned back against the machine an...",435,"['Birth control', ' Contraception']","[[, ``, ,, 'dolan, leaned, back, against, the,..."
485,12996,"Newell, P. S. (Peter S.)",65505,67144,The Hole Book,"[' century.', ' Magnus Tieffenbrucker, Ven...",4121,"['Stories in rhyme', ' Humorous poetry', ' Wit...","[[, ', century, ., ', ,, ', magnus, tieffenbru..."
486,12996,"Newell, P. S. (Peter S.)",10347,26271,The Slant Book,"[""or else the snakes won't eat them.) It was e...",1705,"['Series([], )']","[[, ``, or, else, the, snakes, wo, n't, eat, t..."
487,4051,"Chateaubriand, François Auguste René, vicomte de",39033,54788,The Memoirs of François René Vicomte de Chatea...,['In making calculations it is generally more ...,727,"['Chateaubriand, François-René, vicomte de, 17...","[[, 'in, making, calculations, it, is, general..."


In [15]:
# import nltk
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeffereyreng/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [34]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Load the tokenized data into a Pandas DataFrame
train_data = pd.read_csv('data/train_tokenized.csv')

# Define the stop words to remove
stop_words = set(stopwords.words('english'))

# Remove the stop words from the tokenized text
train_data['tokens'] = train_data['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

# Save the cleaned data into a new CSV file
train_data.to_csv('data/train_cleaned.csv', index=False)
train_data

Unnamed: 0,author_id,author_name,book_id,gutenbergbookid,title,text,text_lines,subjects,tokens
0,3062,"Pollock, Frank L. (Frank Lillie)",13292,67678,The Glacier Gate: An Adventure Story,"['The dead men were laid out in a row, on thei...",4895,"['Physicians -- Fiction', ' Adventure stories']","[[, ', [, ', ,, , "", ', h, e, "", ,, , ', e, ..."
1,3554,"Parrott, J. Edward",2000,35314,"The Childrens' Story of the War, Volume 2 (of ...","['acabaron los honrados discursos, y adonde se...",13335,"['World War, 1914-1918 -- Juvenile literature']","[[, ', [, ', ,, , "", ', c, b, r, n, "", ,, , ..."
2,10516,"Whitney, Adeline Dutton Train",19626,45301,Mother Goose for Grown Folks,"['# The Butcher and his Customers - 00:01:27',...",26,['Nursery rhymes -- Poetry'],"[[, ', [, ', ,, , "", ', "", ,, , ', #, ', ,, ..."
3,11463,"De Retz, Jean Francois Paul de Gondi",41438,7564,Quotes and Images From Memoirs of Cardinal De ...,"['""My mistress gave them to me,"" answered Osra...",5451,['Quotations'],"[[, ', [, ', ,, , "", ', "", ,, , "", ', ', "", ..."
4,8505,"Coleridge, Christabel Rose",64024,43121,Amethyst: The Story of a Beauty,[' At any time were born: next in what way'...,2297,['England -- Social life and customs -- 19th c...,"[[, ', [, ', ,, , "", ', "", ,, , ', ', ,, , ..."
...,...,...,...,...,...,...,...,...,...
1461,1345,"Poe, Edgar Allan",17254,6557,The Fall of the House of Usher,"['', 'Perambulating down the street', ""Was Mis...",210,"['Series([], )']","[[, ', [, ', ,, , ', `, `, ', ,, , ', ,, ', ..."
1462,11387,"Sanger, Margaret Higgins",40223,52888,What Every Girl Should Know,"['attaining his majority, the Lord of Arden, w...",14305,['Women social reformers -- United States -- B...,"[[, ', [, ', ,, , "", ', n, n, g, "", ,, , ', ..."
1463,4395,"Robins, Elizabeth",2532,10038,The Magnetic North,"['her alone, and not be over-anxious, for ever...",86,['Yukon -- Fiction'],"[[, ', [, ', ,, , "", ', h, e, r, "", ,, , ', ..."
1464,20265,"Bichat, X. (Xavier)",18896,56147,"General Anatomy, Applied to Physiology and Med...","['', '', '', '', 'CHAPTER XI.', '', 'CROSS COR...",3319,"['Human physiology', ' Human anatomy']","[[, ', [, ', ,, , ', `, `, ', ,, , ', ,, ', ..."


In [35]:
# Load the tokenized test data into a Pandas DataFrame
test_data = pd.read_csv('data/test_tokenized.csv')

# Define the stop words to remove
stop_words = set(stopwords.words('english'))

# Remove the stop words from the tokenized text
test_data['tokens'] = test_data['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

# Save the cleaned data into a new CSV file
test_data.to_csv('data/test_cleaned.csv', index=False)
test_data

Unnamed: 0,author_id,author_name,book_id,gutenbergbookid,title,text,text_lines,subjects,tokens
0,13126,"Moore, Clement Clarke",10476,68631,A visit from St. Nicholas,"['of his disappearance.', '', '""But this is no...",7513,"['Santa Claus -- Juvenile poetry', ' Christmas...","[[, ', [, ', ,, , "", ', f, "", ,, , ', h, ', ..."
1,16267,"Budge, E. A. Wallis (Ernest Alfred Wallis), Sir",57905,11277,Egyptian Ideas of the Future Life,['company I met a muleteer going into the town...,2079,"['Egypt -- Religion', ' Future life', ' Eschat...","[[, ', [, ', ,, , "", ', c, p, n, "", ,, , ', ..."
2,8505,"Coleridge, Christabel Rose",42476,43150,The Constant Prince,"['handsome and cultured lady, sought in marria...",4607,"[""Children's stories"", ' Princes -- Juvenile f...","[[, ', [, ', ,, , "", ', h, n, e, "", ,, , ', ..."
3,19339,"Hood, Thomas",38605,59269,The Works of Thomas Hood; Vol. 01 (of 11)\r\nC...,"['hogy még egyszer eljön, de ha akkor sem ébre...",2471,"['English wit and humor', ' Humorous poetry, E...","[[, ', [, ', ,, , "", ', h, g, "", ,, , ', é, ..."
4,13126,"Moore, Clement Clarke",52922,17382,A Visit From Saint Nicholas,"['Kun Laila vasta iltapäivällä palasi kotia, e...",3337,"['Santa Claus -- Juvenile poetry', ' Christmas...","[[, ', [, ', ,, , "", ', k, u, n, "", ,, , ', ..."
...,...,...,...,...,...,...,...,...,...
484,11387,"Sanger, Margaret Higgins",58798,61303,Magnetation Methods of Birth Control,"['', 'Dolan leaned back against the machine an...",435,"['Birth control', ' Contraception']","[[, ', [, ', ,, , ', `, `, ', ,, , ', ,, ', ..."
485,12996,"Newell, P. S. (Peter S.)",65505,67144,The Hole Book,"[' century.', ' Magnus Tieffenbrucker, Ven...",4121,"['Stories in rhyme', ' Humorous poetry', ' Wit...","[[, ', [, ', ,, , "", ', "", ,, , ', c, e, n, ..."
486,12996,"Newell, P. S. (Peter S.)",10347,26271,The Slant Book,"[""or else the snakes won't eat them.) It was e...",1705,"['Series([], )']","[[, ', [, ', ,, , ', `, `, ', ,, , ', r, ', ..."
487,4051,"Chateaubriand, François Auguste René, vicomte de",39033,54788,The Memoirs of François René Vicomte de Chatea...,['In making calculations it is generally more ...,727,"['Chateaubriand, François-René, vicomte de, 17...","[[, ', [, ', ,, , "", ', n, "", ,, , ', k, n, ..."


In [36]:
# Load the tokenized validation data into a Pandas DataFrame
val_data = pd.read_csv('data/valid_tokenized.csv')

# Define the stop words to remove
stop_words = set(stopwords.words('english'))

# Remove the stop words from the tokenized text
val_data['tokens'] = val_data['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

# Save the cleaned data into a new CSV file
val_data.to_csv('data/valid_cleaned.csv', index=False)
val_data

Unnamed: 0,author_id,author_name,book_id,gutenbergbookid,title,text,text_lines,subjects,tokens
0,12243,"Connor, Ralph",61269,3466,The Foreigner: A Tale of Saskatchewan,"[' Paul, past the islet of Comino, with its s...",1293,"['Canada -- Fiction', ' Saskatchewan -- Fiction']","[[, ', [, ', ,, , "", ', "", ,, , ', p, u, l, ..."
1,3062,"Pollock, Frank L. (Frank Lillie)",45104,67627,The Treasure Trail,"['It took me a little time to dress, but I got...",677,"['Adventure stories', ' Canadian fiction']","[[, ', [, ', ,, , "", ', "", ,, , ', k, ', ,, ..."
2,8528,"Dahn, Felix",64539,32377,"A Struggle for Rome, v. 3","['Azután kialszik a tüz. Még ráértem, hogy bát...",2216,"['Rome -- History -- Empire, 284-476 -- Fictio...","[[, ', [, ', ,, , "", ', z, u, á, n, "", ,, , ..."
3,13126,"Moore, Clement Clarke",52922,17382,A Visit From Saint Nicholas,"['""Olenpahan tässä lueskellut ja kirjoittanutk...",2237,"['Santa Claus -- Juvenile poetry', ' Christmas...","[[, ', [, ', ,, , "", ', "", ,, , "", ', ', "", ..."
4,7150,"Blish, James",4665,22958,One-Shot,"['', 'ZEPHERINA:', 'Take off your big boots.',...",581,"['Science fiction', ' Short stories']","[[, ', [, ', ,, , ', `, `, ', ,, , ', ,, ', ..."
...,...,...,...,...,...,...,...,...,...
484,4574,"Whishaw, Frederick J.",22818,56522,Mazeppa,"[' The feelings of =F=enelon, =F=aber, and =F...",89,"['Cossacks -- Fiction', ' Mazepa, Ivan Stepano...","[[, ', [, ', ,, , "", ', "", ,, , ', h, e, ', ..."
485,2433,"Aubigné, J. H. Merle d' (Jean Henri Merle)",68401,60152,History of the Reformation in Europe in the Ti...,"['den bekannten Vorwurf: die Mutter ruft, um e...",2455,['Reformation'],"[[, ', [, ', ,, , "", ', e, n, "", ,, , ', b, ..."
486,4395,"Robins, Elizabeth",10713,26420,The Convert,['the information which Mr. Lloyd George had r...,1039,"['Feminist fiction', ' Women -- Fiction', ' Su...","[[, ', [, ', ,, , "", ', h, e, "", ,, , ', n, ..."
487,4395,"Robins, Elizabeth",59033,61932,Come and Find Me,"['', '', 'ST NEOT AND THE FISHES.', '', 'On on...",2417,"['Alaska -- Fiction', ' Gold mines and mining ...","[[, ', [, ', ,, , ', `, `, ', ,, , ', ,, ', ..."


In [37]:
import pandas as pd

# Load the cleaned data into a Pandas DataFrame
train_data = pd.read_csv('data/train_cleaned.csv')
# Load the cleaned test data into a Pandas DataFrame
test_data = pd.read_csv('data/test_cleaned.csv')
# Load the cleaned validation data into a Pandas DataFrame
val_data = pd.read_csv('data/valid_cleaned.csv')

# Convert all words to lowercase
train_data['tokens'] = train_data['tokens'].apply(lambda x: [word.lower() for word in x])
# Convert all words to lowercase
test_data['tokens'] = test_data['tokens'].apply(lambda x: [word.lower() for word in x])
# Convert all words to lowercase
val_data['tokens'] = val_data['tokens'].apply(lambda x: [word.lower() for word in x])

# Save the preprocessed data into a new CSV file
train_data.to_csv('data/train_preprocessed.csv', index=False)
# Save the preprocessed test data into a new CSV file
test_data.to_csv('data/test_preprocessed.csv', index=False)
# Save the preprocessed validation data into a new CSV file
val_data.to_csv('data/valid_preprocessed.csv', index=False)
train_data

Unnamed: 0,author_id,author_name,book_id,gutenbergbookid,title,text,text_lines,subjects,tokens
0,3062,"Pollock, Frank L. (Frank Lillie)",13292,67678,The Glacier Gate: An Adventure Story,"['The dead men were laid out in a row, on thei...",4895,"['Physicians -- Fiction', ' Adventure stories']","[[, ', [, ', ,, , "", ', "", ,, , ', [, ', ,, ..."
1,3554,"Parrott, J. Edward",2000,35314,"The Childrens' Story of the War, Volume 2 (of ...","['acabaron los honrados discursos, y adonde se...",13335,"['World War, 1914-1918 -- Juvenile literature']","[[, ', [, ', ,, , "", ', "", ,, , ', [, ', ,, ..."
2,10516,"Whitney, Adeline Dutton Train",19626,45301,Mother Goose for Grown Folks,"['# The Butcher and his Customers - 00:01:27',...",26,['Nursery rhymes -- Poetry'],"[[, ', [, ', ,, , "", ', "", ,, , ', [, ', ,, ..."
3,11463,"De Retz, Jean Francois Paul de Gondi",41438,7564,Quotes and Images From Memoirs of Cardinal De ...,"['""My mistress gave them to me,"" answered Osra...",5451,['Quotations'],"[[, ', [, ', ,, , "", ', "", ,, , ', [, ', ,, ..."
4,8505,"Coleridge, Christabel Rose",64024,43121,Amethyst: The Story of a Beauty,[' At any time were born: next in what way'...,2297,['England -- Social life and customs -- 19th c...,"[[, ', [, ', ,, , "", ', "", ,, , ', [, ', ,, ..."
...,...,...,...,...,...,...,...,...,...
1461,1345,"Poe, Edgar Allan",17254,6557,The Fall of the House of Usher,"['', 'Perambulating down the street', ""Was Mis...",210,"['Series([], )']","[[, ', [, ', ,, , "", ', "", ,, , ', [, ', ,, ..."
1462,11387,"Sanger, Margaret Higgins",40223,52888,What Every Girl Should Know,"['attaining his majority, the Lord of Arden, w...",14305,['Women social reformers -- United States -- B...,"[[, ', [, ', ,, , "", ', "", ,, , ', [, ', ,, ..."
1463,4395,"Robins, Elizabeth",2532,10038,The Magnetic North,"['her alone, and not be over-anxious, for ever...",86,['Yukon -- Fiction'],"[[, ', [, ', ,, , "", ', "", ,, , ', [, ', ,, ..."
1464,20265,"Bichat, X. (Xavier)",18896,56147,"General Anatomy, Applied to Physiology and Med...","['', '', '', '', 'CHAPTER XI.', '', 'CROSS COR...",3319,"['Human physiology', ' Human anatomy']","[[, ', [, ', ,, , "", ', "", ,, , ', [, ', ,, ..."


In [38]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# Load the preprocessed data into a Pandas DataFrame
train_data = pd.read_csv('data/train_preprocessed.csv')

# Convert the tokens to sequences of word indices
word_index = {'<PAD>': 0}  # Initialize the word index with a padding token
sequences = []
for tokens in train_data['tokens']:
    sequence = []
    for token in tokens:
        if token not in word_index:
            word_index[token] = len(word_index)
        sequence.append(word_index[token])
    sequences.append(sequence)

# Pad the sequences with zeros
maxlen = max(len(sequence) for sequence in sequences)
padded_sequences = torch.zeros((len(sequences), maxlen), dtype=torch.long)
for i, sequence in enumerate(sequences):
    padded_sequences[i, :len(sequence)] = torch.tensor(sequence)

# Add the padded sequences to the DataFrame
train_data['padded_sequences'] = padded_sequences.tolist()

# Define a PyTorch dataset to hold the padded sequences and labels
class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.sequences = torch.tensor(data['padded_sequences'].tolist())
        self.labels = torch.tensor(data['label'].tolist())

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.sequences[index], self.labels[index]

# Create a PyTorch DataLoader for the training dataset
train_dataset = TextDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Save the padded sequences into a new CSV file
train_data.to_csv('data/train_padded.csv', index=False)


KeyError: 'label'

In [78]:
import pandas as pd
import nltk
from gensim.models import Word2Vec

# Step 1: Tokenize the text
df = pd.read_csv('data/data_w_subj.csv')
nltk.download('punkt')
df['tokens'] = df['text'].apply(nltk.word_tokenize)

# Step 2: Remove stop words
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Step 3: Convert words to lowercase
df['tokens'] = df['tokens'].apply(lambda x: [word.lower() for word in x])

# Step 4: Pad sequences
# Gensim's Word2Vec model does not require fixed-length sequences, so this step is not necessary

# Step 5: Build the word embedding model
model = Word2Vec(sentences=df['tokens'], vector_size=100, min_count=1, window=5, workers=4)

# Step 6: Train the model

# Step 7: Evaluate the model



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeffereyreng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeffereyreng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
