In [1]:
import pandas as pd
import os
from collections import Counter, defaultdict
import json
import itertools

In [20]:
import numpy as np

In [2]:
# load config
with open('config.json', 'r') as f:
    config = json.load(f)
cwd = os.getcwd()
os.chdir(config['REPODIR'])
import Utils as U
from Corpus import Corpus
os.chdir(cwd)

  from .autonotebook import tqdm as notebook_tqdm


In [61]:
data = U.load_file('data_w_subj_new.csv', 'csv', config['DATADIR'])
data.groupby('subject').agg('count')


Unnamed: 0_level_0,Unnamed: 0,author_id,author_name,book_id,gutenbergbookid,title,text,text_lines,subjects
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Biography,268,268,268,268,268,268,268,268,268
Conduct of life,441,441,441,441,441,441,441,441,441
Description and travel,700,700,700,700,700,700,700,700,700
England,581,581,581,581,581,581,581,581,581
Fiction,4710,4710,4710,4710,4710,4710,4710,4710,4710
Great Britain,763,763,763,763,763,763,763,763,763
History,1041,1041,1041,1041,1041,1041,1041,1041,1041
Juvenile fiction,1663,1663,1663,1663,1663,1663,1663,1663,1663
Other,13057,13057,13057,13057,13057,13057,13057,13057,13057
Social life and customs,166,166,166,166,166,166,166,166,166


In [62]:
subjects = data['subject'].unique()

In [64]:
n_subjects = len(subjects)

In [65]:
import re

# Remove punctuation
data['text_processed'] = data['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Convert to lowercase
data['text_processed'] = data['text_processed'].apply(lambda x: x.lower())

data.head()

Unnamed: 0.1,Unnamed: 0,author_id,author_name,book_id,gutenbergbookid,title,text,text_lines,subjects,subject,text_processed
0,0,8824,"Dixon, Thomas F.",17427,18721,The Victim: A Romance of the Real Jefferson Davis,['us are very disappointing. No doubt you have...,5545,"['United States', 'History', 'Civil War, 1861-...",History,us are very disappointing no doubt you have fo...
1,1,8824,"Dixon, Thomas F.",17427,18721,The Victim: A Romance of the Real Jefferson Davis,['been running along parallels of latitude wil...,2345,"['United States', 'History', 'Civil War, 1861-...",History,been running along parallels of latitude will ...
2,2,8824,"Dixon, Thomas F.",17427,18721,The Victim: A Romance of the Real Jefferson Davis,['And then something happened. A great questio...,7295,"['United States', 'History', 'Civil War, 1861-...",History,and then something happened a great question a...
3,3,8824,"Dixon, Thomas F.",17427,18721,The Victim: A Romance of the Real Jefferson Davis,['and excite us to the display of the best pow...,6945,"['United States', 'History', 'Civil War, 1861-...",History,and excite us to the display of the best power...
4,4,8824,"Dixon, Thomas F.",17427,18721,The Victim: A Romance of the Real Jefferson Davis,['which I have spoken if we have not the ships...,4695,"['United States', 'History', 'Civil War, 1861-...",History,which i have spoken if we have not the ships h...


In [None]:
# from wordcloud import WordCloud

# # Join together
# long_string = ','.join(list(data['text_processed'].values))

# # Generate a wordCloud
# wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
# wordcloud.generate(long_string)

# # Visualize
# wordcloud.to_image()

In [66]:
# Get number of unique subjects
num_subjects = data['subjects'].nunique()
num_subjects = n_subjects

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score



# Split data into train/validation/test sets with a ratio of 70/15/15
train, test = train_test_split(data, test_size=0.3, random_state=42)
val, test = train_test_split(test, test_size=0.5, random_state=42)

# Preprocess text data - remove stop_words
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train['text_processed'])
X_val = vectorizer.transform(val['text_processed'])
X_test = vectorizer.transform(test['text_processed'])

tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, stop_words="english"
)

# Train LDA model
lda = LatentDirichletAllocation(n_components=n_subjects) 
                                #,max_iter=50, learning_method='online', random_state=42
lda.fit(X_train)

# Predict subject for documents in validation set
y_val = lda.transform(X_val)

# Predict subject for documents in test set
y_test = lda.transform(X_test)

# Convert subject column to numerical labels
labels_train = pd.factorize(train['subject'])
labels_val = pd.factorize(val['subject'])
labels_test = pd.factorize(test['subject'])


# This is probably a better way of getting the classifier
# accuracy_score(labels_val[0], np.apply_along_axis(np.argmax, 1, y_val))


In [69]:
y_train = lda.transform(X_train)

In [70]:
y_index_train = np.apply_along_axis(np.argmax, 1, y_train)
y_index_val = np.apply_along_axis(np.argmax, 1, y_val)

y_lda_topics_train = [subjects[idx] for idx in y_index_train]
y_lda_topics_val = [subjects[idx] for idx in y_index_val]

In [71]:
y_lda_topics_train
y_lda_topics_val

['History',
 'Great Britain',
 'Other',
 'Other',
 'History',
 'Other',
 'England',
 'History',
 'Juvenile fiction',
 'Other',
 'History',
 'Other',
 'History',
 'Other',
 'History',
 'Other',
 'History',
 'Fiction',
 'Other',
 'History',
 'England',
 'History',
 'Other',
 'History',
 'Other',
 'Great Britain',
 'Other',
 'History',
 'Other',
 'Juvenile fiction',
 'Other',
 'Other',
 'Other',
 'History',
 'Other',
 'Juvenile fiction',
 'Biography',
 'History',
 'History',
 'Description and travel',
 'Other',
 'History',
 'England',
 'History',
 'Other',
 'History',
 'History',
 'History',
 'History',
 'Other',
 'History',
 'History',
 'Fiction',
 'History',
 'History',
 'History',
 'History',
 'History',
 'History',
 'Great Britain',
 'History',
 'History',
 'Other',
 'United States',
 'History',
 'Juvenile fiction',
 'Juvenile fiction',
 'England',
 'History',
 'History',
 'History',
 'Other',
 'History',
 'United States',
 'History',
 'History',
 'England',
 'History',
 'Juvenile fic

In [73]:


np.mean(train['subject'] == y_lda_topics_train)

0.22084112713449586

In [75]:
labels_val

(array([0, 1, 2, ..., 0, 4, 4]),
 Index(['Other', 'Description and travel', 'Great Britain', 'Juvenile fiction',
        'Fiction', 'Biography', 'Conduct of life', 'Social life and customs',
        'England', 'History', 'United States'],
       dtype='object'))

In [77]:
pred_val

array([0, 0, 0, ..., 0, 0, 0])

In [76]:



# Train a classifier on the LDA topics
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(y_val, labels_val[0])

# Evaluate performance on the validation set
pred_val = clf.predict(y_val)
acc_val = accuracy_score(labels_val[0], pred_val)
print('Accuracy on validation set:', acc_val)

# Evaluate performance on the test set
pred_test = clf.predict(y_test)
acc_test = accuracy_score(labels_test, pred_test)
print('Accuracy on test set:', acc_test)

ValueError: Found input variables with inconsistent numbers of samples: [2, 3551]