In [None]:
#  Autoreload changes
%load_ext autoreload
%autoreload 2

from fuzzywuzzy import process
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from utils.book_quotes_utils import extract_book_quotes_from_json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def get_best_match(title, choices):
    """Return the closest fuzzy match for ``title`` among ``choices``.

    :param title: String to look up.
    :param choices: Candidate strings to compare against.
    :return: ``(best_match, score)`` as reported by ``process.extractOne``,
        or ``(None, 0)`` when it finds no candidate (e.g. ``choices`` is empty).
    """
    result = process.extractOne(title, choices)
    if result is None:
        # extractOne returns None rather than a tuple when there is nothing to
        # match against; unpacking it directly would raise TypeError.
        return None, 0
    # Index instead of unpacking: dict-like choices yield (match, score, key).
    return result[0], result[1]

In [None]:
# Raw inputs live in "0-RawData" under the current working directory.
RAW_DIRECTORY = os.path.join(os.path.abspath(''),"0-RawData")

# Load the Goodreads export CSV.
df = pd.read_csv(RAW_DIRECTORY+'/goodreads_library_export.csv')

# Keep rows with a read count of at least one, then drop columns that are entirely NaN.
was_read = df['Read Count']>=1
read_df = df.loc[was_read].dropna(axis='columns',how='all')
print(read_df.columns)


Index(['Book Id', 'Title', 'Author', 'Author l-f', 'Additional Authors',
       'ISBN', 'ISBN13', 'My Rating', 'Average Rating', 'Publisher', 'Binding',
       'Number of Pages', 'Year Published', 'Original Publication Year',
       'Date Read', 'Date Added', 'Bookshelves', 'Bookshelves with positions',
       'Exclusive Shelf', 'My Review', 'Spoiler', 'Read Count',
       'Owned Copies'],
      dtype='object')


## Matching KeepNotes with Goodreads data

In [12]:
# Quote files are read from the "2-BookQuotes" folder inside RAW_DIRECTORY.
book_quotes_directory = RAW_DIRECTORY+'/2-BookQuotes'
book_data = extract_book_quotes_from_json(directory=book_quotes_directory)

In [None]:

# List every note whose best Goodreads title match scores 86 or below,
# printing its position in book_data next to the candidate title for review.
df_titles = read_df['Title'].to_list()
for ind, title in enumerate(entry['title'] for entry in book_data):
    best_match, score = get_best_match(title, df_titles)
    if score > 86:
        continue
    print(f"{ind} {score}- {title[:50]} \t -----  {best_match}")
        

#### Merge the quotes into the Goodreads data (run once the low-score titles listed above have been corrected)

In [14]:
df_titles = read_df['Title'].tolist()
rows = []

for title_dict in book_data:
    title = title_dict['title']
    quote = title_dict['quotes']  # value stored under the 'quotes' key

    best_match, score = get_best_match(title, df_titles)

    # Skip notes whose best title match scores 86 or below
    if score<=86:
        continue
    rows.append({'Title': best_match, 'Quotes': quote})

# Explicit columns keep 'Title' present even when no note matched, so the
# merge below still finds its key instead of raising KeyError.
quotes_df = pd.DataFrame(rows, columns=['Title', 'Quotes'])

# Drop a 'Quotes' column left by a previous run of this cell; merging again
# would otherwise produce 'Quotes_x'/'Quotes_y' and break read_df['Quotes'].
read_df = pd.merge(
    read_df.drop(columns=['Quotes'], errors='ignore'),
    quotes_df,
    on='Title',
    how='left',
)
# Each row carries its matched note's 'quotes' value unchanged (nothing is
# exploded here); books without a matching note get NaN in 'Quotes'.
# NOTE(review): a Title matched by several notes yields one row per note.


In [37]:
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# nltk.download('punkt')
# nltk.download('stopwords')


# Step 1: Tokenization and Removing Stopwords
# Pool the NLTK stopword lists of every listed language into a single set.
languages = ['english', 'spanish', 'portuguese', 'french']  # Add or remove languages as needed
stop_words = {word for lang in languages for word in stopwords.words(lang)}


def preprocess_text(text):
    """Lowercase and tokenize ``text``, keeping alphanumeric non-stopword tokens.

    :param text: String to tokenize.
    :return: List of the remaining tokens, in their original order.
    """
    return [
        token
        for token in word_tokenize(text.lower())
        # isalnum() discards punctuation tokens; stop_words is the module-level set.
        if token.isalnum() and token not in stop_words
    ]

# Prepare all quotes
# Flatten every non-empty list in the 'Quotes' column into one list;
# non-list cells (e.g. NaN for books without quotes) are ignored.
quotes = [
    quote
    for row in read_df['Quotes']
    if isinstance(row, list) and row
    for quote in row
]

# Tokenize each quote independently.
processed_quotes = list(map(preprocess_text, quotes))

# Step 2: Building the Dictionary and Corpus
dictionary = corpora.Dictionary(processed_quotes)
corpus = [dictionary.doc2bow(text) for text in processed_quotes]

# Step 3: Applying LDA (Latent Dirichlet Allocation) for Topic Modeling
# Fix the seed: LDA training is randomized, so without random_state the
# topics (and every output derived from them) change on each run.
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Print the four highest-weighted words of each topic.
topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)


(0, '0.008*"people" + 0.005*"world" + 0.004*"one" + 0.004*"us"')
(1, '0.005*"one" + 0.003*"must" + 0.003*"world" + 0.003*"time"')
(2, '0.009*"one" + 0.004*"brain" + 0.004*"world" + 0.003*"people"')


In [32]:
# Visualize the topics
pyLDAvis.enable_notebook()
# Build the visualization data from the trained model, its corpus and dictionary.
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary)
pyLDAvis.display(vis)


In [35]:
# Define a function to export the top terms for a given topic
def export_top_terms(model, topic_id, num_terms=30, filepath='topic_terms.txt'):
    """
    Exports the top terms for a given topic to a UTF-8 text file.

    Each term is written on its own line as ``term: weight``; an existing
    file at ``filepath`` is overwritten.

    :param model: Trained LDA model exposing ``show_topic``
    :param topic_id: ID of the topic for which terms should be exported
    :param num_terms: Number of top terms to export
    :param filepath: Path to the output text file
    """
    # Get the (term, weight) pairs for the given topic
    terms = model.show_topic(topic_id, topn=num_terms)

    # Explicit UTF-8: terms can contain accented characters, and the platform
    # default encoding may be unable to write them or may vary between machines.
    with open(filepath, 'w', encoding='utf-8') as file:
        file.writelines(f'{term}: {weight}\n' for term, weight in terms)

# Usage: write the 30 top terms of the first topic (topic_id is zero-indexed)
export_top_terms(lda_model, topic_id=0, num_terms=30, filepath='topic_terms.txt')
