### Imports

In [None]:
import pandas as pd 
import nltk
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from IPython.display import Image
from ipywidgets import interact, interactive


In [4]:
# On first use of NLTK, uncomment and run the line below to download the required data packages:
#nltk.download()

## Data Preprocessing

In [5]:
# Load the book-review dataset from a JSON file located relative to the working directory.
df = pd.read_json('book_reviews_3_years.json')

In [None]:
# Keep the incomplete rows in a named variable so they can be inspected
# (a bare expression in the middle of a cell is neither displayed nor stored).
rows_with_nan = df[df.isna().any(axis=1)]

# Drop every row that contains at least one NaN. This runs before the integer
# conversions below because .astype(int) raises on missing values.
df = df.dropna()

# Remove the ' votes' suffix and thousands separators, then convert to integer
df['Votes'] = (
    df['Votes']
    .str.replace(' votes', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Remove the ' ratings' suffix and thousands separators, then convert to integer
df['Ratings'] = (
    df['Ratings']
    .str.replace(' ratings', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Remove duplicate reviews while preserving their original order
df['Reviews'] = df['Reviews'].apply(lambda reviews: list(dict.fromkeys(reviews)))

# Identify rows whose review list is empty
has_no_reviews = df['Reviews'].apply(len) == 0
rows_to_drop = df[has_no_reviews].index

# Drop these rows
df_cleaned = df.drop(rows_to_drop)

# Save the cleaned DataFrame as JSON Lines. The path is relative so that the
# cell which reloads "cleaned_reviews.json" finds the file on any machine.
json_file_path = 'cleaned_reviews.json'
df_cleaned.to_json(json_file_path, orient='records', lines=True)

In [None]:
# Load the cleaned reviews; lines=True parses the file as one JSON record per line.
data_reviews = pd.read_json("cleaned_reviews.json", lines=True)
# Preview the first rows as the cell output.
data_reviews.head()

Unnamed: 0,Year,Genre,Votes,Title,Author,Overall Rating,Ratings,Genres,Reviews
0,2023,Fiction,200722,Yellowface,R.F. Kuang,3.78,508635,"[Fiction, Contemporary, Audiobook, Literary Fi...","[Fell flat for me, as it felt more like a funh..."
1,2023,Fiction,60171,Hello Beautiful,Ann Napolitano,4.17,335408,"[Fiction, Historical Fiction, Audiobook, Roman...","[After Dear Edward, Ann Napolitano writes a be..."
2,2023,Fiction,57702,The Wishing Game,Meg Shaffer,4.09,123567,"[Fiction, Fantasy, Romance, Contemporary, Magi...",[My Reviews Can Also Be Found On:\nTwitter - A...
3,2023,Fiction,53470,Tom Lake,Ann Patchett,4.01,291199,"[Fiction, Audiobook, Literary Fiction, Romance...","[What a tedious read, felt like the uninterest..."
4,2023,Fiction,45859,The Five-Star Weekend,Elin Hilderbrand,4.06,186108,"[Fiction, Romance, Audiobook, Chick Lit, Conte...",[I know its January ❄️ and we're in the middle...


In [None]:
# Translation table mapping every punctuation character we strip to a space.
# Built once instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(
    dict.fromkeys(",`'.-:/–=;" + '“’”(){[]}!,?…|', " ")
)

# Lazily filled cache of the English stopword set (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loaded on first use (not at definition time) so that defining the
        # function does not require the corpus to be downloaded already.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def prepare_text(text):
    """Normalize one review so it can be scored later.

    Steps, in order:
      1. replace the punctuation characters listed in the translation table
         with spaces,
      2. lowercase and tokenize the text,
      3. drop English stopwords,
      4. lemmatize the remaining tokens,
      5. join the tokens back into a single space-separated string.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        The processed review.

    NOTE(review): the NLTK English stopword list appears to include negations
    such as "not" and "no"; removing them before sentiment scoring may change
    the polarity of a review -- confirm this is intended.
    """
    without_punctuation = text.translate(_PUNCTUATION_TO_SPACE)
    tokens = word_tokenize(without_punctuation.lower())

    # Stopwords add very little to the sentiment of a text. Membership is
    # tested against a cached set; previously the full stopword list was
    # re-read and scanned linearly for every single token.
    stop_words = _english_stopwords()
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatizing groups inflected forms of a word under a common base form.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Put the text together into a string again to analyze it later.
    return " ".join(lemmatized_tokens)

In [None]:
# Each entry of the "Reviews" column is an array of review strings, so
# prepare_text has to be applied to every element of every array:
# the outer .apply visits each row's array, and the list comprehension runs
# prepare_text on each individual review. The resulting arrays of processed
# strings are stored in the "reviews_processed" column.

data_reviews["reviews_processed"]= data_reviews["Reviews"].apply(lambda arr: [prepare_text(text) for text in arr])

In [None]:
def add_sentiment(dataframe, column, positive_threshold=0.05, negative_threshold=-0.05):
    """Add sentiment columns to ``dataframe`` in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to annotate; it is modified in place.
    column : str
        Name of a column whose cells are iterables of text strings.
    positive_threshold : float, default 0.05
        Rows whose mean compound value is strictly above this are "positive".
    negative_threshold : float, default -0.05
        Rows whose mean compound value is strictly below this are "negative".

    Returns
    -------
    None
        Three columns are written to ``dataframe``:
        "compound values" (list of the analyzer's compound score per text),
        "mean sentiment value" (mean of that list, NaN when the list is empty)
        and "sentiment" ("positive", "negative" or "neutral").
    """
    analyzer = SentimentIntensityAnalyzer()

    # One list of compound scores per row. The analyzer reports negative,
    # neutral and positive scores plus a combined "compound" value; only the
    # compound value is kept.
    compound_values = [
        [analyzer.polarity_scores(text)["compound"] for text in texts]
        for texts in dataframe[column]
    ]
    dataframe["compound values"] = compound_values

    # Average directly over the computed lists. Looking rows up again with
    # dataframe[...][0..n-1] is label-based and fails (or hits the wrong row)
    # whenever the index is not the default 0..n-1 range.
    dataframe["mean sentiment value"] = [
        float(np.mean(scores)) if len(scores) else float("nan")
        for scores in compound_values
    ]

    # Everything starts as neutral; rows beyond a threshold are relabelled.
    # NaN means compare False on both sides and therefore stay "neutral".
    dataframe["sentiment"] = "neutral"
    mean_sentiment = dataframe["mean sentiment value"]
    dataframe.loc[mean_sentiment > positive_threshold, "sentiment"] = "positive"
    dataframe.loc[mean_sentiment < negative_threshold, "sentiment"] = "negative"

In [None]:
# Score the processed reviews; add_sentiment writes its result columns into data_reviews in place.
add_sentiment(data_reviews, "reviews_processed")
# Persist the annotated frame as JSON (to_json is called without lines=True here).
data_reviews.to_json("sentiment_analysis_final_1.json")

In [None]:
# Load the dataset from "cleaned_text.json" and show it as the cell output.
# NOTE(review): no cell visible in this part of the notebook writes
# "cleaned_text.json" -- confirm where this file is produced.
data= pd.read_json("cleaned_text.json")
data

Unnamed: 0,Year,Genre,Votes,Title,Author,Overall Rating,Ratings,Genres,Reviews,reviews_processed,compound values,mean sentiment value,sentiment,cleaned_text
0,2023,Fiction,200722,Yellowface,R.F. Kuang,3.78,508635,"[Fiction, Contemporary, Audiobook, Literary Fi...","[Fell flat for me, as it felt more like a funh...",[fell flat felt like funhouse mirror depiction...,"[0.7322000000000001, 0.3694, -0.8496, -0.2263,...",0.374251,positive,[fell flat felt like funhouse mirror depiction...
1,2023,Fiction,60171,Hello Beautiful,Ann Napolitano,4.17,335408,"[Fiction, Historical Fiction, Audiobook, Roman...","[After Dear Edward, Ann Napolitano writes a be...",[dear edward ann napolitano writes beautifully...,"[0.9979, -0.8481000000000001, 0.9757, -0.2228,...",0.659471,positive,[dear edward ann napolitano writes beautifully...
2,2023,Fiction,57702,The Wishing Game,Meg Shaffer,4.09,123567,"[Fiction, Fantasy, Romance, Contemporary, Magi...",[My Reviews Can Also Be Found On:\nTwitter - A...,[review also found twitter amazon storygraph b...,"[0.9935, 0.9746, -0.40190000000000003, 0.9899,...",0.751900,positive,[reviews also found : twitter amazon storygrap...
3,2023,Fiction,53470,Tom Lake,Ann Patchett,4.01,291199,"[Fiction, Audiobook, Literary Fiction, Romance...","[What a tedious read, felt like the uninterest...",[tedious read felt like uninteresting rambling...,"[0.5859, 0.9438000000000001, -0.0258, 0.9969, ...",0.775126,positive,[tedious read felt like uninteresting rambling...
4,2023,Fiction,45859,The Five-Star Weekend,Elin Hilderbrand,4.06,186108,"[Fiction, Romance, Audiobook, Chick Lit, Conte...",[I know its January ❄️ and we're in the middle...,[know january ❄️ middle massive snowstorm fun ...,"[0.975, 0.0, 0.9386, 0.9969, -0.31820000000000...",0.701427,positive,[know january ❄️ middle massive snowstorm fun ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941,2021,Middle Grade & Children's,1900,Unplugged,Gordon Korman,3.95,5567,"[Middle Grade, Realistic Fiction, Fiction, Mys...",[A bit predictable but a quick and sometimes f...,"[bit predictable quick sometimes fun read, sol...","[0.5106, 0.9873000000000001, 0.9987, 0.8631000...",0.846297,positive,"[bit predictable quick sometimes fun read, sol..."
942,2021,Middle Grade & Children's,1507,City of the Plague God,Sarwat Chadda,4.23,3215,"[Fantasy, Middle Grade, Mythology, Young Adult...","[This was charming and witty, with complex cha...",[charming witty complex character action packe...,"[0.923, 0.7964, 0.802, 0.9559000000000001, 0.9...",0.844848,positive,[charming witty complex characters action pack...
943,2021,Middle Grade & Children's,1418,The Shape of Thunder,Jasmine Warga,4.17,4876,"[Middle Grade, Realistic Fiction, Contemporary...",[It’s Cora’s 12th birthday and she hasn’t spok...,[cora 12th birthday spoken best friend quin ye...,"[-0.9882000000000001, 0.432, 0.993100000000000...",0.338353,positive,[cora 12th birthday spoken best friend quin ye...
944,2021,Middle Grade & Children's,1152,The Last Fallen Star,Graci Kim,4.26,3746,"[Fantasy, Middle Grade, Mythology, Fiction, Ad...",[Why didn’t somebody tell me about this stuff ...,[somebody tell stuff sooner first thought read...,"[0.9987, 0.91, 0.9983000000000001, 0.928700000...",0.568970,positive,[somebody tell stuff sooner first thought read...
