In [1]:
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import datetime

# start time (wall-clock reference used for the elapsed-time report of this cell)
start_time = datetime.datetime.now()

# Load the reviews from the JSON-lines file into a pandas DataFrame.
# Every non-blank line is parsed as one JSON object; only the review text and
# its rating are kept.
reviews = []
# The encoding is passed explicitly: without it open() falls back to the
# platform's default encoding, which is not UTF-8 everywhere and can fail or
# garble review text containing non-ASCII characters.
with open('goodreads_reviews_young_adult.json', 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # tolerate blank lines instead of letting json.loads raise on them
            continue
        review = json.loads(line)
        reviews.append((review['review_text'], review['rating']))
df = pd.DataFrame(reviews, columns=['text', 'rating'])

# define a function to map ratings to sentiment labels
def map_rating_to_sentiment(rating):
    """Collapse a numeric rating into a binary sentiment label.

    Returns 'positive' when ``rating`` is 3 or higher and 'negative' for
    every other value.
    """
    return 'positive' if rating >= 3 else 'negative'

# derive the binary sentiment label of every review from its rating
df['sentiment'] = df['rating'].apply(map_rating_to_sentiment)

# hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# bag-of-words features: the vocabulary is learned from the training texts only
# and then re-used unchanged to encode the test texts
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit a multinomial naive Bayes classifier on the training split
# (fit() returns the estimator, so construction and fitting are chained)
clf = MultinomialNB().fit(X_train_vec, y_train)

# score the classifier on the held-out split and report it as a percentage
accuracy = clf.score(X_test_vec, y_test)
print("Accuracy of the classifier on the testing set: {:.2f}%".format(accuracy * 100))

# wall-clock time elapsed since start_time was recorded
end_time = datetime.datetime.now()
delta_time = end_time - start_time
print(f"Elapsed time: {delta_time.total_seconds()} seconds")

Accuracy of the classifier on the testing set: 86.83%
Elapsed time: 373.895737 seconds


In [2]:
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import datetime

# Load the reviews from the JSON-lines file into a pandas DataFrame.
# Every non-blank line is parsed as one JSON object; the review text, its
# rating and the id of the reviewed book are kept.
reviews = []
# The encoding is passed explicitly: without it open() falls back to the
# platform's default encoding, which is not UTF-8 everywhere and can fail or
# garble review text containing non-ASCII characters.
with open('goodreads_reviews_young_adult.json', 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # tolerate blank lines instead of letting json.loads raise on them
            continue
        review = json.loads(line)
        reviews.append((review['review_text'], review['rating'], review['book_id']))
df = pd.DataFrame(reviews, columns=['text', 'rating', 'book_id'])

# define a function to map ratings to sentiment labels
def map_rating_to_sentiment(rating):
    """Label a rating as 'positive' (3 or above) or 'negative' (anything else)."""
    label = 'negative'
    if rating >= 3:
        label = 'positive'
    return label

# derive the binary sentiment label of every review from its rating
df['sentiment'] = df['rating'].apply(map_rating_to_sentiment)

# bag-of-words features learned from (and computed for) every review text
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(df['text'])

# fit a multinomial naive Bayes classifier on the entire dataset
# (fit() returns the estimator, so construction and fitting are chained)
clf = MultinomialNB().fit(X_vec, df['sentiment'])

# define a function to predict the sentiment of a book's reviews
def predict_sentiment(book_id):
    """Return the majority predicted sentiment of one book's reviews.

    Uses the module-level ``df`` (review table), ``vectorizer`` (fitted text
    vectorizer) and ``clf`` (fitted classifier).

    Parameters
    ----------
    book_id
        Value compared for equality against ``df['book_id']``.

    Returns
    -------
    str
        'positive' or 'negative' for a clear majority of predictions,
        'neutral' when both labels are predicted equally often.

    Raises
    ------
    ValueError
        If ``df`` contains no review for ``book_id``.
    """
    # select the review texts of the requested book
    book_reviews = df.loc[df['book_id'] == book_id, 'text']
    if book_reviews.empty:
        # Fail early with a clear message: scikit-learn's input validation
        # rejects zero-sample input, so predicting on an empty selection would
        # only surface as a much less helpful error further down.
        raise ValueError(f"No reviews found for book_id {book_id!r}.")

    # vectorize the reviews using the same vectorizer used for training
    reviews_vec = vectorizer.transform(book_reviews)
    predictions = clf.predict(reviews_vec)

    # count each label with a vectorized reduction rather than Python's sum()
    pos_count = int((predictions == 'positive').sum())
    neg_count = int((predictions == 'negative').sum())

    # overall sentiment is the majority label; a tie is reported as neutral
    if pos_count > neg_count:
        return 'positive'
    if neg_count > pos_count:
        return 'negative'
    return 'neutral'

# example usage: classify the reviews of a single book and report the majority label.
# NOTE(review): the id is passed as a string; presumably book_id is stored as a
# string in the JSON file — confirm, since an int/str mismatch would match no rows.
book_id = '8755776'
sentiment = predict_sentiment(book_id)
print(f"The sentiment of the reviews for book {book_id} is {sentiment}.")

The sentiment of the reviews for book 8755776 is positive.


In [3]:
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import datetime

# Load the reviews from the JSON-lines file into a pandas DataFrame.
# Every non-blank line is parsed as one JSON object; the id of the reviewed
# book, the review text and its rating are kept.
reviews = []
# The encoding is passed explicitly: without it open() falls back to the
# platform's default encoding, which is not UTF-8 everywhere and can fail or
# garble review text containing non-ASCII characters.
with open('goodreads_reviews_young_adult.json', 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # tolerate blank lines instead of letting json.loads raise on them
            continue
        review = json.loads(line)
        reviews.append((review['book_id'], review['review_text'], review['rating']))
df = pd.DataFrame(reviews, columns=['book_id', 'text', 'rating'])

# define a function to map ratings to sentiment labels
def map_rating_to_sentiment(rating):
    """Translate a rating into the sentiment label used for training.

    A rating of 3 or more yields 'positive'; any other value yields 'negative'.
    """
    is_positive = rating >= 3
    if is_positive:
        return 'positive'
    return 'negative'

# map the ratings to sentiment labels
df['sentiment'] = df['rating'].apply(map_rating_to_sentiment)

# prompt the user to input a book_id; surrounding whitespace is stripped so an
# accidental leading/trailing space does not make the lookup miss
book_id = input("Enter a book_id: ").strip()

# filter the reviews for the given book_id
reviews = df.loc[df['book_id'] == book_id, 'text']

# vectorize the text data using a bag-of-words model
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(df['text'])

# train a naive bayes classifier on the entire dataset
clf = MultinomialNB()
clf.fit(X_vec, df['sentiment'])

# determine the overall sentiment of the reviews for the given book_id
if reviews.empty:
    # This check must come BEFORE prediction: scikit-learn's input validation
    # rejects zero-sample input, so predicting first would raise and this
    # message could never be printed.
    print("No reviews found for the given book_id.")
else:
    # predict the sentiment of each review of the requested book
    X_test_vec = vectorizer.transform(reviews)
    sentiments = clf.predict(X_test_vec)

    # count each label with a vectorized reduction rather than Python's sum()
    positive_count = int((sentiments == 'positive').sum())
    negative_count = int((sentiments == 'negative').sum())
    if positive_count > negative_count:
        print("The reviews for the given book_id are generally positive.")
    elif positive_count < negative_count:
        print("The reviews for the given book_id are generally negative.")
    else:
        print("The reviews for the given book_id are mixed.")

Enter a book_id: 33807229
The reviews for the given book_id are generally positive.


In [2]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import datetime

# start time (wall-clock reference used for the elapsed-time report of this cell)
start_time = datetime.datetime.now()

# Load the reviews from the JSON-lines file into a pandas DataFrame.
# Every non-blank line is parsed as one JSON object; only the review text and
# its rating are kept.
reviews = []
# The encoding is passed explicitly: without it open() falls back to the
# platform's default encoding, which is not UTF-8 everywhere and can fail or
# garble review text containing non-ASCII characters.
with open('goodreads_reviews_young_adult.json', 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # tolerate blank lines instead of letting json.loads raise on them
            continue
        review = json.loads(line)
        reviews.append((review['review_text'], review['rating']))
df = pd.DataFrame(reviews, columns=['text', 'rating'])

# define a function to map ratings to sentiment labels
def map_rating_to_sentiment(rating):
    """Pick the sentiment label for a rating: 'positive' at 3 or above, else 'negative'."""
    labels = ('negative', 'positive')
    # bool() yields 0 or 1, which selects the matching entry of the pair
    return labels[bool(rating >= 3)]

# derive the binary sentiment label of every review from its rating
df['sentiment'] = df['rating'].apply(map_rating_to_sentiment)

# hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: vocabulary and document frequencies are learned from the
# training texts only and then re-used unchanged to encode the test texts
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit a multinomial naive Bayes classifier on the training split
# (fit() returns the estimator, so construction and fitting are chained)
clf = MultinomialNB().fit(X_train_vec, y_train)

# score the classifier on the held-out split and report it as a percentage
accuracy = clf.score(X_test_vec, y_test)
print("Accuracy of the classifier on the testing set: {:.2f}%".format(accuracy * 100))

# wall-clock time elapsed since start_time was recorded
end_time = datetime.datetime.now()
delta_time = end_time - start_time
print(f"Elapsed time: {delta_time.total_seconds()} seconds")

Accuracy of the classifier on the testing set: 86.27%
Elapsed time: 2436.467141 seconds
