In [142]:
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import datetime

# start time
start_time = datetime.datetime.now()

# Load the reviews from the JSON Lines file (one JSON object per line) into a
# pandas DataFrame, keeping only the review text and its rating.
reviews = []
# Explicit UTF-8: without it, open() falls back to the platform's locale
# encoding, which can fail on or mis-decode non-ASCII review text.
with open('goodreads_reviews_young_adult.json', 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line) rather than letting
        # json.loads raise on them.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append((review['review_text'], review['rating']))
df = pd.DataFrame(reviews, columns=['text', 'rating'])

# Binary sentiment labelling used to build the classification target.
def map_rating_to_sentiment(rating):
    """Return the sentiment label for a numeric rating.

    A rating of 4 or higher is labelled 'positive'; any other value is
    labelled 'negative'.
    """
    return 'positive' if rating >= 4 else 'negative'

# Derive the sentiment label of every review from its rating.
df['sentiment'] = df['rating'].map(map_rating_to_sentiment)

# Hold out 20% of the reviews for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Vectorize the text with a bag-of-words model (English stop words removed).
# The vocabulary is fitted on the training text only; the test text is then
# encoded with that same fitted vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit a multinomial naive Bayes classifier on the training count vectors.
clf = MultinomialNB().fit(X_train_vec, y_train)

# Score the classifier on the held-out test vectors and report the accuracy
# as a percentage.
accuracy = clf.score(X_test_vec, y_test)
print("Accuracy of the classifier on the testing set: {:.2f}%".format(accuracy * 100))

# end time (wall-clock timestamp, paired with `start_time` taken at the top
# of this cell)
end_time = datetime.datetime.now()

# total time: the span covers everything run between the two timestamps
delta_time = end_time - start_time
print(f"Elapsed time: {delta_time.total_seconds()} seconds")

Accuracy of the classifier on the testing set: 77.05%
Elapsed time: 354.564974 seconds


In [155]:
from sklearn.metrics import confusion_matrix

# Predict a sentiment label for every review in the test set.
y_pred = clf.predict(X_test_vec)

# Build the confusion matrix: rows are true labels and columns are predicted
# labels, both ordered as given in `labels` (positive first, then negative).
conf_matrix = confusion_matrix(
    y_test,
    y_pred,
    labels=['positive', 'negative'],
)

print("Confusion Matrix:\n", conf_matrix)


Confusion Matrix:
 [[270966  46195]
 [ 63480  97339]]


In [143]:
# Display the vectorizer's repr to check its configuration before pickling.
vectorizer

CountVectorizer(stop_words='english')

In [144]:
import pickle

# Persist the fitted vectorizer to disk so it can be reused without refitting.
with open("vectorizer.pkl", mode='wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

In [145]:
# Reload the pickled vectorizer from disk. Unpickling can execute arbitrary
# code, so only load pickle files that come from a trusted source.
with open('vectorizer.pkl', mode='rb') as vec_file:
    vectorizer2 = pickle.load(vec_file)

In [146]:
# Display the reloaded vectorizer's repr to compare it with the original's.
vectorizer2

CountVectorizer(stop_words='english')

In [147]:
# A single hand-written review used as a smoke test; it is wrapped in a list
# because it is passed to the vectorizers' `transform` in the cells below.
review = ["This is a boring book!"]

In [148]:
# Encode the sample review with the vectorizer reloaded from the pickle file.
revVec = vectorizer2.transform(review)

In [149]:
# Encode the same sample review with the original in-memory vectorizer.
revVec2 = vectorizer.transform(review)

In [150]:
# Predict with the original classifier on the features produced by the
# original vectorizer.
clf.predict(revVec2)

array(['negative'], dtype='<U8')

In [151]:
# Persist the trained classifier to disk so it can be reused without retraining.
with open("model.pkl", mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [152]:
# Reload the pickled classifier from disk. Unpickling can execute arbitrary
# code, so only load pickle files that come from a trusted source.
with open('model.pkl', mode='rb') as model_file:
    clf2 = pickle.load(model_file)

In [153]:
# Display the reloaded classifier's repr.
clf2

MultinomialNB()

In [154]:
# Predict with the reloaded classifier on the features produced by the
# reloaded vectorizer, exercising the full pickle round trip.
clf2.predict(revVec)

array(['negative'], dtype='<U8')