Sarah Bernardo

CS 4120, Spring 2025

In [1]:
# import model from the nb_model.py file and all other relevant packages
import nb_model as nb
import time
import movies_data_utils as mdu
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Number of examples taken from the start of the dataset *before* the
# train/test split (despite the name, this is not the training-set size):
# the cells below slice plots/genres to this length and then hold out 20%
# of that slice for testing.
NUM_TRAINING_EXAMPLES = 625

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sarahbernardo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sarahbernardo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sarahbernardo/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sarahbernardo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the Wikipedia movie-plot data (source:
# https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots)
# using the helper's default preprocessing options.
plots, genres = mdu.get_plots_genres('movie_plots.json')

# Restrict to the first NUM_TRAINING_EXAMPLES examples, then hold out 20% of
# them as the test set (fixed random_state keeps the split reproducible).
subset_plots = plots[:NUM_TRAINING_EXAMPLES]
subset_genres = genres[:NUM_TRAINING_EXAMPLES]
X_train, X_test, y_train, y_test = train_test_split(
    subset_plots,
    subset_genres,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Peek at the data: show the first 3 training plots alongside their labels
for split_name, split_values in (('X_train', X_train), ('y_train', y_train)):
    print(f'{split_name}:', split_values[:3])

# Report the size of every split as an (n_examples, 1) tuple
all_splits = (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
)
for split_name, split_values in all_splits:
    print(f'{split_name} shape:', (len(split_values), 1))

X_train: ['amateur ghost hunter visit abandoned house investigation turn massacre leaving question detective psychologist', 'youtuber becomes obsessed figuring copycat archnemesis manages steal idea', 'girlfriend learns truth murky past con artist forced examine choice get root real identity']
y_train: ['Horror', 'Comedies', 'Comedies']
X_train shape: (500, 1)
y_train shape: (500, 1)
X_test shape: (125, 1)
y_test shape: (125, 1)


In [4]:
# initialize classifier object
movie_nb = nb.NaiveBayesClassifier()

# Train the model on the training split and time how long it takes.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring short intervals, while
# time.time() can have coarse resolution and can jump if the system clock
# is adjusted -- which matters for a run that lasts only a few milliseconds.
start_time = time.perf_counter()
movie_nb.train(X_train, y_train)
end_time = time.perf_counter()

print('Training this model took', end_time-start_time, 'seconds.')

Training this model took 0.0047647953033447266 seconds.


#### Evaluation

- **Accuracy**: Fraction of all predictions that are correct.
- **Precision**: Percent of positive predictions that are truly positive.
- **Recall**: Percentage of truly positive values that model predicts to be positive.
- **F1 score**: The harmonic mean of precision and recall, combining both into a single score.

Evaluate the Naive Bayes model for accuracy, precision, recall, and F1 scores with different combinations of preprocessing.

In [5]:
# Predict a genre for every plot in the test split, preserving test order
y_pred = [movie_nb.predict(test_plot) for test_plot in X_test]

In [6]:
# STOP WORDS REMOVED, LEMMATIZED
# Evaluate the predictions made by movie_nb (y_pred from the previous cell)
# against the held-out test labels.
print('STOP WORDS REMOVED, LEMMATIZED')

print('Vocab size:', movie_nb.get_vocab_size())

# accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('accuracy:', accuracy)

# The per-class metrics below are averaged with weights proportional to each
# class's support (average='weighted'). When a class has no predicted (or no
# true) samples its score is undefined; zero_division=0 scores it as 0 --
# the same value scikit-learn's default uses -- without raising the
# UndefinedMetricWarning on every call.

# precision score
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
print('precision:', precision)

# recall score
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
print('recall:', recall)

# f1 score
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print('f1:', f1)

STOP WORDS REMOVED, LEMMATIZED
Vocab size: 3544
accuracy: 0.408
precision: 0.38597460317460314
recall: 0.408
f1: 0.38307070707070706


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# NO STOP WORDS REMOVED, LEMMATIZED
print('NO STOP WORDS REMOVED, LEMMATIZED')

# Reload the data with stop_words=None (no stop-word removal); the lemmatizer
# argument is left at the helper's default.
# NOTE: this rebinds plots/genres and the X_*/y_* splits used by earlier
# cells, so re-running an earlier evaluation cell after this one would use
# this cell's data.
plots, genres = mdu.get_plots_genres('movie_plots.json', stop_words=None)
X_train, X_test, y_train, y_test = train_test_split(
    plots[:NUM_TRAINING_EXAMPLES],
    genres[:NUM_TRAINING_EXAMPLES],
    test_size=0.2,
    random_state=42,
)

no_stop_nb = nb.NaiveBayesClassifier()

# Time training with perf_counter(): a monotonic, high-resolution clock
# suited to millisecond-scale intervals (time.time() can be coarse and can
# jump if the system clock is adjusted).
start_time = time.perf_counter()
no_stop_nb.train(X_train, y_train)
end_time = time.perf_counter()

print('Training this model took', end_time-start_time, 'seconds.')

print('Vocab size:', no_stop_nb.get_vocab_size())

# predict a genre for every plot in the test split
y_pred = [no_stop_nb.predict(plot) for plot in X_test]

# accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('accuracy:', accuracy)

# zero_division=0 scores classes with an undefined metric (e.g. a genre that
# is never predicted) as 0 -- the same value the default uses -- without
# raising UndefinedMetricWarning.

# precision score
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
print('precision:', precision)

# recall score
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
print('recall:', recall)

# f1 score
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print('f1:', f1)

NO STOP WORDS REMOVED, LEMMATIZED
Training this model took 0.005822181701660156 seconds.
Vocab size: 3646
accuracy: 0.392
precision: 0.47259376688884885
recall: 0.392
f1: 0.34647116324535676


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# STOP WORDS REMOVED, NOT LEMMATIZED
print('STOP WORDS REMOVED, NOT LEMMATIZED')

# Reload the data with lemmatizer=None (no lemmatization); the stop_words
# argument is left at the helper's default.
# NOTE: this rebinds plots/genres and the X_*/y_* splits used by earlier
# cells, so re-running an earlier evaluation cell after this one would use
# this cell's data.
plots, genres = mdu.get_plots_genres('movie_plots.json', lemmatizer=None)
X_train, X_test, y_train, y_test = train_test_split(
    plots[:NUM_TRAINING_EXAMPLES],
    genres[:NUM_TRAINING_EXAMPLES],
    test_size=0.2,
    random_state=42,
)

no_lem_nb = nb.NaiveBayesClassifier()

# Time training with perf_counter(): a monotonic, high-resolution clock
# suited to millisecond-scale intervals (time.time() can be coarse and can
# jump if the system clock is adjusted).
start_time = time.perf_counter()
no_lem_nb.train(X_train, y_train)
end_time = time.perf_counter()

print('Training this model took', end_time-start_time, 'seconds.')

print('Vocab size:', no_lem_nb.get_vocab_size())

# predict a genre for every plot in the test split
y_pred = [no_lem_nb.predict(plot) for plot in X_test]

# accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('accuracy:', accuracy)

# zero_division=0 scores classes with an undefined metric (e.g. a genre that
# is never predicted) as 0 -- the same value the default uses -- without
# raising UndefinedMetricWarning.

# precision score
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
print('precision:', precision)

# recall score
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
print('recall:', recall)

# f1 score
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print('f1:', f1)

STOP WORDS REMOVED, NOT LEMMATIZED
Training this model took 0.004684925079345703 seconds.
Vocab size: 3858
accuracy: 0.44
precision: 0.4003618242222894
recall: 0.44
f1: 0.41282500797448163


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# NO STOP WORDS REMOVED, NOT LEMMATIZED
print('NO STOP WORDS REMOVED, NOT LEMMATIZED')

# Reload the data with both preprocessing steps turned off
# (lemmatizer=None, stop_words=None).
# NOTE: this rebinds plots/genres and the X_*/y_* splits used by earlier
# cells, so re-running an earlier evaluation cell after this one would use
# this cell's data.
plots, genres = mdu.get_plots_genres(
    'movie_plots.json',
    lemmatizer=None,
    stop_words=None,
)
X_train, X_test, y_train, y_test = train_test_split(
    plots[:NUM_TRAINING_EXAMPLES],
    genres[:NUM_TRAINING_EXAMPLES],
    test_size=0.2,
    random_state=42,
)

none_nb = nb.NaiveBayesClassifier()

# Time training with perf_counter(): a monotonic, high-resolution clock
# suited to millisecond-scale intervals (time.time() can be coarse and can
# jump if the system clock is adjusted).
start_time = time.perf_counter()
none_nb.train(X_train, y_train)
end_time = time.perf_counter()

print('Training this model took', end_time-start_time, 'seconds.')

print('Vocab size:', none_nb.get_vocab_size())

# predict a genre for every plot in the test split
y_pred = [none_nb.predict(plot) for plot in X_test]

# accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('accuracy:', accuracy)

# zero_division=0 scores classes with an undefined metric (e.g. a genre that
# is never predicted) as 0 -- the same value the default uses -- without
# raising UndefinedMetricWarning.

# precision score
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
print('precision:', precision)

# recall score
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
print('recall:', recall)

# f1 score
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print('f1:', f1)

NO STOP WORDS REMOVED, NOT LEMMATIZED
Training this model took 0.0074617862701416016 seconds.
Vocab size: 3965
accuracy: 0.384
precision: 0.412556586270872
recall: 0.384
f1: 0.3248672268907563


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**These results show that this model is not very good at predicting the genres of these movies, but tweaking the preprocessing strategy can improve some metrics, such as accuracy and recall, by up to about 5 percentage points. The model trains much faster than expected, but all of its accuracy, precision, recall, and F1 scores are below 50%. It tends to score highest on accuracy and precision, while its F1 scores are generally the lowest of the four metrics. Still, with 15 different classes, random guessing would give only about 6.67% accuracy, so the model performs much better than chance.**

Summary questions:
-----

| preprocessing strategies | accuracy | precision | recall | f1-score | vocabulary size | training time (s) |
| - | - | - | - | - | - | - |
| No lemmatizing or stop word removal | 0.384 | 0.413 | 0.384 | 0.325 | 3,965 | 6.34e-3 |
| Lemmatizing only | 0.392 | 0.473 | 0.392 | 0.346 | 3,646 | 5.97e-3 |
| Stop word removal only | 0.44 | 0.4 | 0.44 | 0.413 | 3,858 | 4.81e-3 |
| Both lemmatizing and stop word removal | 0.408 | 0.386 | 0.408 | 0.383 | 3,544 | 5.23e-3 |

<p></p>
<p></p>
