Sarah Bernardo

CS 4120, Spring 2025

In [1]:
import lr_model as lr
import movies_data_utils as mdu
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

# load Wikipedia plot movie data from https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots
NUM_TRAINING_EXAMPLES = 500
NUM_ITERATIONS = 1000
NUM_FEATURES = 5000
LEARNING_RATE = 0.1

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sarahbernardo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sarahbernardo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sarahbernardo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sarahbernardo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load the data
plots, genres = mdu.get_plots_genres('movie_plots.json')

In [3]:
# use sklearn's TfidfVectorizer class to handle pre-processed data
tfidf_vectorizer = TfidfVectorizer(max_features=NUM_FEATURES)
# this will return a sparse matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(plots)
# change the sparse matrix to an array
tfidf_matrix = tfidf_matrix.toarray()

# split data into training/test sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix[:NUM_TRAINING_EXAMPLES], 
                                                    genres[:NUM_TRAINING_EXAMPLES], 
                                                    test_size=0.2, random_state=42)



In [4]:
# function to help pre-processing
def get_nonzero_pct(matrix: np.ndarray, row: int):
    """ 
    Calculates percentage of non-zero elements in a row in a matrix.
    Args:
        matrix (np.ndarray): the matrix
        row (int): row for which you want to calculate the percentage
    """
    mshape = matrix.shape

    non_zero_cnt = np.count_nonzero(matrix[row])
    return non_zero_cnt / mshape[1] *100


In [5]:
# see what the data looks like by printing first 10 elements, corresponding labels, and % of non-zero elements in first of 3 rows of the training set
print('X_train:', X_train[:3])
print('y_train:', y_train[:3])

for i in range(3):
    nonz_pct = get_nonzero_pct(X_train, i)
    print(f'Row {i} is {nonz_pct}% non-zero elements')

X_train: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
y_train: ['Children', 'Documentaries', 'Documentaries']
Row 0 is 0.27999999999999997% non-zero elements
Row 1 is 0.2% non-zero elements
Row 2 is 0.22% non-zero elements


In [6]:
# initialize LR model.
logreg = lr.LogisticRegression(LEARNING_RATE, NUM_ITERATIONS)

# train model and measure how long the process took
start_time = time.time()
logreg.train(X_train, y_train)
end_time = time.time()

print(f'This logistic regression model took {(end_time-start_time)} seconds to train on {NUM_TRAINING_EXAMPLES} training examples for {NUM_ITERATIONS} iterations with {NUM_FEATURES} features.')

This logistic regression model took 5.060145854949951 seconds to train on 500 training examples for 1000 iterations with 5000 features.


In [7]:
# predict the genre for one example from your test set.

plot_example = X_test[0,:].reshape((-1,1))
genre_example = np.array(y_test)[0]

ex_pred = logreg.predict(plot_example)
print('Prediction for example:', ex_pred)

Prediction for example: Comedies


In [8]:
# evaluate Logistic Regression model with accuracy, precision, recall, and f1-score for predictions.
y_pred = []
for plot in X_test:
    plot = plot.reshape((-1,1))
    genre_prediction = logreg.predict(plot)
    y_pred.append(genre_prediction)


In [9]:
from sklearn.metrics import accuracy_score

# accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('accuracy:', accuracy)

# precision score
precision = precision_score(y_test, y_pred, average='weighted')
print('precision:', precision)

# recall score
recall = recall_score(y_test, y_pred, average='weighted')
print('recall:', recall)

# f1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print('f1:', f1)

accuracy: 0.46
precision: 0.4766031746031746
recall: 0.46
f1: 0.4429040598869821


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**This model is less than ideal, as none of these metrics exceed 48%, but it's not too much worse than my Naive Bayes model. It seems most optimal for precision, as the precision measure consistently returns the highest value. It seems that increasing the number of features is most impactful/most helpful, as that increased the measurements by the most. Overall, there is plenty of room for improvement, but this model is better than randomly guessing the genres.**

Summary
----
Fill in the following table for Logistic Regression:

| num training examples | num iterations | num features | accuracy | precision | recall | f1-score | training time |
| - | - | - | - | - | - | - | - |
| 500 | 1000 | 1000 | 0.37 | 0.417 | 0.37 | 0.369 | 1.81s |
| 1000 | 1000 | 1000 | 0.435 | 0.402 | 0.435 | 0.415 | 2.42s |
| 1500 | 1000 | 1000 | 0.45 | 0.437 | 0.45 | 0.438 | 3.85s |
| 500 | 3000 | 1000 | 0.37 | 0.407 | 0.37 | 0.363 | 5.19s |
| 500 | 5000 | 1000 | 0.37 | 0.401 | 0.37 | 0.363 | 6.05s |
| 500 | 1000 | 3000 | 0.44 | 0.464 | 0.44 | 0.425 | 3.86s |
| 500 | 1000 | 5000 | 0.46 | 0.477 | 0.46 | 0.443 | 5.91s |


In [10]:
NUM_TRAINING_EXAMPLES = 5000
NUM_ITERATIONS = 5000
NUM_FEATURES = 10000

final_lr = lr.LogisticRegression(LEARNING_RATE, NUM_ITERATIONS)

# PROVIDED
tfidf_vectorizer = TfidfVectorizer(max_features=NUM_FEATURES)
# this will return a sparse matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(plots)
# change the sparse matrix to an array
tfidf_matrix = tfidf_matrix.toarray()

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix[:NUM_TRAINING_EXAMPLES], 
                                                    genres[:NUM_TRAINING_EXAMPLES], 
                                                    test_size=0.2, random_state=42)

logreg.train(X_train, y_train)

y_pred = []
for plot in X_test:
    plot = plot.reshape((-1,1))
    genre_prediction = logreg.predict(plot)
    y_pred.append(genre_prediction)

# accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('accuracy:', accuracy)

# precision score
precision = precision_score(y_test, y_pred, average='weighted')
print('precision:', precision)

# recall score
recall = recall_score(y_test, y_pred, average='weighted')
print('recall:', recall)

# f1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print('f1:', f1)

mdu.save_predictions(y_test, y_pred, "lr_predictions.txt")

  loss = np.sum(-(y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred))) / num_classes
  loss = np.sum(-(y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred))) / num_classes


accuracy: 0.549
precision: 0.532916106240845
recall: 0.549
f1: 0.5388365909599984


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
