# Generate Synthetic Data from a Naive Bayes Model

Use the probability distribution from the Naive Bayes model trained on IMDB data to generate synthetic synopses labeled by genre, save them to CSV, and split them into train/validation/test sets.

## Imports

In [41]:
import os
import sys
import pandas as pd

sys.path.append("src")

from models.naive_bayes import NaiveBayesClassifier
from utils.nb_synthetic_data import generate_synthetic_from_nb
from sklearn.model_selection import train_test_split

## Load Model

In [18]:
# Project root. The original machine-specific location is kept as the default so
# existing runs behave the same; set the FILM_GENRE_BASE_DIR environment variable
# to point the notebook at a checkout elsewhere.
BASE_DIR = os.environ.get(
    "FILM_GENRE_BASE_DIR",
    r"C:\Users\rebec\Duke\ECE684\FilmGenreClassification",
)

# Pass each path component separately so os.path.join inserts the separator for
# the current OS instead of relying on hard-coded backslashes.
MODEL_PATH = os.path.join(
    BASE_DIR, "src", "models", "saved_models", "nb_imdb_arh_trimmed.pkl"
)

# NOTE(review): the ".pkl" extension suggests the loader unpickles the file -
# confirm, and only load model files from a trusted source.
nb_model = NaiveBayesClassifier.load(MODEL_PATH)

## Generate Synthetic Data

In [37]:
# Sampling configuration for the synthetic corpus.
SAMPLES_PER_GENRE = 20_000
MIN_LEN, MAX_LEN = 40, 120  # min / max tokens per synthetic description

# Collect the generator arguments in one place, then expand them into the call.
generation_kwargs = dict(
    nb_model=nb_model,
    samples_per_genre=SAMPLES_PER_GENRE,
    min_len=MIN_LEN,
    max_len=MAX_LEN,
    balanced_genres=True,  # same number of samples per genre
    random_seed=42,
)
synthetic_df = generate_synthetic_from_nb(**generation_kwargs)

# Preview the first rows of the generated frame.
synthetic_df.head()

Unnamed: 0,synopsis,genre
0,sides lands based capture plots two money ambi...,action
1,traveling brothers de world philly efforts sup...,action
2,lax unhinged fiancee cache custody manneken wo...,action
3,lieutenant four wukong cia magistrates forward...,action
4,costs joe defeat shape alien evidence appear c...,action


## Save Synthetic Data to CSV

In [39]:
# Destination for the full synthetic dataset. Components are passed separately so
# os.path.join uses the correct separator on every OS (the result on Windows is
# the same path as before).
OUTPUT_CSV = os.path.join(
    BASE_DIR, "data", "imdb_arh_synthetic", "imdb_arh_synthetic_dataset.csv"
)

# Create the output folder if it does not exist yet; exist_ok makes re-runs safe.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

# index=False keeps the DataFrame's row index out of the CSV.
synthetic_df.to_csv(OUTPUT_CSV, index=False)

print(f"Synthetic data saved to: {OUTPUT_CSV}")
print(f"Total rows: {len(synthetic_df)}")

Synthetic data saved to: C:\Users\rebec\Duke\ECE684\FilmGenreClassification\data\imdb_arh_synthetic\imdb_arh_synthetic_dataset.csv
Total rows: 60000


## Make Train/Val/Test Splits

In [49]:
# 70/15/15 train/val/test split, done in two stratified passes.

# Pass 1: keep 70% for training and hold out the remaining 30%.
train_df, temp_df = train_test_split(
    synthetic_df,
    stratify=synthetic_df["genre"],
    test_size=0.3,
    random_state=42,
)

# Pass 2: cut the held-out 30% in half -> 15% validation, 15% test.
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df["genre"],
    test_size=0.5,
    random_state=42,
)

In [55]:
# Pair each split with its display label so both reports share one definition.
labeled_splits = (("Train", train_df), ("Val", val_df), ("Test", test_df))

# Row counts for each split.
for split_label, split_frame in labeled_splits:
    print(f"{split_label} size:", len(split_frame))

# Per-genre counts for each split, each preceded by a blank line and a heading.
for split_label, split_frame in labeled_splits:
    print(f"\n{split_label} genre distribution:")
    print(split_frame["genre"].value_counts())

Train size: 42000
Val size: 9000
Test size: 9000

Train genre distribution:
genre
romance    14000
action     14000
horror     14000
Name: count, dtype: int64

Val genre distribution:
genre
horror     3000
romance    3000
action     3000
Name: count, dtype: int64

Test genre distribution:
genre
horror     3000
romance    3000
action     3000
Name: count, dtype: int64


## Save Train/Val/Test Splits to CSV

In [58]:
# Output locations for the train/val/test CSVs (the files themselves are written
# in a later cell). Paths are assembled from separate components so os.path.join
# chooses the separator for the current OS rather than hard-coding backslashes.
SYNTHETIC_DIR = os.path.join(BASE_DIR, "data", "imdb_arh_synthetic")
TRAIN_PATH = os.path.join(SYNTHETIC_DIR, "imdb_arh_synthetic_train.csv")
VAL_PATH = os.path.join(SYNTHETIC_DIR, "imdb_arh_synthetic_val.csv")
TEST_PATH = os.path.join(SYNTHETIC_DIR, "imdb_arh_synthetic_test.csv")

In [66]:
# Write each split to its CSV. The target directory is created here as well, so
# this cell does not depend on an earlier cell having made it.
for split_df, split_path in (
    (train_df, TRAIN_PATH),
    (val_df, VAL_PATH),
    (test_df, TEST_PATH),
):
    split_dir = os.path.dirname(split_path)
    if split_dir:  # os.makedirs("") raises, so skip when the path has no directory part
        os.makedirs(split_dir, exist_ok=True)
    # index=False keeps the DataFrame's row index out of the CSV.
    split_df.to_csv(split_path, index=False)

# Plain string: the message has no placeholders, so no f-string is needed.
print("\nSaved CSVs to data/imdb_arh_synthetic/")


Saved CSVs to data/imdb_arh_synthetic/
