In [1]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import json
import base64

In [28]:
# Load the precomputed episode metadata and TF-IDF artifacts from disk.
# NOTE(review): json_numpy_obj_hook must already be defined in the kernel when
# this cell runs; on a fresh kernel, run the cell that defines it first.
with open("episodes.json") as f:
    episodes = json.load(f)
with open("episode_id_to_idx.json") as f:
    episode_id_to_idx = json.load(f)
with open("genre_to_episodes.json") as f:
    genre_to_episodes = json.load(f)
# The text encoding is a property of the file handle: json.load() no longer
# accepts an `encoding` keyword (removed in Python 3.9, where passing it
# raises TypeError), so it is given to open() instead.
with open("tf_idf_description.json", encoding='utf8') as f:
    tf_idf_desc_vectors = json.load(f, object_hook=json_numpy_obj_hook)
with open("idf_description.json", encoding='utf8') as f:
    idf_description = json.load(f, object_hook=json_numpy_obj_hook)
with open("terms_description.json") as f:
    terms_description = json.load(f)

In [3]:
def json_numpy_obj_hook(dct):
    """JSON ``object_hook`` that rebuilds numpy arrays from their encoded form.

    A dict containing an ``'__ndarray__'`` key is treated as an encoded array:
    its base64 payload is decoded and interpreted using the dict's ``'dtype'``
    and ``'shape'`` entries. Any other input is passed through untouched.

    :param dct: (dict) object produced by the json decoder
    :return: (ndarray) if input was an encoded ndarray, otherwise ``dct`` itself
    """
    # Guard clause: anything that is not an encoded-array dict goes back as-is.
    if not isinstance(dct, dict) or '__ndarray__' not in dct:
        return dct
    raw_bytes = base64.b64decode(dct['__ndarray__'])
    flat = np.frombuffer(raw_bytes, dct['dtype'])
    return flat.reshape(dct['shape'])

In [43]:
# Give every genre an integer class label, numbered in the key order of
# genre_to_episodes.
genre_to_idx = dict(zip(genre_to_episodes, range(len(genre_to_episodes))))
print(genre_to_idx)

{'Arts & Entertainment': 0, 'Business & Technology': 1, 'Comedy': 2, 'True & Crime': 3, 'Stories': 4, 'Educational': 5, 'Fiction': 6, 'Kids & Family': 7, 'Games': 8, 'Leisure': 9, 'Lifestyle & Health': 10, 'History': 11, 'Music': 12, 'News & Politics': 13, 'Religion & Spirituality': 14, 'Science': 15, 'Society & Culture': 16, 'Sports & Recreation': 17, 'Tv & Film': 18, 'Technology': 19}


In [32]:
# Assemble the training set from episodes tagged with exactly one genre, so
# every sample has a single, unambiguous class label.
feature_rows, labels = [], []
for episode_id, episode in episodes.items():
    genres = episode["genres"]
    if len(genres) != 1:
        continue  # skip episodes with zero or several genres
    feature_rows.append(tf_idf_desc_vectors[episode_id_to_idx[episode_id]])
    labels.append(genre_to_idx[genres[0]])
X = np.array(feature_rows)
Y = np.array(labels)

In [33]:
# Sanity check: feature-matrix shape, then label-vector shape, one per line.
print(X.shape, Y.shape, sep="\n")

(1103, 13715)
(1103,)


In [34]:
# Fit a Gaussian Naive Bayes classifier on the feature matrix X and the
# integer genre labels Y, then persist the fitted model to disk.
gnb = GaussianNB()
model = gnb.fit(X, Y)
# Use a context manager so the file is flushed and closed deterministically;
# a bare open() passed to pickle.dump leaves the handle open until it is
# garbage-collected.
with open("genre_classification_model.sav", 'wb') as model_file:
    pickle.dump(model, model_file)

In [42]:
# Reload the persisted classifier and predict the genre of a sample query.
# NOTE: unpickling executes arbitrary code; only load model files you created
# yourself or otherwise trust.
with open("genre_classification_model.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Count the query's terms against the fixed description vocabulary so the
# columns are indexed by terms_description.
# NOTE(review): presumably this is the same vocabulary the stored TF-IDF
# vectors were built with — confirm.
episode_desc_vectorizer = CountVectorizer(vocabulary=terms_description)
query_vec = episode_desc_vectorizer.fit_transform(["american toad"]).toarray().flatten()
# Weight the raw term counts by the stored IDF values; the added leading axis
# gives predict() a 2-D input (assumes idf_description is 1-D — TODO confirm).
query_tf_idf = query_vec*idf_description[np.newaxis, :]
prediction = loaded_model.predict(query_tf_idf)
# genre_to_idx assigns labels in key order, so listing its keys maps a
# predicted label back to the genre name.
print(list(genre_to_idx)[prediction[0]])

[16]
Society & Culture
