# Metacritic Score Datasets (Feb 2023) - Pre-processing

## Load Packages and Functions

In [1]:
import numpy as np
import pandas as pd
from word2number import w2n
import pickle

In [2]:
import sys
from pathlib import Path

# Make the project's helper module importable without a machine-specific absolute path.
# The notebook already reaches the repo's data and model folders via '../../data' and
# '../../models', so the source folder is located the same way.
# NOTE(review): assumes `src/` sits next to `data/` and `models/` at the repo root — confirm.
SRC_DIR = str(Path('../../src').resolve())
if SRC_DIR not in sys.path:  # keep re-runs of this cell from appending duplicates
    sys.path.append(SRC_DIR)

# The star import is kept on purpose: later cells use names it brings into scope
# (count_* helpers, remove_stopwords, lemmatizer, remove_numwords, bertmodel_prep,
# get_topic_preds, and the `sentence_transformers` module).
from language_processing import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shantellesmith/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shantellesmith/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Optional one-time NLTK resource downloads, left disabled.
# The output of the `language_processing` import above already reports the
# 'stopwords' and 'wordnet' packages as up to date; 'punkt' is not mentioned there.
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

## Load Data and Check Content/Size

In [3]:
# Read the three raw CSV exports (movies, games, TV) into their own frames.
df_movies, df_games, df_tv = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/movies.csv',
        '../../data/raw/games.csv',
        '../../data/raw/tv.csv',
    )
)
# df_music = pd.read_csv('/home/jupyter-shantelle/music.csv')

In [4]:
# Remove the 'id' column from the movies frame.
df_movies = df_movies.drop(columns='id')

In [5]:
# Turn the 'tbd' placeholder into NaN so the whole column can be cast to float.
df_movies = df_movies.assign(
    user_score=lambda frame: frame['user_score'].replace('tbd', np.nan).astype('float')
)

In [6]:
# Collapse misspelled / synonymous rating labels onto one canonical spelling.
df_movies['rating'] = df_movies['rating'].replace({
    'PG--13': 'PG-13',
    'PG-13`': 'PG-13',
    'NR': 'Not Rated',
    'Unrated': 'Not Rated',
    'X': 'NC-17',  # 'X' is an outdated rating replaced with 'NC-17'
})
# Show the remaining distinct labels as a sanity check.
df_movies['rating'].unique()

array(['TV-14', 'PG-13', 'G', 'TV-PG', 'PG', 'R', 'Passed', 'Not Rated',
       'TV-G', 'TV-MA', 'Approved', nan, 'GP', 'M', 'M/PG', 'NC-17',
       'Open', 'MA-17', 'TV-Y7', 'TV-Y7-FV'], dtype=object)

In [7]:
# Blank out the non-date placeholders, then parse what is left as datetimes.
df_movies['release_date'] = pd.to_datetime(
    df_movies['release_date'].replace(['TBA', 'TBD 2022 or 2023'], np.nan)
)

In [8]:
# Derive calendar features (year, month, quarter, day-of-year) from the parsed release date.
df_movies = df_movies.assign(
    release_year=lambda frame: frame['release_date'].dt.year,
    release_month=lambda frame: frame['release_date'].dt.month,
    release_quarter=lambda frame: frame['release_date'].dt.quarter,
    release_yearday=lambda frame: frame['release_date'].dt.dayofyear,
)

In [9]:
# Replace every remaining missing value in the frame with the sentinel -1.
# This applies to all columns, so text columns such as 'summary' receive -1 as well.
df_movies = df_movies.fillna(-1)

In [10]:
# Character counts. 'title' is used as-is; 'summary' is cast to str first.
df_movies['title_len'] = list(map(len, df_movies['title']))
df_movies['summary_len'] = list(map(len, df_movies['summary'].astype('str')))

# Whitespace-delimited word counts.
df_movies['title_wordlen'] = list(
    map(lambda text: len(text.split()), df_movies['title'])
)
df_movies['summary_wordlen'] = list(
    map(lambda text: len(text.split()), df_movies['summary'].astype('str'))
)

# Characters per word (total characters, spaces included, divided by word count).
df_movies['title_avg_wordlength'] = df_movies['title_len'].div(df_movies['title_wordlen'])
df_movies['summary_avg_wordlength'] = df_movies['summary_len'].div(df_movies['summary_wordlen'])

In [11]:
# Per-row result of the project helper count_capital_words for title and summary.
df_movies['title_capcount'] = list(map(count_capital_words, df_movies['title']))
df_movies['summary_capcount'] = list(
    map(count_capital_words, df_movies['summary'].astype('str'))
)

In [12]:
# Per-row result of the project helper count_punctuations for title and summary.
df_movies['title_punccount'] = list(map(count_punctuations, df_movies['title']))
df_movies['summary_punccount'] = list(
    map(count_punctuations, df_movies['summary'].astype('str'))
)

In [13]:
# Per-row result of the project helper count_sent for title and summary.
df_movies['title_sentcount'] = list(map(count_sent, df_movies['title']))
df_movies['summary_sentcount'] = list(
    map(count_sent, df_movies['summary'].astype('str'))
)

In [14]:
# Per-row result of the project helper count_unique_words for title and summary.
df_movies['title_uniq'] = list(map(count_unique_words, df_movies['title']))
df_movies['summary_uniq'] = list(
    map(count_unique_words, df_movies['summary'].astype('str'))
)

In [15]:
# Ratio of the unique-word count to the total word count.
df_movies['title_uniq_vs_words'] = df_movies['title_uniq'].div(df_movies['title_wordlen'])
df_movies['summary_uniq_vs_words'] = df_movies['summary_uniq'].div(df_movies['summary_wordlen'])

In [16]:
# Per-row result of the project helper count_stopwords for title and summary.
df_movies['title_stopcount'] = list(map(count_stopwords, df_movies['title']))
df_movies['summary_stopcount'] = list(
    map(count_stopwords, df_movies['summary'].astype('str'))
)

In [17]:
# Ratio of the stopword count to the total word count.
df_movies['title_stopwords_vs_words'] = df_movies['title_stopcount'].div(df_movies['title_wordlen'])
df_movies['summary_stopwords_vs_words'] = df_movies['summary_stopcount'].div(df_movies['summary_wordlen'])

In [18]:
# Run the project helper remove_stopwords over each title and summary.
df_movies['title_nostopwords'] = list(map(remove_stopwords, df_movies['title']))
df_movies['summary_nostopwords'] = list(
    map(remove_stopwords, df_movies['summary'].astype('str'))
)

In [19]:
# Apply the project helper `lemmatizer` to the stopword-free title and summary columns.
df_movies['title_nostopwords_lemm'] = list(map(lemmatizer, df_movies['title_nostopwords']))
df_movies['summary_nostopwords_lemm'] = list(map(lemmatizer, df_movies['summary_nostopwords']))

In [20]:
# Join each row's lemmatised pieces back into one space-separated string.
df_movies['title_clean'] = list(map(' '.join, df_movies['title_nostopwords_lemm']))
df_movies['summary_clean'] = list(map(' '.join, df_movies['summary_nostopwords_lemm']))

In [21]:
# Run the project helper remove_numwords for title and summary; the frame it returns
# replaces df_movies. Later cells read 'title_clean_num' / 'summary_clean_num'.
# NOTE(review): presumably it reads the token and clean-text columns named in the 2nd/3rd
# arguments and writes the column named in the 4th — confirm in language_processing.
df_movies = remove_numwords(df_movies,'title_nostopwords_lemm','title_clean','title_clean_num')
df_movies = remove_numwords(df_movies,'summary_nostopwords_lemm','summary_clean','summary_clean_num')

In [22]:
# The token-level intermediate columns are no longer needed once the clean text exists.
df_movies = df_movies.drop(
    columns=[
        'title_nostopwords',
        'summary_nostopwords',
        'title_nostopwords_lemm',
        'summary_nostopwords_lemm',
    ]
)

## BERTopic Modelling on Title and Summary

In [23]:
# Build one topic model (plus its document embeddings) per text column via the project
# helper bertmodel_prep, which returns a (model, embeddings) pair for the given column.
# NOTE(review): presumably the returned model is a BERTopic instance — confirm in language_processing.
topic_model_title, embeddings_title = bertmodel_prep(df_movies, 'title_clean_num')
topic_model_summary, embeddings_summary = bertmodel_prep(df_movies, 'summary_clean_num')

In [24]:
# Store the per-row output of the project helper get_topic_preds as new topic columns,
# passing the model and embeddings produced for the same text column.
# NOTE(review): the topic tables and hierarchy trees in this cell's output are presumably
# printed by the helper itself — confirm in language_processing.
df_movies['title_topic'] = get_topic_preds(df_movies, 'title_clean_num',topic_model_title, embeddings_title)
df_movies['summary_topic'] = get_topic_preds(df_movies, 'summary_clean_num',topic_model_summary, embeddings_summary)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


    Topic  Count                              Name
0      -1  10954            -1_life_man_story_last
1       0    724             0_la_finding_age_last
2       1    326        1_miss_eye_finding_forever
3       2    252                2_john_bob_mr_name
4       3    239           3_girl_father_son_child
5       4    217         4_desert_late_sweet_white
6       5    203         5_im_everything_man_thing
7       6    199       6_water_tale_without_yellow
8       7    185         7_mystery_city_called_god
9       8    165           8_sky_light_behind_high
10      9    164                9_12_10_year_piece
11     10    154          10_movie_disaster_iii_ii
12     11    153            11_day_last_part_earth
13     12    131       12_dog_people_american_must
14     13    124         13_red_black_white_yellow
15     14    122            14_dead_death_die_must
16     15    116         15_night_good_school_come
17     16    116              16_war_art_world_men
18     17    113           17_h

100%|██████████| 24/24 [00:00<00:00, 388.10it/s]

.
├─night_day_dog_war_eye
│    ├─night_day_dog_war_dead
│    │    ├─night_day_last_good_come
│    │    │    ├─■──night_good_school_come_live ── Topic: 15
│    │    │    └─■──day_last_part_earth_night ── Topic: 11
│    │    └─dog_war_dead_red_sky
│    │         ├─dead_sky_light_red_die
│    │         │    ├─dead_die_red_death_water
│    │         │    │    ├─dead_death_die_12_10
│    │         │    │    │    ├─■──dead_death_die_must_live ── Topic: 14
│    │         │    │    │    └─■──12_10_year_piece_die ── Topic: 9
│    │         │    │    └─water_red_black_yellow_white
│    │         │    │         ├─■──water_tale_without_yellow_11 ── Topic: 6
│    │         │    │         └─red_black_white_yellow_desert
│    │         │    │              ├─■──desert_late_sweet_white_people ── Topic: 4
│    │         │    │              └─■──red_black_white_yellow_light ── Topic: 13
│    │         │    └─sky_light_snow_age_let
│    │         │         ├─■──sky_light_behind_high_let ── Topic: 8
│    │




    Topic  Count                                  Name
0      -1   8757               -1_life_new_story_world
1       0   2822             0_life_love_family_mother
2       1   1541              1_world_earth_must_human
3       2    479            2_music_band_rock_musician
4       3    478         3_detective_police_murder_cop
5       4    401              4_nazi_war_german_jewish
6       5    211              5_team_player_sport_game
7       6    163            6_art_artist_portrait_work
8       7    159           7_agent_secret_mission_must
9       8    119         8_comedy_show_comic_hilarious
10      9    107  9_political_state_president_american
11     10     88          10_soldier_war_army_military


100%|██████████| 10/10 [00:00<00:00, 370.45it/s]

.
├─music_band_rock_artist_documentary
│    ├─music_band_artist_rock_art
│    │    ├─■──art_artist_portrait_work_life ── Topic: 6
│    │    └─■──music_band_rock_musician_musical ── Topic: 2
│    └─team_comedy_player_sport_school
│         ├─team_player_sport_school_game
│         │    ├─■──political_state_president_american_america ── Topic: 9
│         │    └─■──team_player_sport_game_school ── Topic: 5
│         └─■──comedy_show_comic_hilarious_funny ── Topic: 8
└─life_family_new_love_find
     ├─life_family_love_new_find
     │    ├─■──soldier_war_army_military_home ── Topic: 10
     │    └─life_family_love_new_find
     │         ├─■──nazi_war_german_jewish_russian ── Topic: 4
     │         └─life_family_love_new_find
     │              ├─■──life_love_family_mother_new ── Topic: 0
     │              └─■──world_earth_must_human_find ── Topic: 1
     └─detective_agent_murder_police_cop
          ├─■──agent_secret_mission_must_government ── Topic: 7
          └─■──detective_police_




In [25]:
# Display the title model's topic visualisation, limited to 20 topics (top_n_topics=20).
topic_model_title.visualize_topics(top_n_topics=20)

In [26]:
# Display the summary model's topic visualisation, limited to 20 topics (top_n_topics=20).
topic_model_summary.visualize_topics(top_n_topics=20)

In [27]:
# Persist the fitted title model twice: through the model's own save() into the working
# directory, and as a plain pickle under the shared models folder.
topic_model_title.save("topic_model_title")
with open('../../models/topic_model_title.pkl', mode='wb') as model_file:
    pickle.dump(topic_model_title, model_file)


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [28]:
# Persist the fitted summary model twice: through the model's own save() into the working
# directory, and as a plain pickle under the shared models folder.
topic_model_summary.save("topic_model_summary")
with open('../../models/topic_model_summary.pkl', mode='wb') as model_file:
    pickle.dump(topic_model_summary, model_file)

In [29]:
# Collect the top terms of every topic in the title model, skipping the outlier topic (-1).
# Iterating over the actual topic ids, rather than range(<last row index of the info table>),
# stays correct when the model has no outlier row or its topic ids are not contiguous.
topic_terms = [
    topic_model_title.get_topic(topic_id)
    for topic_id in topic_model_title.get_topic_info()['Topic'].tolist()
    if topic_id != -1
]

In [30]:
# One length-10 float vector per topic id 0..16, each filled with that id.
# NOTE(review): 17 and 10 are hard-coded; they presumably mirror the number of topics and
# the terms kept per topic — confirm against the fitted model.
list_topics = [np.full(10, float(topic_id)) for topic_id in range(17)]
# Flatten into one Series: each topic id repeated 10 times, in ascending order.
topics_names = pd.DataFrame(list_topics).T.melt()['value']

In [31]:
# Visualize top topic keywords of the title model as bar charts, limited to 20 topics
topic_model_title.visualize_barchart(top_n_topics=20)

In [32]:
# Visualize term rank decrease for the title model (library defaults, no arguments passed)
topic_model_title.visualize_term_rank()

In [33]:
# Visualize similarity between the title model's topics using a heatmap
topic_model_title.visualize_heatmap()

In [34]:
# Visualize the topic probability distribution of the first entry in the model's
# probabilities_ (index 0), hiding topics below min_probability=0.001.
# NOTE(review): presumably requires the model to have been fitted with per-topic
# probabilities enabled — confirm in bertmodel_prep.
topic_model_title.visualize_distribution(topic_model_title.probabilities_[0], min_probability=0.001) 

In [39]:
# Export of the processed movies frame, currently disabled: as written, this notebook
# does not write df_movies to disk. Uncomment to produce the processed CSV.
#df_movies.to_csv('../../data/processed/df_movies_processed.csv')

In [36]:
# Sanity check: assign a topic to a single unseen title with the fitted title model.
# `sentence_transformers` is not imported in this notebook directly; it is in scope only
# through `from language_processing import *`.
sentence_model = sentence_transformers.SentenceTransformer("all-MiniLM-L6-v2")

# Use dedicated names so the corpus-level `embeddings_title` from the modelling cell is not
# overwritten; re-running the topic-prediction cell afterwards would otherwise receive a
# single-row embedding.
sample_titles = ["Star Wars The Rise of SkyWalker"]
sample_title_embeddings = sentence_model.encode(sample_titles)

# NOTE(review): the model was built from the cleaned 'title_clean_num' text while this title
# is passed raw — confirm whether the same cleaning should be applied before transform().
topics, probs = topic_model_title.transform(sample_titles, sample_title_embeddings)
print(topics)

[-1]
