# Metacritic Score Datasets (Feb 2023) - Pre-processing

## Load Packages and Functions

In [1]:
import numpy as np
import pandas as pd
from word2number import w2n
import pickle

In [2]:
import sys
from pathlib import Path

# Make the project's helper module importable without a machine-specific
# absolute path. The data and model files in this notebook are addressed as
# '../../data/...' and '../../models/...', so src/ is resolved the same way.
# NOTE(review): assumes src/ sits next to data/ and models/ at the repository root.
SRC_DIR = Path('../../src').resolve()
if str(SRC_DIR) not in sys.path:  # keep re-runs of this cell from stacking duplicates
    sys.path.append(str(SRC_DIR))

# The star import supplies the text helpers used below (count_*, remove_stopwords,
# lemmatizer, remove_numwords, bertmodel_prep, get_topic_preds, ...).
from language_processing import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shantellesmith/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shantellesmith/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

## Load Data and Check Content/Size

In [4]:
# Read the three raw CSV files in one pass; the order of the paths matches
# the order of the names on the left-hand side.
df_movies, df_games, df_tv = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/movies.csv',
        '../../data/raw/games.csv',
        '../../data/raw/tv.csv',
    )
)
# The music dataset is not loaded; the disabled call is kept for reference.
# df_music = pd.read_csv('/home/jupyter-shantelle/music.csv')

In [5]:
# Remove the 'id' column from the movies frame.
df_movies = df_movies.drop(columns='id')

In [6]:
# 'tbd' placeholders become NaN so the column can be cast to float.
df_movies['user_score'] = (
    df_movies['user_score']
    .replace({'tbd': np.nan})
    .astype(float)
)

In [7]:
# Map misspelled or synonymous rating labels onto one canonical spelling.
rating_aliases = {
    'PG--13': 'PG-13',
    'PG-13`': 'PG-13',
    'NR': 'Not Rated',
    'Unrated': 'Not Rated',
    'X': 'NC-17',  # 'X' is an outdated rating replaced with 'NC-17'
}
df_movies['rating'] = df_movies['rating'].replace(rating_aliases)
# Display the remaining distinct labels as the cell output.
df_movies['rating'].unique()

array(['TV-14', 'PG-13', 'G', 'TV-PG', 'PG', 'R', 'Passed', 'Not Rated',
       'TV-G', 'TV-MA', 'Approved', nan, 'GP', 'M', 'M/PG', 'NC-17',
       'Open', 'MA-17', 'TV-Y7', 'TV-Y7-FV'], dtype=object)

In [8]:
# Placeholder strings that do not describe a real date are blanked out
# before the column is parsed into datetimes.
unknown_release_dates = ['TBA', 'TBD 2022 or 2023']
df_movies['release_date'] = pd.to_datetime(
    df_movies['release_date'].replace(unknown_release_dates, np.nan)
)

In [9]:
# Derive calendar features from the parsed release date; each new column
# is read from the datetime accessor attribute named on the right.
release_parts = {
    'release_year': 'year',
    'release_month': 'month',
    'release_quarter': 'quarter',
    'release_yearday': 'dayofyear',
}
for part_column, dt_attribute in release_parts.items():
    df_movies[part_column] = getattr(df_movies['release_date'].dt, dt_attribute)

In [10]:
# Replace every missing value in the frame with the sentinel -1, in place.
# The fill is applied to all columns, not only numeric ones.
# NOTE(review): this also rewrites missing text and date values to -1, which is
# presumably why the cells below cast 'summary' with astype('str') — confirm
# that -1 is the intended placeholder for those columns.
df_movies.fillna(-1,inplace=True)

In [11]:
# Character and word counts for title and summary, plus the mean number of
# characters per whitespace-separated word. 'summary' is cast to str first.
title_text = df_movies['title']
summary_text = df_movies['summary'].astype('str')

df_movies['title_len'] = title_text.map(len)
df_movies['summary_len'] = summary_text.map(len)

df_movies['title_wordlen'] = title_text.map(lambda text: len(text.split()))
df_movies['summary_wordlen'] = summary_text.map(lambda text: len(text.split()))

df_movies['title_avg_wordlength'] = df_movies['title_len'].div(df_movies['title_wordlen'])
df_movies['summary_avg_wordlength'] = df_movies['summary_len'].div(df_movies['summary_wordlen'])

In [12]:
# Apply the project helper count_capital_words to every title and summary.
df_movies['title_capcount'] = df_movies['title'].map(count_capital_words)
df_movies['summary_capcount'] = df_movies['summary'].astype('str').map(count_capital_words)

In [13]:
# Apply the project helper count_punctuations to every title and summary.
df_movies['title_punccount'] = df_movies['title'].map(count_punctuations)
df_movies['summary_punccount'] = df_movies['summary'].astype('str').map(count_punctuations)

In [14]:
# Apply the project helper count_sent to every title and summary.
df_movies['title_sentcount'] = df_movies['title'].map(count_sent)
df_movies['summary_sentcount'] = df_movies['summary'].astype('str').map(count_sent)

In [15]:
# Apply the project helper count_unique_words to every title and summary.
df_movies['title_uniq'] = df_movies['title'].map(count_unique_words)
df_movies['summary_uniq'] = df_movies['summary'].astype('str').map(count_unique_words)

In [16]:
# Share of unique words relative to the total word count of each text.
df_movies['title_uniq_vs_words'] = df_movies['title_uniq'].div(df_movies['title_wordlen'])
df_movies['summary_uniq_vs_words'] = df_movies['summary_uniq'].div(df_movies['summary_wordlen'])

In [17]:
# Apply the project helper count_stopwords to every title and summary.
df_movies['title_stopcount'] = df_movies['title'].map(count_stopwords)
df_movies['summary_stopcount'] = df_movies['summary'].astype('str').map(count_stopwords)

In [18]:
# Share of stopwords relative to the total word count of each text.
df_movies['title_stopwords_vs_words'] = df_movies['title_stopcount'].div(df_movies['title_wordlen'])
df_movies['summary_stopwords_vs_words'] = df_movies['summary_stopcount'].div(df_movies['summary_wordlen'])

In [19]:
# Run the project helper remove_stopwords over every title and summary;
# the results feed the lemmatization step that follows.
df_movies['title_nostopwords'] = df_movies['title'].map(remove_stopwords)
df_movies['summary_nostopwords'] = df_movies['summary'].astype('str').map(remove_stopwords)

In [20]:
# Apply the imported lemmatizer helper to the stopword-free title and summary.
df_movies['title_nostopwords_lemm'] = df_movies['title_nostopwords'].map(lemmatizer)
df_movies['summary_nostopwords_lemm'] = df_movies['summary_nostopwords'].map(lemmatizer)

In [21]:
# Glue each lemmatized token sequence back into one space-separated string.
join_with_space = ' '.join
df_movies['title_clean'] = df_movies['title_nostopwords_lemm'].map(join_with_space)
df_movies['summary_clean'] = df_movies['summary_nostopwords_lemm'].map(join_with_space)

In [22]:
# Call the project helper remove_numwords once for the title columns and once
# for the summary columns; each call returns the frame that is rebound here.
# NOTE(review): the meaning of the three column arguments is defined in
# language_processing and is not visible in this notebook.
numword_column_sets = (
    ('title_nostopwords_lemm', 'title_clean', 'title_clean_num'),
    ('summary_nostopwords_lemm', 'summary_clean', 'summary_clean_num'),
)
for column_set in numword_column_sets:
    df_movies = remove_numwords(df_movies, *column_set)

In [23]:
# The stopword-free and lemmatized helper columns are no longer needed
# once the cleaned text columns exist.
intermediate_columns = [
    'title_nostopwords',
    'summary_nostopwords',
    'title_nostopwords_lemm',
    'summary_nostopwords_lemm',
]
df_movies = df_movies.drop(columns=intermediate_columns)

## BERTopic Modelling on Title and Summary

In [None]:
# Build one topic model per cleaned text column. bertmodel_prep comes from the
# project module language_processing and returns a (model, embeddings) pair.
# NOTE(review): presumably fits a BERTopic model on the named column — confirm
# in language_processing; the helper body is not visible here.
topic_model_title, embeddings_title = bertmodel_prep(df_movies, 'title_clean_num')
topic_model_summary, embeddings_summary = bertmodel_prep(df_movies, 'summary_clean_num')

In [None]:
# Store the result of get_topic_preds (project helper) as a new column for the
# title and for the summary, reusing the models and embeddings built above.
# NOTE(review): presumably one topic id per row; the tables and topic trees
# printed below are emitted by the helper — confirm in language_processing.
df_movies['title_topic'] = get_topic_preds(df_movies, 'title_clean_num',topic_model_title, embeddings_title)
df_movies['summary_topic'] = get_topic_preds(df_movies, 'summary_clean_num',topic_model_summary, embeddings_summary)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


    Topic  Count                          Name
0      -1  10119    -1_life_story_man_american
1       0   2341          0_girl_dead_love_boy
2       1    613        1_real_age_great_child
3       2    293         2_miss_love_back_hall
4       3    220            3_movie_iii_ii_big
5       4    192         4_david_bob_john_name
6       5    191  5_deep_without_yellow_little
7       6    164            6_12_year_13_whats
8       7    161     7_sweet_part_yellow_whats
9       8    140       8_red_color_black_white
10      9    138          9_day_last_next_part
11     10    136  10_people_american_white_bob
12     11    117     11_song_school_within_gay
13     12    113         12_father_fat_big_get
14     13    107     13_night_good_school_come
15     14    100      14_street_long_hard_back
16     15     72       15_war_new_world_within
17     16     56   16_mystery_mine_city_little
18     17     52   17_behind_light_dark_yellow


100%|██████████| 17/17 [00:00<00:00, 299.58it/s]

.
├─girl_love_dead_father_boy
│    ├─david_bob_john_name_hall
│    │    ├─■──miss_love_back_hall_mystery ── Topic: 2
│    │    └─■──david_bob_john_name_hall ── Topic: 4
│    └─girl_love_dead_father_boy
│         ├─girl_love_dead_father_boy
│         │    ├─■──people_american_white_bob_going ── Topic: 10
│         │    └─girl_love_dead_father_boy
│         │         ├─■──girl_dead_love_boy_father ── Topic: 0
│         │         └─■──father_fat_big_get_gay ── Topic: 12
│         └─■──real_age_great_child_go ── Topic: 1
└─night_movie_day_red_color
     ├─night_day_street_war_12
     │    ├─street_war_12_year_13
     │    │    ├─■──street_long_hard_back_lost ── Topic: 14
     │    │    └─war_12_year_13_going
     │    │         ├─■──war_new_world_within_last ── Topic: 15
     │    │         └─■──12_year_13_whats_short ── Topic: 6
     │    └─night_day_last_next_good
     │         ├─■──day_last_next_part_night ── Topic: 9
     │         └─■──night_good_school_come_live ── Topic: 13
     └─




    Topic  Count                            Name
0      -1   8806         -1_life_new_world_story
1       0   2508       0_life_love_family_mother
2       1   1798     1_police_murder_agent_crime
3       2    533      2_music_band_rock_musician
4       3    422        3_nazi_war_german_jewish
5       4    288     4_china_japanese_master_art
6       5    284      5_earth_alien_planet_space
7       6    235        6_team_sport_player_game
8       7    219   7_art_artist_portrait_fashion
9       8    121  8_disney_animal_adventure_bear
10      9    111        9_comedy_comic_film_show


100%|██████████| 9/9 [00:00<00:00, 289.38it/s]

.
├─life_new_family_find_young
│    ├─earth_alien_planet_space_world
│    │    ├─■──disney_animal_adventure_bear_named ── Topic: 8
│    │    └─■──earth_alien_planet_space_human ── Topic: 5
│    └─life_family_new_find_young
│         ├─life_family_new_find_young
│         │    ├─■──china_japanese_master_art_warrior ── Topic: 4
│         │    └─life_family_new_find_love
│         │         ├─life_new_family_find_love
│         │         │    ├─■──life_love_family_mother_new ── Topic: 0
│         │         │    └─■──police_murder_agent_crime_detective ── Topic: 1
│         │         └─■──nazi_war_german_jewish_story ── Topic: 3
│         └─■──team_sport_player_game_school ── Topic: 6
└─music_band_rock_artist_art
     ├─music_band_rock_artist_art
     │    ├─■──art_artist_portrait_fashion_work ── Topic: 7
     │    └─■──music_band_rock_musician_musical ── Topic: 2
     └─■──comedy_comic_film_show_funny ── Topic: 9






In [None]:
# Display the topic visualisation of the title model, limited to the top 20 topics.
topic_model_title.visualize_topics(top_n_topics=20)

In [None]:
# Display the topic visualisation of the summary model, limited to the top 20 topics.
topic_model_summary.visualize_topics(top_n_topics=20)

In [None]:
# Persist the title model twice: once through the model's own save() into the
# working directory, and once as a pickle in the shared models folder.
topic_model_title.save("topic_model_title")
with open('../../models/topic_model_title.pkl', 'wb') as title_model_file:
    pickle.dump(topic_model_title, title_model_file)


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [None]:
# Persist the summary model twice: once through the model's own save() into the
# working directory, and once as a pickle in the shared models folder.
topic_model_summary.save("topic_model_summary")
with open('../../models/topic_model_summary.pkl', 'wb') as summary_model_file:
    pickle.dump(topic_model_summary, summary_model_file)

In [None]:
# Collect the top terms of every real topic of the title model.
# The topic ids are read from the 'Topic' column of get_topic_info() rather
# than inferred from the last row label: the row-label shortcut only yields the
# right count while an outlier row (topic -1) is present, and silently drops
# the last topic otherwise. The outlier topic -1 itself is skipped.
title_topic_ids = [
    int(topic_id)
    for topic_id in topic_model_title.get_topic_info()['Topic']
    if topic_id != -1
]
topic_terms = [
    topic_model_title.get_topic(topic_id)
    for topic_id in title_topic_ids
]

In [None]:
# Build a flat Series of topic ids in which every id is repeated once per term
# (0.0 x10, 1.0 x10, ...). The number of topics is derived from `topic_terms`
# instead of a hard-coded 17, which did not match the 18 topics (0..17)
# reported for the title model.
# NOTE(review): presumably meant to label the flattened entries of
# `topic_terms`; 10 is kept from the original — confirm it equals the number
# of terms get_topic() returns per topic.
terms_per_topic = 10
list_topics = [np.ones(terms_per_topic) * i for i in range(len(topic_terms))]
topics_names = pd.DataFrame(list_topics).transpose().melt().value

In [None]:
# Visualize the top keywords per topic of the title model as bar charts,
# limited to the top 20 topics.
topic_model_title.visualize_barchart(top_n_topics=20)

In [None]:
# Visualize the term rank decrease for the topics of the title model.
topic_model_title.visualize_term_rank()

In [None]:
# Visualize the similarity between the title model's topics as a heatmap.
topic_model_title.visualize_heatmap()

In [None]:
# Visualize the topic probability distribution of the first entry in the title
# model's probabilities_, hiding topics with probability below 0.001.
# NOTE(review): assumes probabilities_ holds one row of per-topic probabilities
# per document — confirm how the model was configured in bertmodel_prep.
topic_model_title.visualize_distribution(topic_model_title.probabilities_[0], min_probability=0.001)

In [None]:
# df_movies.to_csv('df_movies_processed.csv')

In [None]:
# Sanity check: assign a single unseen title to a topic of the title model.
sample_title = "Star Wars The Rise of SkyWalker"

# `sentence_transformers` is not imported in this notebook directly; it is
# presumably made available by the star import from language_processing.
sentence_model = sentence_transformers.SentenceTransformer("all-MiniLM-L6-v2")

# Keep the one-row sample embedding under its own name. Writing it to
# `embeddings_title` would overwrite the corpus embeddings returned by
# bertmodel_prep and break any re-run of the topic prediction cell.
sample_title_embedding = sentence_model.encode([sample_title])

# NOTE(review): the model was built on the cleaned column 'title_clean_num',
# while this title is passed in raw — confirm that is intended.
topics, probs = topic_model_title.transform([sample_title], sample_title_embedding)
print(topics)

[[-8.07483420e-02 -4.26440649e-02  2.04296708e-02 -2.39919238e-02
  -1.51795121e-02  6.24583960e-02  3.99060324e-02  6.80941939e-02
   4.44751568e-02  6.48096129e-02  3.45850289e-02  4.62063253e-02
   5.33513725e-03  2.39819065e-02  3.81561443e-02  4.34729503e-03
   2.29914375e-02 -3.16627212e-02  5.51973768e-02 -1.30140828e-02
   1.40939001e-02  6.95031136e-02  3.08114197e-02 -7.28892721e-03
   4.69055511e-02  2.46228315e-02  2.91309636e-02 -2.32314542e-02
  -6.05720319e-02 -1.98536683e-02  1.48622971e-02  2.16123555e-02
  -5.76586649e-02  2.62227263e-02  5.74486854e-04 -5.47030829e-02
   5.86465076e-02  9.69386324e-02  6.90701529e-02 -5.21717519e-02
  -2.42208019e-02 -7.03255162e-02  2.08923109e-02  6.28895313e-02
   2.29389165e-02 -5.33979945e-02  4.44140136e-02 -2.57833358e-02
   4.46103476e-02  6.50198534e-02  5.64159118e-02 -7.77066546e-03
   7.07703503e-03 -3.45906205e-02  4.04439308e-02 -7.44353905e-02
  -9.34063643e-03 -8.60124454e-02  9.28823054e-02 -2.03804821e-02
  -3.46014