# Metacritic Score Datasets (Feb 2023) - Pre-processing

## Load Packages and Functions

In [30]:
import numpy as np
import pandas as pd
from word2number import w2n

In [32]:
import sys
# NOTE(review): machine-specific absolute path — the star import below only resolves
# where this directory exists (unless language_processing is importable by other means).
sys.path.append('/Users/shantellesmith/Github/movie_rating_prediction/src')
# Star import: presumably the source of the helpers called unqualified further down
# (count_capital_words, remove_stopwords, lemmatizer, remove_numwords, bertmodel_prep, ...);
# none of them are defined in this notebook — confirm against language_processing.
from language_processing import *

In [7]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

## Load Data and Check Content/Size

In [8]:
# Read the raw movies, games and TV tables (same files, same order as before).
df_movies, df_games, df_tv = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/movies.csv',
        '../../data/raw/games.csv',
        '../../data/raw/tv.csv',
    )
)
# df_music = pd.read_csv('/home/jupyter-shantelle/music.csv')

In [9]:
# Discard the 'id' column; df_movies is modified in place.
df_movies.drop(columns=['id'], inplace=True)

In [10]:
# Turn the placeholder string 'tbd' into NaN, then cast the whole column to float.
df_movies['user_score'] = (
    df_movies['user_score']
    .replace({'tbd': np.nan})
    .astype(float)
)

In [11]:
# Map misspelled / alternative / legacy rating labels onto a single canonical label each.
df_movies['rating'] = df_movies['rating'].replace({
    'PG--13': 'PG-13',
    'PG-13`': 'PG-13',
    'NR': 'Not Rated',
    'Unrated': 'Not Rated',
    'X': 'NC-17',  # 'X' is an outdated rating replaced with 'NC-17'
})
# Show the distinct labels that remain after normalisation.
df_movies['rating'].unique()

array(['TV-14', 'PG-13', 'G', 'TV-PG', 'PG', 'R', 'Passed', 'Not Rated',
       'TV-G', 'TV-MA', 'Approved', nan, 'GP', 'M', 'M/PG', 'NC-17',
       'Open', 'MA-17', 'TV-Y7', 'TV-Y7-FV'], dtype=object)

In [12]:
# Blank out the two non-date placeholders, then parse what is left as datetimes.
df_movies['release_date'] = pd.to_datetime(
    df_movies['release_date'].replace(['TBA','TBD 2022 or 2023'], np.nan)
)

In [13]:
# Calendar features derived from the parsed release_date column.
df_movies['release_year'] = df_movies['release_date'].dt.year
df_movies['release_month'] = df_movies['release_date'].dt.month
df_movies['release_quarter'] = df_movies['release_date'].dt.quarter
# `Series.dt.weekofyear` is deprecated (removed in pandas 2.0); `dt.isocalendar().week`
# is its replacement. That accessor yields a nullable UInt32 column, so cast to float:
# missing dates then stay NaN, like the other release_* columns, and the later
# fillna(-1) can still write its -1 sentinel.
df_movies['release_yearweek'] = (
    df_movies['release_date'].dt.isocalendar().week.astype('float')
)

  after removing the cwd from sys.path.


In [14]:
# Replace every remaining missing value, in every column, with the sentinel -1 (in place).
# This is not restricted to numeric columns: e.g. a missing summary becomes -1 and is
# turned into the string '-1' by the `.astype('str')` calls in the text-feature cells below.
df_movies.fillna(-1,inplace=True)

In [15]:
# Character counts of the title and of the stringified summary.
df_movies['title_len'] = list(map(len, df_movies['title']))
df_movies['summary_len'] = list(map(len, df_movies['summary'].astype('str')))

# Number of whitespace-separated tokens.
df_movies['title_wordlen'] = [len(tokens) for tokens in map(str.split, df_movies['title'])]
df_movies['summary_wordlen'] = [len(tokens) for tokens in map(str.split, df_movies['summary'].astype('str'))]

# Characters per token (the character count includes whitespace).
df_movies['title_avg_wordlength'] = df_movies['title_len'].div(df_movies['title_wordlen'])
df_movies['summary_avg_wordlength'] = df_movies['summary_len'].div(df_movies['summary_wordlen'])

In [16]:
# Row-wise result of count_capital_words for the title and the stringified summary.
df_movies['title_capcount'] = list(map(count_capital_words, df_movies['title']))
df_movies['summary_capcount'] = list(map(count_capital_words, df_movies['summary'].astype('str')))

In [17]:
# Row-wise result of count_punctuations for the title and the stringified summary.
df_movies['title_punccount'] = list(map(count_punctuations, df_movies['title']))
df_movies['summary_punccount'] = list(map(count_punctuations, df_movies['summary'].astype('str')))

In [18]:
# Row-wise result of count_sent for the title and the stringified summary.
df_movies['title_sentcount'] = list(map(count_sent, df_movies['title']))
df_movies['summary_sentcount'] = list(map(count_sent, df_movies['summary'].astype('str')))

In [19]:
# Row-wise result of count_unique_words for the title and the stringified summary.
df_movies['title_uniq'] = list(map(count_unique_words, df_movies['title']))
df_movies['summary_uniq'] = list(map(count_unique_words, df_movies['summary'].astype('str')))

In [20]:
# Ratio of the unique-word count to the whitespace token count.
df_movies['title_uniq_vs_words'] = df_movies['title_uniq'].div(df_movies['title_wordlen'])
df_movies['summary_uniq_vs_words'] = df_movies['summary_uniq'].div(df_movies['summary_wordlen'])

In [21]:
# Row-wise result of count_stopwords for the title and the stringified summary.
df_movies['title_stopcount'] = list(map(count_stopwords, df_movies['title']))
df_movies['summary_stopcount'] = list(map(count_stopwords, df_movies['summary'].astype('str')))

In [22]:
# Ratio of the stop-word count to the whitespace token count.
df_movies['title_stopwords_vs_words'] = df_movies['title_stopcount'].div(df_movies['title_wordlen'])
df_movies['summary_stopwords_vs_words'] = df_movies['summary_stopcount'].div(df_movies['summary_wordlen'])

In [23]:
# Store the row-wise output of remove_stopwords for the title and the stringified summary.
df_movies['title_nostopwords'] = list(map(remove_stopwords, df_movies['title']))
df_movies['summary_nostopwords'] = list(map(remove_stopwords, df_movies['summary'].astype('str')))

In [24]:
# Apply lemmatizer row by row to the stop-word-free columns (title first, then summary).
df_movies['title_nostopwords_lemm'] = list(map(lemmatizer, df_movies['title_nostopwords']))
df_movies['summary_nostopwords_lemm'] = list(map(lemmatizer, df_movies['summary_nostopwords']))

In [25]:
# Join each row's lemmatized items back into one space-separated string.
df_movies['title_clean'] = list(map(' '.join, df_movies['title_nostopwords_lemm']))
df_movies['summary_clean'] = list(map(' '.join, df_movies['summary_nostopwords_lemm']))

In [33]:
# Run remove_numwords once per text field, feeding each result into the next call.
# NOTE(review): the recorded run of this cell ended in "NameError: name 'w2n' is not
# defined". Importing w2n in the notebook does not make it visible inside another
# module, so presumably language_processing needs its own `from word2number import w2n`
# — verify there.
for lemm_col, clean_col, clean_num_col in (
    ('title_nostopwords_lemm', 'title_clean', 'title_clean_num'),
    ('summary_nostopwords_lemm', 'summary_clean', 'summary_clean_num'),
):
    df_movies = remove_numwords(df_movies, lemm_col, clean_col, clean_num_col)

NameError: name 'w2n' is not defined

In [None]:
# Remove the intermediate stop-word-free / lemmatized columns; df_movies is modified in place.
df_movies.drop(
    columns=[
        'title_nostopwords',
        'summary_nostopwords',
        'title_nostopwords_lemm',
        'summary_nostopwords_lemm',
    ],
    inplace=True,
)

## BERTopic Modelling on Title and Summary

In [None]:
# Call bertmodel_prep once per cleaned text column (title, then summary); each call
# returns a pair that is kept as that column's topic model and embeddings.
(topic_model_title, embeddings_title) = bertmodel_prep(
    df_movies,
    'title_clean_num',
)
(topic_model_summary, embeddings_summary) = bertmodel_prep(
    df_movies,
    'summary_clean_num',
)

In [None]:
# Store the output of get_topic_preds for each text column, using that column's own
# topic model and embeddings (title first, then summary).
df_movies['title_topic'] = get_topic_preds(
    df_movies,
    'title_clean_num',
    topic_model_title,
    embeddings_title,
)
df_movies['summary_topic'] = get_topic_preds(
    df_movies,
    'summary_clean_num',
    topic_model_summary,
    embeddings_summary,
)

    Topic  Count                               Name
0      -1  10091              -1_love_man_girl_life
1       0   2372                0_miss_mr_love_life
2       1    339              1_night_day_last_good
3       2    296              2_dead_death_die_live
4       3    261     3_everything_thing_nothing_man
5       4    210        4_water_tale_without_beyond
6       5    208            5_sky_light_white_black
7       6    195                    6_12_10_13_year
8       7    173             7_sweet_man_part_whats
9       8    157               8_son_child_kid_like
10      9    140  9_space_beautiful_people_american
11     10    136     10_movie_fantastic_big_captain
12     11    126            11_song_school_live_art
13     12    126           12_road_street_long_hard
14     13    111                13_fat_big_hate_gay
15     14     92        14_crime_hate_future_second
16     15     91              15_men_bad_little_boy
17     16     81           16_age_never_future_dead
18     17   

100%|██████████| 18/18 [00:00<00:00, 172.34it/s]

.
├─miss_mr_song_man_nothing
│    ├─song_school_gay_fat_late
│    │    ├─■──song_school_live_art_gay ── Topic: 11
│    │    └─■──fat_big_hate_gay_plan ── Topic: 13
│    └─miss_mr_man_god_nothing
│         ├─miss_mr_man_nothing_beautiful
│         │    ├─■──everything_thing_nothing_man_im ── Topic: 3
│         │    └─miss_mr_life_love_beautiful
│         │         ├─■──miss_mr_love_life_city ── Topic: 0
│         │         └─■──space_beautiful_people_american_day ── Topic: 9
│         └─■──god_last_ii_captain_nation ── Topic: 17
└─night_movie_day_road_men
     ├─movie_road_men_dead_death
     │    ├─movie_road_men_dead_death
     │    │    ├─dead_death_die_live_dont
     │    │    │    ├─■──another_im_iii_angel_death ── Topic: 18
     │    │    │    └─■──dead_death_die_live_dont ── Topic: 2
     │    │    └─movie_road_men_son_bad
     │    │         ├─men_son_bad_child_kid
     │    │         │    ├─■──men_bad_little_boy_old ── Topic: 15
     │    │         │    └─■──son_child_kid_like_




    Topic  Count                                   Name
0      -1  10303                -1_life_new_story_world
1       0   1842              0_life_mother_family_love
2       1    627          1_detective_police_cop_murder
3       2    509             2_music_band_rock_musician
4       3    412               3_nazi_war_german_jewish
5       4    299             4_earth_alien_planet_space
6       5    289            5_china_japanese_master_art
7       6    212               6_team_player_sport_game
8       7    178             7_art_artist_portrait_work
9       8    173      8_agent_secret_mission_government
10      9    114            9_soldier_war_army_military
11     10    101              10_novel_love_based_story
12     11     98        11_animal_disney_adventure_bear
13     12     92              12_comedy_film_funny_show
14     13     76  13_political_president_american_state


100%|██████████| 13/13 [00:00<00:00, 167.93it/s]

.
├─music_band_art_rock_artist
│    ├─■──comedy_film_funny_show_hilarious ── Topic: 12
│    └─music_band_art_rock_artist
│         ├─team_player_sport_game_school
│         │    ├─■──political_president_american_state_america ── Topic: 13
│         │    └─■──team_player_sport_game_school ── Topic: 6
│         └─music_band_art_artist_rock
│              ├─■──music_band_rock_musician_musical ── Topic: 2
│              └─■──art_artist_portrait_work_photographer ── Topic: 7
└─life_family_new_find_love
     ├─detective_agent_police_murder_cop
     │    ├─■──agent_secret_mission_government_must ── Topic: 8
     │    └─■──detective_police_cop_murder_crime ── Topic: 1
     └─life_family_new_love_mother
          ├─life_family_love_mother_new
          │    ├─life_family_love_mother_new
          │    │    ├─life_family_mother_love_new
          │    │    │    ├─■──novel_love_based_story_woman ── Topic: 10
          │    │    │    └─■──life_mother_family_love_new ── Topic: 0
          │    │   




In [None]:
# Display the topic visualisation of the title model, restricted to 20 topics (top_n_topics=20).
topic_model_title.visualize_topics(top_n_topics=20)

In [None]:
# Display the topic visualisation of the summary model, restricted to 20 topics (top_n_topics=20).
topic_model_summary.visualize_topics(top_n_topics=20)

In [None]:
# Save the title topic model under the relative path "my_topic_model_title".
topic_model_title.save("my_topic_model_title")


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [None]:
# Save the summary topic model under the relative path "my_topic_model_summary".
topic_model_summary.save("my_topic_model_summary")

In [None]:
# `topic_model` is never assigned in this notebook (only topic_model_title and
# topic_model_summary are), so on a fresh kernel this cell and the ones after it raise
# NameError. Bind it explicitly here.
# NOTE(review): choosing the summary model is an assumption — use topic_model_title
# instead to inspect the title topics.
topic_model = topic_model_summary

# Collect the term list of every topic, in ascending topic id. The ids come from the
# 'Topic' column of get_topic_info(), skipping the -1 outlier bucket; the previous
# range(0, index[-1]) relied on that outlier row being present and dropped the last
# topic whenever it was not.
topic_ids = sorted(
    topic_id
    for topic_id in topic_model.get_topic_info()['Topic']
    if topic_id != -1
)
topic_terms = [topic_model.get_topic(topic_id) for topic_id in topic_ids]

In [None]:
# One length-10 float array per topic id 0..16, each filled with its own id.
list_topics = [np.full(10, float(topic_id)) for topic_id in range(17)]
# Flatten to one long Series in which every id is repeated 10 times in a row.
topics_names = pd.DataFrame(list_topics).transpose().melt()['value']

In [None]:
# Bar charts of the top keywords per topic for `topic_model`, limited to 20 topics (top_n_topics=20).
topic_model.visualize_barchart(top_n_topics=20)

In [None]:
# Plot how term scores decrease as term rank increases, per topic of `topic_model`.
topic_model.visualize_term_rank()

In [None]:
# Heatmap of the similarity between the topics of `topic_model`.
topic_model.visualize_heatmap()

In [None]:
# Topic probability distribution for the first row of `probabilities_` (presumably the
# first document — confirm); entries below min_probability=0.001 are left out.
topic_model.visualize_distribution(topic_model.probabilities_[0], min_probability=0.001) 

In [None]:
# df_movies.to_csv('df_movies_processed.csv')