In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
# Load the movie metadata CSV (path is relative to the notebook's working directory).
df = pd.read_csv("data/clean_movie_metadata.csv")

In [3]:
# Preview the loaded frame (pandas abbreviates long frames when displaying them).
df

Unnamed: 0,id,title,overview,tagline,release_date,runtime,vote_average,vote_count,original_language,original_title,budget,revenue,genres,spoken_languages,popularity,production_companies,production_countries,imdb_id,belongs_to_collection
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",,1995-10-30,81.0,7.7,5415.0,en,Toy Story,30000000,373554033.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'iso_639_1': 'en', 'name': 'English'}]",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",tt0114709,"{'id': 10194, 'name': 'Toy Story Collection', ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,1995-12-15,104.0,6.9,2413.0,en,Jumanji,65000000,262797249.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",tt0113497,
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,1995-12-22,101.0,6.5,92.0,en,Grumpier Old Men,0,0.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'iso_639_1': 'en', 'name': 'English'}]",11.712900,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",tt0113228,"{'id': 119050, 'name': 'Grumpy Old Men Collect..."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,1995-12-22,127.0,6.1,34.0,en,Waiting to Exhale,16000000,81452156.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'iso_639_1': 'en', 'name': 'English'}]",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",tt0114885,
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,1995-02-10,106.0,5.7,173.0,en,Father of the Bride Part II,0,76578911.0,"[{'id': 35, 'name': 'Comedy'}]","[{'iso_639_1': 'en', 'name': 'English'}]",8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",tt0113041,"{'id': 96871, 'name': 'Father of the Bride Col..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44972,439050,Subdue,Rising and falling between a man and woman.,Rising and falling between a man and woman,,90.0,4.0,1.0,fa,رگ خواب,0,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...","[{'iso_639_1': 'fa', 'name': 'فارسی'}]",0.072051,[],"[{'iso_3166_1': 'IR', 'name': 'Iran'}]",tt6209470,
44973,111109,Century of Birthing,An artist struggles to finish his work while a...,,2011-11-17,360.0,9.0,3.0,tl,Siglo ng Pagluluwal,0,0.0,"[{'id': 18, 'name': 'Drama'}]","[{'iso_639_1': 'tl', 'name': ''}]",0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",tt2028550,
44974,67758,Betrayal,"When one of her hits goes wrong, a professiona...",A deadly game of wits.,2003-08-01,90.0,3.8,6.0,en,Betrayal,0,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...","[{'iso_639_1': 'en', 'name': 'English'}]",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",tt0303758,
44975,227506,Satan Triumphant,"In a small town live two brothers, one a minis...",,1917-10-21,87.0,0.0,0.0,en,Satana likuyushchiy,0,0.0,[],[],0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",tt0008536,


In [4]:
# Cast the overview column to pandas' dedicated string dtype, so missing
# overviews are represented as pd.NA instead of float NaN.
df["overview"] = df["overview"].astype("string")

In [5]:
# Confirm that the cast took effect.
df["overview"].dtype

string[python]

In [11]:
# Keep only the movies that actually have an overview. Missing values come
# out of read_csv as NA (not as the literal text "NaN"), so drop them with
# dropna() instead of comparing against a string sentinel; this works whether
# the column is still object dtype or has already been cast to "string".
overview_series = df["overview"].dropna()
# Number of overviews left to vectorize.
overview_series.count()

44057

In [7]:
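# In general (textbook form) the formula looks like this:

# TF - Term Frequency = (number of occurrences of the word in a text) / (number of words in that text) ----- how frequent the word is within one text
# IDF - Inverse Document Frequency = log( (number of texts) / (number of texts containing the word) ) ------ how rare the word is across all texts; the closer the value is to zero, the more texts contain the word
# weighted value = TF * IDF
# Note: scikit-learn's TfidfVectorizer uses a smoothed variant by default (raw counts for TF, IDF = ln((1 + n) / (1 + df)) + 1, then L2-normalises each text's vector), so its numbers differ slightly from the textbook formula.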

# In basic terms: the more texts a word occurs in, the lower its weight, so filler words get a low score and can be filtered out this way.
# Each text's vector also contains a zero for every vocabulary word that does not occur in that particular text.

In [8]:
# Build the TF-IDF model; stop_words="english" is an optional argument that
# drops common English filler words.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and produce one sparse TF-IDF row per overview.
# Row i belongs to overview_series.iloc[i].
vectors = vectorizer.fit_transform(overview_series.to_list())

# Take one movie's row as an example and list its terms, highest weight first.
first_vector = vectors[10]
dfTEST = pd.DataFrame(
    {"tfidf": first_vector.toarray().ravel()},
    index=vectorizer.get_feature_names_out(),
).sort_values(by=["tfidf"], ascending=False)

In [9]:
# Show the movie behind the example vector. The TF-IDF rows follow
# overview_series (movies without an overview were removed), so translate
# position 10 back to its original row label rather than assuming that
# position 10 in df is the same movie.
df.loc[overview_series.index[10]]

id                                                                    9087
title                                               The American President
overview                 Widowed U.S. president Andrew Shepherd, one of...
tagline                  Why can't the most powerful man in the world h...
release_date                                                    1995-11-17
runtime                                                              106.0
vote_average                                                           6.5
vote_count                                                           199.0
original_language                                                       en
original_title                                      The American President
budget                                                            62000000
revenue                                                        107879496.0
genres                   [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
spoken_languages         

In [10]:
# Terms of the example movie ranked by TF-IDF weight; the zeros at the bottom
# are vocabulary words that do not occur in this overview.
dfTEST

Unnamed: 0,tfidf
shepherd,0.396918
decimate,0.247734
lobbyist,0.243877
covets,0.234953
courting,0.232569
...,...
futuro,0.000000
futurists,0.000000
futuristic,0.000000
futurist,0.000000
