In [None]:
import csv
import re

import requests
from bs4 import BeautifulSoup

# Prepare CSV file
topRatedCSV = "topRatedMovies.csv"
sep = ";"
CSV_headers = sep.join(["tmdb_id", "title", "year", "runtime", "rating", "cast", "genres"]) + "\n"
# Explicit UTF-8: titles and cast names can contain non-ASCII characters, and
# the platform default encoding may reject them or produce a file that a
# UTF-8 reader cannot decode.
f = open(topRatedCSV, "w+", encoding="utf-8")
f.write(CSV_headers)

# Get the top rated movies from the top rated list
def scrapeTopList(headers, url, n_pages, timeout=30):
    """Collect the movie ids linked from the first ``n_pages`` pages of a list.

    Args:
        headers: HTTP headers sent with every request.
        url: Base URL of the list; ``?page=<n>`` is appended for each page.
        n_pages: Number of pages to fetch, starting at page 1.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        List of id strings, one per ``a.image`` link found inside
        ``div.media_items``, in page order. Each id is the third
        ``/``-separated part of the link's ``href``.

    Raises:
        requests.HTTPError: If a page answers with an error status.
        RuntimeError: If a page has no ``div.media_items`` container.
    """
    topMoviesList = []

    for page in range(1, n_pages+1):
        page_url = url+'?page='+str(page)
        res_topRated = requests.get(page_url, headers=headers, timeout=timeout)

        # Log before raising so the failing URL and status are visible.
        print(page_url, res_topRated.status_code)
        res_topRated.raise_for_status()

        topRated_html = BeautifulSoup(res_topRated.content, "html.parser")
        container = topRated_html.find("div", {"class": "media_items"})
        if container is None:
            raise RuntimeError('No "media_items" container found on ' + page_url)

        # Read the href straight from each link instead of looking the link
        # up again by index in the result list.
        for link in container.find_all('a', {"class": "image"}):
            movie_id = link.get('href').split('/')[2]
            topMoviesList.append(movie_id)

    return topMoviesList

# Convert runtime to runtime in minutes
def runtime_convertion(movieRuntime):
    """Convert a runtime string to a whole number of minutes.

    Accepts an hours part and/or a minutes part, each a number followed by a
    unit starting with ``h`` or ``m`` (``"2h 22m"``, ``"3h"``, ``"45m"``,
    ``"1 hour 30 minutes"``), in any letter case and with optional spaces.
    A string with neither unit is read as a plain number of minutes.

    Args:
        movieRuntime: Runtime text to parse.

    Returns:
        The runtime in minutes as an ``int``.

    Raises:
        ValueError: If no hours/minutes part is found and the text is not an
            integer.
    """
    text = movieRuntime.strip().lower()

    # Pull the number that directly precedes each unit. Splitting on the
    # letters breaks on spelled-out units because "hour" leaves non-numeric
    # text in front of the minutes.
    hours_match = re.search(r'(\d+)\s*h', text)
    minutes_match = re.search(r'(\d+)\s*m', text)

    if hours_match is None and minutes_match is None:
        # No unit present: the text must be a bare number of minutes.
        return int(text)

    hours = int(hours_match.group(1)) if hours_match else 0
    minutes = int(minutes_match.group(1)) if minutes_match else 0
    return hours * 60 + minutes

# Scrape movie details from movie page
def scrapeMovieDetails(headers, movie_id, timeout=30):
    """Fetch one movie page and append its details as a row to the CSV file.

    Uses the module-level ``f`` (open CSV file handle) and ``sep`` (field
    delimiter). The row holds, in order: movie id, title, year, runtime in
    minutes, rating, cast and genres. Cast and genres are each joined with
    ``|``; spaces inside a genre name are replaced by ``_``.

    Args:
        headers: HTTP headers sent with the request.
        movie_id: Id string appended to the movie base URL.
        timeout: Seconds to wait for the response before giving up.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        AttributeError: If an expected element is missing from the page.
        ValueError: If the runtime text cannot be converted to minutes.
    """
    url = 'https://www.themoviedb.org/movie/' + movie_id
    res_moviePage = requests.get(url, headers=headers, timeout=timeout)

    # Log before raising so the failing URL and status are visible.
    print(url, res_moviePage.status_code)
    res_moviePage.raise_for_status()

    moviePage_html = BeautifulSoup(res_moviePage.content, "html.parser")

    movieName = moviePage_html.find('div', {"class": "title"}).find('a').get_text() # type: ignore
    # Takes the four characters before the last one; presumably the text ends
    # with "(YYYY)" -- TODO confirm against the page markup.
    movieYear = moviePage_html.select_one('span.release_date').get_text(strip=True)[-5:-1]
    runtimeText = moviePage_html.find('span', {"class": "runtime"}).get_text()
    movieRuntime = runtime_convertion(runtimeText.strip().replace('\n', '').replace(' ', ''))
    # Keep only the part of the percentage before the decimal point.
    movieRating = moviePage_html.find('div', {"class": "user_score_chart"}).get('data-percent').split('.')[0]

    movieGenresTags = moviePage_html.find_all('span', {"class": "genres"})[0].find_all('a')
    movieGenresString = '|'.join(genre.get_text().replace(' ', '_') for genre in movieGenresTags)

    # Cast names come from the alt text of the profile images; images without
    # alt text are skipped.
    movieCastTags = moviePage_html.find_all('img', {"class": "profile"})
    castNames = [castTag.get('alt') for castTag in movieCastTags]
    movieCastString = '|'.join(name for name in castNames if name)

    # csv.writer puts no space after the delimiter (matching the header line)
    # and quotes/escapes any field containing the delimiter, a quote or a
    # newline, so every row parses back into exactly seven fields.
    row_writer = csv.writer(f, delimiter=sep, lineterminator='\n')
    row_writer.writerow([movie_id, movieName, movieYear, movieRuntime, movieRating, movieCastString, movieGenresString])

In [None]:
# Request settings for the top rated list, then collect the movie ids
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 13.3; rv:109.0) Gecko/20100101 Firefox/113.0',
}
url = 'https://www.themoviedb.org/movie/top-rated'
n_pages = 5  # each list page holds 20 movies
topList = scrapeTopList(headers=headers, url=url, n_pages=n_pages)

In [None]:
# Scrape details from movie page and write to CSV file
try:
    for movie_id in topList:
        scrapeMovieDetails(headers, movie_id)
finally:
    # Close CSV file even when a page fails, so the rows written so far are
    # flushed to disk and the file handle is not leaked.
    f.close()

In [None]:
import pandas as pd

def _split_pipe_list(raw):
    """Split a ``|``-separated string into a list of trimmed, non-empty items."""
    trimmed = (item.strip() for item in raw.split('|'))
    # Drop empties so a blank cell becomes [] instead of [''].
    return [item for item in trimmed if item]

def convert_genres(genres_str):
    """Turn a ``|``-separated genres cell into a list of genre names."""
    return _split_pipe_list(genres_str)

def convert_cast(genres_str):
    """Turn a ``|``-separated cast cell into a list of cast names.

    The parameter keeps its original name, ``genres_str``, so existing
    keyword callers keep working; it holds the cast cell text.
    """
    return _split_pipe_list(genres_str)

# skipinitialspace=True ignores blanks after each ';' so that a quoted field
# preceded by a space is still parsed as quoted; otherwise the quote
# characters stay in the values and end up inside the first and last
# genre/cast tokens.
df = (
    pd.read_csv(
        'topRatedMovies.csv',
        sep=';',
        skipinitialspace=True,
        converters={'genres': convert_genres, 'cast': convert_cast},
    )
    # Expose the parsed lists under the names used by later cells, then drop
    # the source columns; this leaves 'genre' and 'casts' as the last columns.
    .assign(genre=lambda frame: frame['genres'], casts=lambda frame: frame['cast'])
    .drop(columns=['genres', 'cast'])
)

df

In [None]:
# Lowercase function
def lowercase(tokens):
    """Return a new list with the lowercase form of every item in ``tokens``."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

# Lowercase every token in the genre and cast lists
for list_column in ('genre', 'casts'):
    df[list_column] = df[list_column].apply(lowercase)

df

In [None]:
# Count number of rows with drama in genre
# Printed explicitly: a notebook cell only displays its last expression, so
# as a bare expression this count was computed and never shown.
drama_rows = df['genre'].apply(lambda x: 'drama' in x).sum()
print('Rows with drama in genre:', drama_rows)

# Count number of unique genres in df['genre']
unique_genres = set()
for genres in df['genre']:
    unique_genres.update(genres)
len(unique_genres)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

# Join each movie's genre tokens into one space-separated string for the vectorizer
df['genre_str'] = df['genre'].apply(lambda tokens: ' '.join(tokens))
vectorizer = CountVectorizer()

# Fit and transform the data
X = vectorizer.fit_transform(df['genre_str'])

# Count the sum of each word
sum_words = X.sum(axis=0)

# Create a (word, frequency) list and sort it in descending order
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)

# Keep at most MAX_GENRES_SHOWN of the most common words
MAX_GENRES_SHOWN = 32
common_words = words_freq[:MAX_GENRES_SHOWN]

# Separate words and counts
words, counts = zip(*common_words)

# Plot word frequencies; the title reports how many bars are actually drawn
plt.figure(figsize=(10, 5))
plt.bar(words, counts)
plt.title(f'{len(words)} Most Common Genres in Top Rated Movies')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Gather every distinct cast member across all movies and display the set
unique_cast = set()
for casts in df['casts']:
    unique_cast |= set(casts)
unique_cast