In [1]:
#================================================
# DATA
#================================================

import duckdb, pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

# creating the conection to the duckdb database file:
con = duckdb.connect("movielensfull33M.duckdb")

In [2]:
IMDB_DIR = Path("..") / "data" / "Imdb"
IMDB_DIR

WindowsPath('../data/Imdb')

## 1 Functions to help in the search for all time know Characters
### 1.1 Ad-hoc Query Characters and actors_name by movieID

In [3]:
def search_by_movie_id(movie_id):
    query = """
        WITH ml_ratings AS (
            SELECT
                movieId,
                AVG(rating) AS ml_avg_rating,
                COUNT(*) AS ml_ratings_count
            FROM ratings
            GROUP BY movieId
        ),
        imdb_ratings AS (
            SELECT
                movieId,
                averageRating / 2 AS imdb_avg_rating,
                numVotes AS imdb_ratings_count
            FROM movielens_ratings_imdb
        )
        SELECT
            a.movieId,
            a.characters,
            a.actor_name,
            a.title AS movie_title,
            ROUND(mr.ml_avg_rating, 2) AS ml_avg_rating,
            mr.ml_ratings_count,
            ir.imdb_avg_rating,
            ir.imdb_ratings_count
        FROM movielens_actors AS a
        LEFT JOIN ml_ratings AS mr
            ON mr.movieId = a.movieId
        LEFT JOIN imdb_ratings AS ir
            ON ir.movieId = a.movieId
        WHERE a.movieId = ?
        ORDER BY movie_title, actor_name;
    """
    
    return con.execute(query, [movie_id]).df()



### 1.2 Function to search movies by Characters

In [4]:
def search_by_character(character):
    query = """
        WITH ml_ratings AS (
            SELECT
                movieId,
                AVG(rating) AS ml_avg_rating,
                COUNT(*)    AS ml_ratings_count
            FROM ratings
            GROUP BY movieId
        ),
        imdb_ratings AS (
            SELECT
                movieId,
                averageRating/2 AS imdb_avg_rating,
                numVotes      AS imdb_ratings_count
            FROM movielens_ratings_imdb
        )
        SELECT
            a.movieId,
            a.characters,
            a.actor_name,
            a.title AS movie_title,
            ROUND(mr.ml_avg_rating, 2) AS ml_avg_rating,
            mr.ml_ratings_count,
            ir.imdb_avg_rating,
            ir.imdb_ratings_count
        FROM movielens_actors AS a
        LEFT JOIN ml_ratings AS mr
            ON mr.movieId = a.movieId
        LEFT JOIN imdb_ratings AS ir
            ON ir.movieId = a.movieId
        WHERE LOWER(a.characters) LIKE LOWER('%' || ? || '%')
        ORDER BY movie_title, actor_name;
    """
    return con.execute(query, [character]).df()



### 1.3 Function to search movies by Characters and movies title

In [5]:
def search_by_character_and_movies(character, movie_title):
    query = """
        WITH ml_ratings AS (
            SELECT
                movieId,
                AVG(rating) AS ml_avg_rating,
                COUNT(*) AS ml_ratings_count
            FROM ratings
            GROUP BY movieId
        ),
        imdb_ratings AS (
            SELECT
                movieId,
                averageRating / 2 AS imdb_avg_rating,
                numVotes AS imdb_ratings_count
            FROM movielens_ratings_imdb
        )
        SELECT
            a.movieId,
            a.characters,
            a.actor_name,
            a.title AS movie_title,
            ROUND(mr.ml_avg_rating, 2) AS ml_avg_rating,
            mr.ml_ratings_count,
            ir.imdb_avg_rating,
            ir.imdb_ratings_count
        FROM movielens_actors AS a
        LEFT JOIN ml_ratings AS mr
            ON mr.movieId = a.movieId
        LEFT JOIN imdb_ratings AS ir
            ON ir.movieId = a.movieId
        WHERE LOWER(a.characters) LIKE LOWER('%' || ? || '%')
          AND LOWER(a.title) LIKE LOWER('%' || ? || '%')
        ORDER BY movie_title, actor_name;
    """
    
    return con.execute(query, [character, movie_title]).df()


### 1.4 Function to search movies by Characters and actor name

In [6]:
def search_by_character_and_actor(character, actor_name):
    query = """
        WITH ml_ratings AS (
            SELECT
                movieId,
                AVG(rating) AS ml_avg_rating,
                COUNT(*) AS ml_ratings_count
            FROM ratings
            GROUP BY movieId
        ),
        imdb_ratings AS (
            SELECT
                movieId,
                averageRating / 2 AS imdb_avg_rating,
                numVotes AS imdb_ratings_count
            FROM movielens_ratings_imdb
        )
        SELECT
            a.movieId,
            a.characters,
            a.actor_name,
            a.title AS movie_title,
            ROUND(mr.ml_avg_rating, 2) AS ml_avg_rating,
            mr.ml_ratings_count,
            ir.imdb_avg_rating,
            ir.imdb_ratings_count
        FROM movielens_actors AS a
        LEFT JOIN ml_ratings AS mr
            ON mr.movieId = a.movieId
        LEFT JOIN imdb_ratings AS ir
            ON ir.movieId = a.movieId
        WHERE LOWER(a.characters) LIKE LOWER('%' || ? || '%')
          AND LOWER(a.actor_name) LIKE LOWER('%' || ? || '%')
        ORDER BY movie_title, actor_name;
    """
    
    return con.execute(query, [character, actor_name]).df()


### 1.5 Function to summarize functions of the result dataframes

In [7]:
import pandas as pd
import numpy as np

def summarize_character(df, character_name=None):
    """
    Summarize a dataframe (from your search_* functions)
    into a single-row dataframe with weighted averages
    for ML and IMDb ratings.
    """
    if df.empty:
        return pd.DataFrame([{
            "character": character_name,
            "movies_count": 0,
            "character_avg_rating": np.nan,
            "character_rating_count": 0,
            "character_imdb_rating": np.nan,
            "character_imdb_count": 0,
        }])
    
    # If character_name is not provided, try to infer it from the dataframe
    if character_name is None:
        # Take the most common or first character string
        character_name = df["characters"].iloc[0]
    
    # 1) number of distinct movies
    movies_count = df["movieId"].nunique()
    
    # 2) total ML ratings count
    character_rating_count = df["ml_ratings_count"].fillna(0).sum()
    
    # 3) weighted average ML rating
    #    sum(ml_avg_rating * ml_ratings_count) / sum(ml_ratings_count)
    ml_weights = df["ml_ratings_count"].fillna(0)
    ml_values = df["ml_avg_rating"]
    if (ml_weights > 0).any():
        character_avg_rating = (ml_values * ml_weights).sum() / ml_weights.sum()
    else:
        character_avg_rating = np.nan
    
    # 4) total IMDb ratings count
    character_imdb_count = df["imdb_ratings_count"].fillna(0).sum()
    
    # 5) weighted average IMDb rating
    imdb_weights = df["imdb_ratings_count"].fillna(0)
    imdb_values = df["imdb_avg_rating"]
    if (imdb_weights > 0).any():
        character_imdb_rating = (imdb_values * imdb_weights).sum() / imdb_weights.sum()
    else:
        character_imdb_rating = np.nan
    
    # Build single-row dataframe
    summary = pd.DataFrame([{
        "character": character_name,
        "movies_count": movies_count,
        "character_avg_rating": character_avg_rating,
        "character_rating_count": character_rating_count,
        "character_imdb_rating": character_imdb_rating,
        "character_imdb_count": character_imdb_count,
    }])
    
    return summary


## 2 Bigest Heroes Ever
### 2.1 Batman

In [8]:
df=search_by_character_and_movies("batman", "batman")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,176681,Batman,Kevin Conroy,Batman & Harley Quinn (2017),2.84,119,2.95,16303
1,1562,Batman,George Clooney,Batman & Robin (1997),2.19,12649,1.9,279340
2,91054,Batman,Lewis Wilson,Batman (1943),2.95,31,3.0,2691
3,26152,Batman,Adam West,Batman (1966),3.21,1092,3.25,37520
4,592,Batman,Michael Keaton,Batman (1989),3.39,56330,3.75,426506
5,33794,Batman,Christian Bale,Batman Begins (2005),3.92,43300,4.1,1681137
6,167762,Batman,Will Friedle,Batman Beyond Darwyn Cooke's Batman 75th Anniv...,3.35,52,3.9,2670
7,174957,Batman,Kevin Conroy,Batman Beyond: The Movie (1999),3.51,59,3.85,6909
8,174957,Batman,Will Friedle,Batman Beyond: The Movie (1999),3.51,59,3.85,6909
9,153,Batman,Val Kilmer,Batman Forever (1995),2.89,40052,2.75,279001


In [9]:
df_characters_heroes = summarize_character(df)
df_characters_heroes

Unnamed: 0,character,movies_count,character_avg_rating,character_rating_count,character_imdb_rating,character_imdb_count
0,Batman,32,3.288466,184975,3.611133,5272984


### 2.2 Superman

In [10]:
df=search_by_character_and_movies("superman", "superman")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,157631,Superman,Hwang Jung-min,A Man Who Was Superman (2008),3.47,19,3.6,2351
1,157631,Superman - child,Woo-hyuk Choi,A Man Who Was Superman (2008),3.47,19,3.6,2351
2,140415,Superman,Kirk Alyn,Atom Man vs Superman (1950),2.6,5,3.3,923
3,136864,Superman,Henry Cavill,Batman v Superman: Dawn of Justice (2016),2.7,4625,3.2,783257
4,140439,Superman,David Patrick Wilson,"It's A Bird, It's A Plane, It's Superman! (1975)",1.83,3,1.9,483
5,219488,Superman,George Reeves,Stamp Day for Superman (1954),4.0,3,2.7,526
6,140417,Superman,Kirk Alyn,Superman (1948),2.78,9,3.35,1362
7,2640,Superman,Christopher Reeve,Superman (1978),3.38,18453,3.7,204915
8,217461,Superman,Tim Daly,Superman - The Last Son of Krypton (1996),3.83,6,3.8,3405
9,2641,Superman,Christopher Reeve,Superman II (1980),3.1,10622,3.4,123108


In [11]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.3 James Bond

In [12]:
df= search_by_character("james bond")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,258113,James Bond,Bullet Prakash,Bajarangi (2013),3.5,1,3.0,497
1,262133,James Bond,Barry Nelson,Casino Royale (1954),2.5,1,2.8,1659
2,5796,Sir James Bond,David Niven,Casino Royale (1967),2.88,1120,2.5,34337
3,5796,Evelyn Tremble (James Bond - 007),Peter Sellers,Casino Royale (1967),2.88,1120,2.5,34337
4,49272,James Bond,Daniel Craig,Casino Royale (2006),3.84,28517,4.0,730459
5,3984,James Bond,Sean Connery,Diamonds Are Forever (1971),3.5,5992,3.25,120363
6,5872,James Bond,Pierce Brosnan,Die Another Day (2002),3.09,8720,3.05,237366
7,2949,James Bond,Sean Connery,Dr. No (1962),3.67,9694,3.6,190122
8,2989,James Bond,Roger Moore,For Your Eyes Only (1981),3.44,5212,3.35,113874
9,2948,James Bond,Sean Connery,From Russia with Love (1963),3.69,9586,3.65,154339


In [13]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.4 Harry Potter

In [14]:
df=search_by_character_and_movies("harry potter", "harry potter")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,5816,Harry Potter,Daniel Radcliffe,Harry Potter and the Chamber of Secrets (2002),3.65,31004,3.75,747314
1,81834,Harry Potter,Daniel Radcliffe,Harry Potter and the Deathly Hallows: Part 1 (...,3.84,21781,3.85,646989
2,88125,Harry Potter,Daniel Radcliffe,Harry Potter and the Deathly Hallows: Part 2 (...,3.9,20837,4.05,1020949
3,40815,Harry Potter,Daniel Radcliffe,Harry Potter and the Goblet of Fire (2005),3.77,27128,3.85,734105
4,69844,Harry Potter,Daniel Radcliffe,Harry Potter and the Half-Blood Prince (2009),3.83,21849,3.8,643099
5,54001,Harry Potter,Daniel Radcliffe,Harry Potter and the Order of the Phoenix (2007),3.76,21900,3.75,682500
6,8368,Harry Potter,Daniel Radcliffe,Harry Potter and the Prisoner of Azkaban (2004),3.82,32517,3.95,747555
7,4896,Harry Potter,Daniel Radcliffe,Harry Potter and the Sorcerer's Stone (a.k.a. ...,3.7,36127,3.85,926050
8,4896,Baby Harry Potter,Saunders Triplets,Harry Potter and the Sorcerer's Stone (a.k.a. ...,3.7,36127,3.85,926050


In [15]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.5 Frodo from the Lord of the Rings

In [16]:
df= search_by_character_and_actor("Frodo","Elijah Wood")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,4993,Frodo,Elijah Wood,"Lord of the Rings: The Fellowship of the Ring,...",4.1,79940,4.45,2155025
1,7153,Frodo,Elijah Wood,"Lord of the Rings: The Return of the King, The...",4.11,75512,4.5,2119280
2,5952,Frodo,Elijah Wood,"Lord of the Rings: The Two Towers, The (2002)",4.08,73687,4.4,1912609


In [17]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.6 Ellen Ripley from Alien

In [18]:
df= search_by_character_and_actor("Ripley","Sigourney Weaver")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,1214,Ripley,Sigourney Weaver,Alien (1979),4.07,46572,4.25,1041916
1,1690,Ripley,Sigourney Weaver,Alien: Resurrection (1997),3.04,14811,3.1,288107
2,1200,Ripley,Sigourney Weaver,Aliens (1986),4.01,40182,4.2,825997
3,1320,Ripley,Sigourney Weaver,Alien³ (a.k.a. Alien 3) (1992),3.12,17653,3.2,347622


In [19]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.7 John Mcclane from Die Hard

In [20]:
df= search_by_character("John McClane")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,80183,John McClane,Matthew Géczy,8th Wonderland (2008),3.19,8,3.05,522
1,1036,John McClane,Bruce Willis,Die Hard (1988),3.94,47472,4.1,1006333
2,1370,John McClane,Bruce Willis,Die Hard 2 (1990),3.46,20122,3.6,401180
3,165,John McClane,Bruce Willis,Die Hard: With a Vengeance (1995),3.52,43336,3.8,425324
4,100498,John McClane,Bruce Willis,"Good Day to Die Hard, A (2013)",2.55,1571,2.6,221135
5,53972,John McClane,Bruce Willis,Live Free or Die Hard (2007),3.43,8784,3.55,432349


In [21]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.8 Neo from Matrix

In [22]:
df= search_by_character_and_actor("Neo","Keanu Reeves")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,6365,Neo,Keanu Reeves,"Matrix Reloaded, The (2003)",3.38,30788,3.6,661283
1,6934,Neo,Keanu Reeves,"Matrix Revolutions, The (2003)",3.24,24470,3.35,568367
2,2571,Neo,Keanu Reeves,"Matrix, The (1999)",4.16,107056,4.35,2201161


In [23]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

## 3 Bigest Villains Ever
### 3.1 Darth Vader

In [24]:
df= search_by_character_and_movies("darth vader","Star Wars")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,261153,Darth Vader,Matt Sloan,LEGO Star Wars: The Empire Strikes Out (2012),1.13,8,3.55,1737
1,261155,Darth Vader,Phil LaMarr,LEGO Star Wars: The Padawan Menace (2011),2.31,8,3.5,2277
2,136485,Darth Vader,Abraham Benrubi,Robot Chicken: Star Wars (2007),3.48,290,4.0,8869
3,181355,Darth Vader,Abraham Benrubi,Robot Chicken: Star Wars Episode II (2008),3.37,68,4.0,5211
4,181357,Darth Vader,Abraham Benrubi,Robot Chicken: Star Wars Episode III (2010),3.6,97,4.0,4921
5,260,Darth Vader,David Prowse,Star Wars: Episode IV - A New Hope (1977),4.09,97202,4.3,1540245
6,1196,Darth Vader,David Prowse,Star Wars: Episode V - The Empire Strikes Back...,4.12,80200,4.35,1473937
7,1210,Darth Vader,James Earl Jones,Star Wars: Episode VI - Return of the Jedi (1983),3.98,76773,4.15,1187674
8,229523,Darth Vader,Jack Foley,Star Wars: Revelations,2.0,4,2.5,1207
9,229523,Darth Vader,Kevin Zabawa,Star Wars: Revelations,2.0,4,2.5,1207


In [25]:
df_characters_vilains = summarize_character(df)


### 3.2 Hannibal Lecter

In [26]:
df= search_by_character_and_actor("lecter","Anthony Hopkins")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,4148,Hannibal Lecter,Anthony Hopkins,Hannibal (2001),3.24,12067,3.4,309633
1,5630,Hannibal Lecter,Anthony Hopkins,Red Dragon (2002),3.56,9272,3.6,307694
2,593,Dr. Hannibal Lecter,Anthony Hopkins,"Silence of the Lambs, The (1991)",4.15,101802,4.3,1677627


In [27]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.3 Joker

In [28]:
df1= search_by_character_and_movies("joker","batman")
df2= search_by_character_and_actor("joker","Heath Ledger")

df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,26152,The Joker,Cesar Romero,Batman (1966),3.21,1092,3.25,37520
1,592,Joker,Jack Nicholson,Batman (1989),3.39,56330,3.75,426506
2,186985,The Joker,Wataru Takagi,Batman Ninja (2018),2.96,199,2.8,23798
3,140115,The Joker,Troy Baker,Batman Unlimited: Monster Mayhem (2015),2.09,40,2.8,3349
4,202099,Joker,Troy Baker,Batman vs. Teenage Mutant Ninja Turtles (2019),3.38,85,3.55,13715
5,178997,Joker,Jeff Bergman,Batman vs. Two-Face (2017),2.66,53,3.1,4933
6,182613,The Joker,Andrew Koenig,Batman: Dead End (2003),3.09,23,3.6,6371
7,165085,The Joker,Jeff Bergman,Batman: Return of the Caped Crusaders (2016),2.88,76,3.35,6881
8,161354,The Joker,Mark Hamill,Batman: The Killing Joke (2016),3.0,524,3.2,64770
9,165153,Joker,John DiMaggio,LEGO DC Comics Super Heroes: Batman: Be-League...,2.63,15,3.2,1777


In [29]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.4 Norman Bates

In [30]:
df= search_by_character("norman bates")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,202171,Norman Bates,Kurt Paul,Bates Motel (1987),2.28,9,1.9,2007.0
1,1219,Norman Bates,Anthony Perkins,Psycho (1960),4.06,28016,4.25,770757.0
2,2389,Norman Bates,Vince Vaughn,Psycho (1998),2.81,3704,2.3,52348.0
3,2902,Norman Bates,Anthony Perkins,Psycho II (1983),2.59,1342,3.3,34296.0
4,2903,Norman Bates,Anthony Perkins,Psycho III (1986),2.13,858,2.75,18016.0
5,184071,Norman Bates,Anthony Perkins,Psycho IV: The Beginning (1990),2.52,29,2.7,10739.0
6,211966,madre de Norman Bates,Silvia Gambino,WHAT DID JACK DO? (2017),3.28,130,,
7,161014,Norman Bates,Scott McGinnis,Wacko (1982),2.61,9,2.45,1604.0


In [31]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.5 Chucky

In [32]:
df= search_by_character_and_actor("Chucky","Brad Dourif")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,2315,Chucky,Brad Dourif,Bride of Chucky (Child's Play 4) (1998),2.21,2359,2.8,69145
1,1991,Chucky,Brad Dourif,Child's Play (1988),2.87,3464,3.35,129003
2,1992,Chucky,Brad Dourif,Child's Play 2 (1990),2.37,1562,3.0,63092
3,1993,Chucky,Brad Dourif,Child's Play 3 (1991),2.11,1249,2.6,50316
4,178447,Chucky,Brad Dourif,Cult of Chucky (2017),2.57,157,2.65,33288
5,8967,Chucky,Brad Dourif,Seed of Chucky (Child's Play 5) (2004),2.24,626,2.45,53576


In [33]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.6 Cruella de De vil

In [34]:
df= search_by_character("Cruella")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,1367,Cruella DeVil,Glenn Close,101 Dalmatians (1996),3.05,11168,2.9,122775
1,2085,Cruella De Vil,Betty Lou Gerson,101 Dalmatians (One Hundred and One Dalmatians...,3.43,10747,3.65,193075
2,121099,Cruella,Susanne Blakeslee,101 Dalmatians II: Patch's London Adventure (2...,2.87,129,2.85,11773
3,3991,Cruella de Vil,Glenn Close,102 Dalmatians (2000),2.38,2288,2.45,41347
4,249540,Cruella,Emma Stone,Cruella (2021),3.5,990,3.65,288845
5,174535,Cruella De Vil,Susanne Blakeslee,Mickey's House of Villains (2001),2.64,33,3.3,4536


In [35]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.7 Michael Corleone from the Godfather

In [36]:
df= search_by_character_and_actor("michael","pacino")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,858,Michael,Al Pacino,"Godfather, The (1972)",4.33,75004,4.6,2175958
1,1221,Michael,Al Pacino,"Godfather: Part II, The (1974)",4.27,47271,4.5,1462039
2,2023,Michael Corleone,Al Pacino,"Godfather: Part III, The (1990)",3.45,14446,3.8,447187


In [37]:
summary_df = summarize_character(df,"Michael Corleone")
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.8 Loki da serie Thor

In [38]:
df= search_by_character_and_actor("loki","Tom Hiddleston")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,89745,Loki,Tom Hiddleston,"Avengers, The (2012)",3.74,27495,4.0,1533308
1,86332,Loki,Tom Hiddleston,Thor (2011),3.32,14900,3.5,950245
2,122916,Loki,Tom Hiddleston,Thor: Ragnarok (2017),3.9,14231,3.95,876024
3,106072,Loki,Tom Hiddleston,Thor: The Dark World (2013),3.19,8374,3.35,765034


In [39]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

## 4 Competitions
### 4.1 Summary of characteres in competition
#### 4.1.1 Heroes

In [40]:
df_characters_heroes

Unnamed: 0,character,movies_count,character_avg_rating,character_rating_count,character_imdb_rating,character_imdb_count
0,Batman,32,3.288466,184975,3.611133,5272984
1,Superman,17,2.980822,50456,3.177828,1602564
2,James Bond,31,3.485162,233408,3.520106,6134640
3,Harry Potter,8,3.762671,249270,3.864673,7074611
4,Frodo,3,4.096864,229139,4.45167,6186914
5,Ripley,4,3.781146,119218,3.955379,2503642
6,John McClane,6,3.655324,121293,3.738807,2486843
7,Neo,3,3.873352,162314,4.039773,3430811


#### 4.1.1 Vilains

In [41]:
df_characters_vilains

Unnamed: 0,character,movies_count,character_avg_rating,character_rating_count,character_imdb_rating,character_imdb_count
0,Darth Vader,10,4.062404,254902,4.258672,4245356
1,Hannibal Lecter,3,4.016402,123141,4.084721,2294954
2,The Joker,14,3.802336,126184,4.35675,3889184
3,Norman Bates,8,3.812652,34097,4.041032,889767
4,Chucky,6,2.47405,9417,2.924898,398420
5,Cruella DeVil,6,3.166728,25355,3.419452,662351
6,Michael Corleone,3,4.216274,136721,4.476639,4085184
7,Loki,4,3.607896,65000,3.753626,4124611


### 4.2 Auxiliary functions to deal with the "competition"

In [42]:
# ---- Helper functions for scores ----

def total_ratings(row):
    """Total number of ratings (MovieLens + IMDb)."""
    return row["character_rating_count"] + row["character_imdb_count"]

def weighted_global_rating(row):
    """Weighted average of MovieLens and IMDb ratings."""
    ml_count = row["character_rating_count"]
    imdb_count = row["character_imdb_count"]
    ml_rating = row["character_avg_rating"]
    imdb_rating = row["character_imdb_rating"]
    
    total_count = ml_count + imdb_count
    if total_count == 0:
        return 0.0
    return (ml_rating * ml_count + imdb_rating * imdb_count) / total_count

def product_score(row):
    """Score based on rating * count for both MovieLens and IMDb."""
    ml_part = row["character_avg_rating"] * row["character_rating_count"]
    imdb_part = row["character_imdb_rating"] * row["character_imdb_count"]
    return ml_part + imdb_part


# ---- Function to play one knockout round ----

def play_round(df, score_func, round_name="Round", random_state=42, side_label=""):
    """
    Play a knockout round:
        - Shuffle contestants
        - Pair them 1 vs 1
        - Winner decided by score_func(row)
    Returns:
        winners_df, matches (list of dicts with match results)
    """
    # Shuffle to randomize matchups
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    winners = []
    matches = []

    # We assume an even number of rows (8, 4, 2, etc.)
    for i in range(0, len(shuffled), 2):
        p1 = shuffled.iloc[i]
        p2 = shuffled.iloc[i + 1]
        
        s1 = score_func(p1)
        s2 = score_func(p2)
        
        # Decide winner (tie goes to p1)
        winner = p1 if s1 >= s2 else p2
        
        matches.append({
            "round": round_name,
            "side": side_label,
            "character_1": p1["character"],
            "score_1": s1,
            "character_2": p2["character"],
            "score_2": s2,
            "winner": winner["character"],
        })
        
        winners.append(winner)

    winners_df = pd.DataFrame(winners).reset_index(drop=True)
    return winners_df, matches


# ---- Main tournament function ----

def run_tournament(df_characters_heroes, df_characters_vilains, base_seed=42):
    """
    Run the heroes vs villains tournament.
    
    Input:
        df_characters_heroes  - dataframe with 8 heroes
        df_characters_vilains - dataframe with 8 villains
        
    Returns:
        results: dict with
            - "matches": list of all match dicts
            - "hero_champion": row (Series) of hero champion
            - "villain_champion": row (Series) of villain champion
            - "grand_final": dict with final match info
    """
    all_matches = []
    
    # ---------- HEROES BRACKET ----------
    # Round 1: 8 -> 4 (by total ratings)
    heroes_r1_winners, matches = play_round(
        df_characters_heroes,
        score_func=total_ratings,
        round_name="Heroes R1 (total ratings)",
        random_state=base_seed,
        side_label="heroes"
    )
    all_matches.extend(matches)
    
    # Round 2: 4 -> 2 (by weighted global rating)
    heroes_r2_winners, matches = play_round(
        heroes_r1_winners,
        score_func=weighted_global_rating,
        round_name="Heroes R2 (weighted rating)",
        random_state=base_seed + 1,
        side_label="heroes"
    )
    all_matches.extend(matches)
    
    # Round 3: 2 -> 1 champion (by product score)
    heroes_champion_df, matches = play_round(
        heroes_r2_winners,
        score_func=product_score,
        round_name="Heroes Final (product score)",
        random_state=base_seed + 2,
        side_label="heroes"
    )
    all_matches.extend(matches)
    hero_champion = heroes_champion_df.iloc[0]
    
    # ---------- VILLAINS BRACKET ----------
    villains_r1_winners, matches = play_round(
        df_characters_vilains,
        score_func=total_ratings,
        round_name="Villains R1 (total ratings)",
        random_state=base_seed,
        side_label="villains"
    )
    all_matches.extend(matches)
    
    villains_r2_winners, matches = play_round(
        villains_r1_winners,
        score_func=weighted_global_rating,
        round_name="Villains R2 (weighted rating)",
        random_state=base_seed + 1,
        side_label="villains"
    )
    all_matches.extend(matches)
    
    villains_champion_df, matches = play_round(
        villains_r2_winners,
        score_func=product_score,
        round_name="Villains Final (product score)",
        random_state=base_seed + 2,
        side_label="villains"
    )
    all_matches.extend(matches)
    villain_champion = villains_champion_df.iloc[0]
    
    # ---------- GRAND FINAL ----------
    # Hero champion vs Villain champion using product_score again
    grand_final_contestants = pd.DataFrame([hero_champion, villain_champion]).reset_index(drop=True)
    gf_winners_df, gf_matches = play_round(
        grand_final_contestants,
        score_func=product_score,
        round_name="Grand Final (Hero vs Villain)",
        random_state=base_seed + 3,
        side_label="grand_final"
    )
    all_matches.extend(gf_matches)
    
    grand_final = gf_matches[0]  # only one match
    grand_champion = gf_winners_df.iloc[0]
    
    results = {
        "matches": all_matches,
        "hero_champion": hero_champion,
        "villain_champion": villain_champion,
        "grand_final": grand_final,
        "grand_champion": grand_champion,
    }
    
    return results


### 4.3 Figth!

In this chapter the competition is triggerd by the lauch of the of the sub "run_tournament"

In [43]:
results = run_tournament(df_characters_heroes, df_characters_vilains, base_seed=42)

### 4.4 Competiton results

#### 4.4.1 All rounds

In [44]:
# All matches as a DataFrame
matches_df = pd.DataFrame(results["matches"])
matches_df

Unnamed: 0,round,side,character_1,score_1,character_2,score_2,winner
0,Heroes R1 (total ratings),heroes,Superman,1653020.0,Ripley,2622860.0,Ripley
1,Heroes R1 (total ratings),heroes,Batman,5457959.0,Neo,3593125.0,Batman
2,Heroes R1 (total ratings),heroes,James Bond,6368048.0,Frodo,6416053.0,Frodo
3,Heroes R1 (total ratings),heroes,Harry Potter,7323881.0,John McClane,2608136.0,Harry Potter
4,Heroes R2 (weighted rating),heroes,Frodo,4.438999,Batman,3.600198,Frodo
5,Heroes R2 (weighted rating),heroes,Harry Potter,3.861202,Ripley,3.947459,Ripley
6,Heroes Final (product score),heroes,Ripley,10353630.0,Frodo,28480850.0,Frodo
7,Villains R1 (total ratings),villains,Hannibal Lecter,2418095.0,Cruella DeVil,687706.0,Hannibal Lecter
8,Villains R1 (total ratings),villains,Darth Vader,4500258.0,Loki,4189611.0,Darth Vader
9,Villains R1 (total ratings),villains,The Joker,4015368.0,Chucky,407837.0,The Joker


#### 4.4.2 Results of the hero champion

In [45]:
# Campeão dos heróis
results["hero_champion"]

character                    Frodo
movies_count                     3
character_avg_rating      4.096864
character_rating_count      229139
character_imdb_rating      4.45167
character_imdb_count       6186914
Name: 0, dtype: object

#### 4.4.3 Results of the Vilain Champion

In [46]:
# Campeão dos vilões
results["villain_champion"]

character                 Michael Corleone
movies_count                             3
character_avg_rating              4.216274
character_rating_count              136721
character_imdb_rating             4.476639
character_imdb_count               4085184
Name: 0, dtype: object

#### 4.4.4 Grand Finale

In [47]:
# Finalíssima (detalhe do combate final)
results["grand_final"]

{'round': 'Grand Final (Hero vs Villain)',
 'side': 'grand_final',
 'character_1': 'Frodo',
 'score_1': np.float64(28480852.130000006),
 'character_2': 'Michael Corleone',
 'score_2': np.float64(18864346.09),
 'winner': 'Frodo'}

#### 4.4.5 Final result

In [48]:
# Grande campeão absoluto
results["grand_champion"]

character                    Frodo
movies_count                     3
character_avg_rating      4.096864
character_rating_count      229139
character_imdb_rating      4.45167
character_imdb_count       6186914
Name: 0, dtype: object

## 5 Close conection to duckdb

In [49]:
con.close()
print("Ligação fechada.")

Ligação fechada.


## 6 GARBAGE TO DELETE

### 2.5 Mr. Incredible from the Incredibles

In [50]:
#df= search_by_character("Mr. incredible")
#df

In [51]:
#summary_df = summarize_character(df)
#df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)
#df_characters_heroes

### 2.6 Indiana Jones

In [52]:
#df= search_by_character_and_actor("indiana Jones","Harrison Ford")
#df

In [53]:
#summary_df = summarize_character(df)
#df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)
#df_characters_heroes