In [1248]:
#================================================
# DATA
#================================================

import duckdb, pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

# creating the conection to the duckdb database file:
con = duckdb.connect("movielens100K.duckdb")

In [1249]:
IMDB_DIR = Path("..") / "data" / "Imdb"
IMDB_DIR

WindowsPath('../data/Imdb')

## 1 Functions to help in the search for all time know Characters
### 1.1 Ad-hoc Query Characters and actors_name by movieID

In [1250]:
def search_by_movie_id(movie_id):
    query = """
        WITH ml_ratings AS (
            SELECT
                movieId,
                AVG(rating) AS ml_avg_rating,
                COUNT(*) AS ml_ratings_count
            FROM ratings
            GROUP BY movieId
        ),
        imdb_ratings AS (
            SELECT
                movieId,
                averageRating / 2 AS imdb_avg_rating,
                numVotes AS imdb_ratings_count
            FROM movielens_ratings_imdb
        )
        SELECT
            a.movieId,
            a.characters,
            a.actor_name,
            a.title AS movie_title,
            ROUND(mr.ml_avg_rating, 2) AS ml_avg_rating,
            mr.ml_ratings_count,
            ir.imdb_avg_rating,
            ir.imdb_ratings_count
        FROM movielens_actors AS a
        LEFT JOIN ml_ratings AS mr
            ON mr.movieId = a.movieId
        LEFT JOIN imdb_ratings AS ir
            ON ir.movieId = a.movieId
        WHERE a.movieId = ?
        ORDER BY movie_title, actor_name;
    """
    
    return con.execute(query, [movie_id]).df()



### 1.2 Function to search movies by Characters

In [1251]:
def search_by_character(character):
    query = """
        WITH ml_ratings AS (
            SELECT
                movieId,
                AVG(rating) AS ml_avg_rating,
                COUNT(*)    AS ml_ratings_count
            FROM ratings
            GROUP BY movieId
        ),
        imdb_ratings AS (
            SELECT
                movieId,
                averageRating/2 AS imdb_avg_rating,
                numVotes      AS imdb_ratings_count
            FROM movielens_ratings_imdb
        )
        SELECT
            a.movieId,
            a.characters,
            a.actor_name,
            a.title AS movie_title,
            ROUND(mr.ml_avg_rating, 2) AS ml_avg_rating,
            mr.ml_ratings_count,
            ir.imdb_avg_rating,
            ir.imdb_ratings_count
        FROM movielens_actors AS a
        LEFT JOIN ml_ratings AS mr
            ON mr.movieId = a.movieId
        LEFT JOIN imdb_ratings AS ir
            ON ir.movieId = a.movieId
        WHERE LOWER(a.characters) LIKE LOWER('%' || ? || '%')
        ORDER BY movie_title, actor_name;
    """
    return con.execute(query, [character]).df()



### 1.3 Function to search movies by Characters and movies title

In [1252]:
def search_by_character_and_movies(character, movie_title):
    query = """
        WITH ml_ratings AS (
            SELECT
                movieId,
                AVG(rating) AS ml_avg_rating,
                COUNT(*) AS ml_ratings_count
            FROM ratings
            GROUP BY movieId
        ),
        imdb_ratings AS (
            SELECT
                movieId,
                averageRating / 2 AS imdb_avg_rating,
                numVotes AS imdb_ratings_count
            FROM movielens_ratings_imdb
        )
        SELECT
            a.movieId,
            a.characters,
            a.actor_name,
            a.title AS movie_title,
            ROUND(mr.ml_avg_rating, 2) AS ml_avg_rating,
            mr.ml_ratings_count,
            ir.imdb_avg_rating,
            ir.imdb_ratings_count
        FROM movielens_actors AS a
        LEFT JOIN ml_ratings AS mr
            ON mr.movieId = a.movieId
        LEFT JOIN imdb_ratings AS ir
            ON ir.movieId = a.movieId
        WHERE LOWER(a.characters) LIKE LOWER('%' || ? || '%')
          AND LOWER(a.title) LIKE LOWER('%' || ? || '%')
        ORDER BY movie_title, actor_name;
    """
    
    return con.execute(query, [character, movie_title]).df()


### 1.4 Function to search movies by Characters and actor name

In [1253]:
def search_by_character_and_actor(character, actor_name):
    query = """
        WITH ml_ratings AS (
            SELECT
                movieId,
                AVG(rating) AS ml_avg_rating,
                COUNT(*) AS ml_ratings_count
            FROM ratings
            GROUP BY movieId
        ),
        imdb_ratings AS (
            SELECT
                movieId,
                averageRating / 2 AS imdb_avg_rating,
                numVotes AS imdb_ratings_count
            FROM movielens_ratings_imdb
        )
        SELECT
            a.movieId,
            a.characters,
            a.actor_name,
            a.title AS movie_title,
            ROUND(mr.ml_avg_rating, 2) AS ml_avg_rating,
            mr.ml_ratings_count,
            ir.imdb_avg_rating,
            ir.imdb_ratings_count
        FROM movielens_actors AS a
        LEFT JOIN ml_ratings AS mr
            ON mr.movieId = a.movieId
        LEFT JOIN imdb_ratings AS ir
            ON ir.movieId = a.movieId
        WHERE LOWER(a.characters) LIKE LOWER('%' || ? || '%')
          AND LOWER(a.actor_name) LIKE LOWER('%' || ? || '%')
        ORDER BY movie_title, actor_name;
    """
    
    return con.execute(query, [character, actor_name]).df()


### 1.5 Function to summarize functions of the result dataframes

In [1254]:
import pandas as pd
import numpy as np

def summarize_character(df, character_name=None):
    """
    Summarize a dataframe (from your search_* functions)
    into a single-row dataframe with weighted averages
    for ML and IMDb ratings.
    """
    if df.empty:
        return pd.DataFrame([{
            "character": character_name,
            "movies_count": 0,
            "character_avg_rating": np.nan,
            "character_rating_count": 0,
            "character_imdb_rating": np.nan,
            "character_imdb_count": 0,
        }])
    
    # If character_name is not provided, try to infer it from the dataframe
    if character_name is None:
        # Take the most common or first character string
        character_name = df["characters"].iloc[0]
    
    # 1) number of distinct movies
    movies_count = df["movieId"].nunique()
    
    # 2) total ML ratings count
    character_rating_count = df["ml_ratings_count"].fillna(0).sum()
    
    # 3) weighted average ML rating
    #    sum(ml_avg_rating * ml_ratings_count) / sum(ml_ratings_count)
    ml_weights = df["ml_ratings_count"].fillna(0)
    ml_values = df["ml_avg_rating"]
    if (ml_weights > 0).any():
        character_avg_rating = (ml_values * ml_weights).sum() / ml_weights.sum()
    else:
        character_avg_rating = np.nan
    
    # 4) total IMDb ratings count
    character_imdb_count = df["imdb_ratings_count"].fillna(0).sum()
    
    # 5) weighted average IMDb rating
    imdb_weights = df["imdb_ratings_count"].fillna(0)
    imdb_values = df["imdb_avg_rating"]
    if (imdb_weights > 0).any():
        character_imdb_rating = (imdb_values * imdb_weights).sum() / imdb_weights.sum()
    else:
        character_imdb_rating = np.nan
    
    # Build single-row dataframe
    summary = pd.DataFrame([{
        "character": character_name,
        "movies_count": movies_count,
        "character_avg_rating": character_avg_rating,
        "character_rating_count": character_rating_count,
        "character_imdb_rating": character_imdb_rating,
        "character_imdb_count": character_imdb_count,
    }])
    
    return summary


## 2 Bigest Heroes Ever
### 2.1 Batman

In [1255]:
df=search_by_character_and_movies("batman", "batman")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,1562,Batman,George Clooney,Batman & Robin (1997),2.21,42,1.9,279184
1,26152,Batman,Adam West,Batman (1966),3.0,5,3.25,37499
2,592,Batman,Michael Keaton,Batman (1989),3.43,189,3.75,426194
3,33794,Batman,Christian Bale,Batman Begins (2005),3.86,116,4.1,1679844
4,27311,Batman,Will Friedle,Batman Beyond: Return of the Joker (2000),3.5,3,3.85,31584
5,153,Batman,Val Kilmer,Batman Forever (1995),2.92,137,2.75,278849
6,1377,Batman,Michael Keaton,Batman Returns (1992),3.03,60,3.55,345431
7,136864,Batman,Ben Affleck,Batman v Superman: Dawn of Justice (2016),2.34,16,3.2,782923
8,131739,Batman,Jason O'Mara,Batman vs. Robin (2015),3.0,1,3.55,27809
9,27155,Batman,Kevin Conroy,"Batman/Superman Movie, The (1998)",2.0,1,3.85,10579


In [1256]:
df_characters_heroes = summarize_character(df)
df_characters_heroes

Unnamed: 0,character,movies_count,character_avg_rating,character_rating_count,character_imdb_rating,character_imdb_count
0,Batman,22,3.273393,619,3.585496,4573403


### 2.2 Superman

In [1257]:
df=search_by_character_and_movies("superman", "superman")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,84414,Superman,James Denton,All-Star Superman (2011),4.25,2,3.4,17873
1,136864,Superman,Henry Cavill,Batman v Superman: Dawn of Justice (2016),2.34,16,3.2,782923
2,27155,Superman,Tim Daly,"Batman/Superman Movie, The (1998)",2.0,1,3.85,10579
3,2640,Superman,Christopher Reeve,Superman (1978),3.61,61,3.7,204761
4,2641,Superman,Christopher Reeve,Superman II (1980),3.02,49,3.4,123048
5,2642,Superman,Christopher Reeve,Superman III (1983),2.25,22,2.5,80281
6,2643,Superman,Christopher Reeve,Superman IV: The Quest for Peace (1987),1.69,16,1.9,57541
7,46530,Superman,Brandon Routh,Superman Returns (2006),3.08,25,3.05,301022
8,95149,Superman,Tim Daly,Superman/Batman: Public Enemies (2009),5.0,1,3.55,26904
9,95004,Superman,Adam Baldwin,Superman/Doomsday (2007),4.0,1,3.45,24580


In [1258]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.3 James Bond

In [1259]:
df= search_by_character("james bond")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,5796,Sir James Bond,David Niven,Casino Royale (1967),2.5,5,2.5,34312
1,5796,Evelyn Tremble (James Bond - 007),Peter Sellers,Casino Royale (1967),2.5,5,2.5,34312
2,49272,James Bond,Daniel Craig,Casino Royale (2006),3.94,81,4.0,729832
3,3984,James Bond,Sean Connery,Diamonds Are Forever (1971),3.5,21,3.25,120225
4,5872,James Bond,Pierce Brosnan,Die Another Day (2002),2.93,27,3.05,237203
5,2949,James Bond,Sean Connery,Dr. No (1962),3.71,33,3.6,189896
6,2989,James Bond,Roger Moore,For Your Eyes Only (1981),3.55,21,3.35,113752
7,2948,James Bond,Sean Connery,From Russia with Love (1963),3.75,36,3.65,154124
8,10,James Bond,Pierce Brosnan,GoldenEye (1995),3.5,132,3.6,281209
9,2947,James Bond,Sean Connery,Goldfinger (1964),3.84,53,3.85,212708


In [1260]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.4 Harry Potter

In [1261]:
df=search_by_character_and_movies("harry potter", "harry potter")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,5816,Harry Potter,Daniel Radcliffe,Harry Potter and the Chamber of Secrets (2002),3.6,102,3.75,746380
1,81834,Harry Potter,Daniel Radcliffe,Harry Potter and the Deathly Hallows: Part 1 (...,3.99,47,3.85,646233
2,88125,Harry Potter,Daniel Radcliffe,Harry Potter and the Deathly Hallows: Part 2 (...,3.91,50,4.05,1019944
3,40815,Harry Potter,Daniel Radcliffe,Harry Potter and the Goblet of Fire (2005),3.82,71,3.85,733319
4,69844,Harry Potter,Daniel Radcliffe,Harry Potter and the Half-Blood Prince (2009),3.89,58,3.8,642414
5,54001,Harry Potter,Daniel Radcliffe,Harry Potter and the Order of the Phoenix (2007),3.86,58,3.75,681723
6,8368,Harry Potter,Daniel Radcliffe,Harry Potter and the Prisoner of Azkaban (2004),3.91,93,3.95,746697
7,4896,Harry Potter,Daniel Radcliffe,Harry Potter and the Sorcerer's Stone (a.k.a. ...,3.76,107,3.85,924808
8,4896,Baby Harry Potter,Saunders Triplets,Harry Potter and the Sorcerer's Stone (a.k.a. ...,3.76,107,3.85,924808


In [1262]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.5 Frodo from the Lord of the Rings

In [1263]:
df= search_by_character_and_actor("Frodo","Elijah Wood")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,4993,Frodo,Elijah Wood,"Lord of the Rings: The Fellowship of the Ring,...",4.11,198,4.45,2152498
1,7153,Frodo,Elijah Wood,"Lord of the Rings: The Return of the King, The...",4.12,185,4.5,2117156
2,5952,Frodo,Elijah Wood,"Lord of the Rings: The Two Towers, The (2002)",4.02,188,4.4,1910440


In [1264]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.6 Ellen Ripley from Alien

In [1265]:
df= search_by_character_and_actor("Ripley","Sigourney Weaver")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,1214,Ripley,Sigourney Weaver,Alien (1979),3.97,146,4.25,1040401
1,1690,Ripley,Sigourney Weaver,Alien: Resurrection (1997),2.92,45,3.1,287853
2,1200,Ripley,Sigourney Weaver,Aliens (1986),3.96,126,4.2,824656
3,1320,Ripley,Sigourney Weaver,Alien³ (a.k.a. Alien 3) (1992),3.15,47,3.2,347338


In [1266]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.7 John Mcclane from Die Hard

In [1267]:
df= search_by_character("John McClane")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,1036,John McClane,Bruce Willis,Die Hard (1988),3.86,145,4.1,1005560
1,1370,John McClane,Bruce Willis,Die Hard 2 (1990),3.4,68,3.6,400971
2,165,John McClane,Bruce Willis,Die Hard: With a Vengeance (1995),3.56,144,3.8,425108
3,100498,John McClane,Bruce Willis,"Good Day to Die Hard, A (2013)",2.08,6,2.6,221078
4,53972,John McClane,Bruce Willis,Live Free or Die Hard (2007),3.41,32,3.55,432223


In [1268]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

### 2.8 Neo from Matrix

In [1269]:
df= search_by_character_and_actor("Neo","Keanu Reeves")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,6365,Neo,Keanu Reeves,"Matrix Reloaded, The (2003)",3.35,96,3.6,660874
1,6934,Neo,Keanu Reeves,"Matrix Revolutions, The (2003)",3.15,79,3.35,568043
2,2571,Neo,Keanu Reeves,"Matrix, The (1999)",4.19,278,4.35,2198642


In [1270]:
summary_df = summarize_character(df)
df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)

## 3 Bigest Villains Ever
### 3.1 Darth Vader

In [1271]:
df= search_by_character_and_movies("darth vader","Star Wars")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,260,Darth Vader,David Prowse,Star Wars: Episode IV - A New Hope (1977),4.23,251,4.3,1538496
1,1196,Darth Vader,David Prowse,Star Wars: Episode V - The Empire Strikes Back...,4.22,211,4.35,1472190
2,1210,Darth Vader,James Earl Jones,Star Wars: Episode VI - Return of the Jedi (1983),4.14,196,4.15,1186668
3,135216,Darth Vader,James Earl Jones,The Star Wars Holiday Special (1978),0.5,1,1.1,18052


In [1272]:
df_characters_vilains = summarize_character(df)


### 3.2 Hannibal Lecter

In [1273]:
df= search_by_character_and_actor("lecter","Anthony Hopkins")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,4148,Hannibal Lecter,Anthony Hopkins,Hannibal (2001),2.94,39,3.4,309421
1,5630,Hannibal Lecter,Anthony Hopkins,Red Dragon (2002),3.44,31,3.6,307464
2,593,Dr. Hannibal Lecter,Anthony Hopkins,"Silence of the Lambs, The (1991)",4.16,279,4.3,1675121


In [1274]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.3 Joker

In [1275]:
df1= search_by_character_and_movies("joker","batman")
df2= search_by_character_and_actor("joker","Heath Ledger")

df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,26152,The Joker,Cesar Romero,Batman (1966),3.0,5,3.25,37499
1,592,Joker,Jack Nicholson,Batman (1989),3.43,189,3.75,426194
2,27311,The Joker,Mark Hamill,Batman Beyond: Return of the Joker (2000),3.5,3,3.85,31584
3,27155,The Joker,Mark Hamill,"Batman/Superman Movie, The (1998)",2.0,1,3.85,10579
4,113278,Joker,Troy Baker,Batman: Assault on Arkham (2014),3.0,1,3.7,39500
5,99813,Joker,Michael Emerson,"Batman: The Dark Knight Returns, Part 2 (2013)",3.88,8,4.15,60437
6,161354,The Joker,Mark Hamill,Batman: The Killing Joke (2016),3.0,1,3.2,64721
7,79274,Joker,John DiMaggio,Batman: Under the Red Hood (2010),3.67,3,4.0,72625
8,103233,The Joker,Christopher Corey Smith,LEGO Batman: The Movie - DC Heroes Unite (2013),3.0,1,3.2,6107
9,167746,Joker,Zach Galifianakis,The Lego Batman Movie (2017),3.71,7,3.65,187623


In [1276]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.4 Norman Bates

In [1277]:
df= search_by_character("norman bates")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,1219,Norman Bates,Anthony Perkins,Psycho (1960),4.04,83,4.25,769531
1,2389,Norman Bates,Vince Vaughn,Psycho (1998),1.89,9,2.3,52328
2,2902,Norman Bates,Anthony Perkins,Psycho II (1983),2.5,4,3.3,34265
3,2903,Norman Bates,Anthony Perkins,Psycho III (1986),2.5,2,2.75,17997


In [1278]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.5 Chucky

In [1279]:
df= search_by_character_and_actor("Chucky","Brad Dourif")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,2315,Chucky,Brad Dourif,Bride of Chucky (Child's Play 4) (1998),2.17,6,2.8,69094
1,1991,Chucky,Brad Dourif,Child's Play (1988),2.71,12,3.35,128788
2,1992,Chucky,Brad Dourif,Child's Play 2 (1990),2.38,4,3.0,63013
3,1993,Chucky,Brad Dourif,Child's Play 3 (1991),1.8,5,2.6,50271
4,8967,Chucky,Brad Dourif,Seed of Chucky (Child's Play 5) (2004),1.75,4,2.45,53532


In [1280]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.6 Cruella de De vil

In [1281]:
df= search_by_character("Cruella")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,1367,Cruella DeVil,Glenn Close,101 Dalmatians (1996),3.07,47,2.9,122696
1,2085,Cruella De Vil,Betty Lou Gerson,101 Dalmatians (One Hundred and One Dalmatians...,3.43,44,3.65,192916
2,121099,Cruella,Susanne Blakeslee,101 Dalmatians II: Patch's London Adventure (2...,2.5,1,2.85,11760
3,3991,Cruella de Vil,Glenn Close,102 Dalmatians (2000),2.78,9,2.45,41319


In [1282]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.7 Michael Corleone from the Godfather

In [1283]:
df= search_by_character_and_actor("michael","pacino")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,858,Michael,Al Pacino,"Godfather, The (1972)",4.29,192,4.6,2173294
1,1221,Michael,Al Pacino,"Godfather: Part II, The (1974)",4.26,129,4.5,1460084
2,2023,Michael Corleone,Al Pacino,"Godfather: Part III, The (1990)",3.36,45,3.8,446783


In [1284]:
summary_df = summarize_character(df,"Michael Corleone")
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

### 3.8 Loki da serie Thor

In [1285]:
df= search_by_character_and_actor("loki","Tom Hiddleston")
df

Unnamed: 0,movieId,characters,actor_name,movie_title,ml_avg_rating,ml_ratings_count,imdb_avg_rating,imdb_ratings_count
0,89745,Loki,Tom Hiddleston,"Avengers, The (2012)",3.87,69,4.0,1532467
1,86332,Loki,Tom Hiddleston,Thor (2011),3.51,34,3.5,949663
2,122916,Loki,Tom Hiddleston,Thor: Ragnarok (2017),4.03,20,3.95,875233
3,106072,Loki,Tom Hiddleston,Thor: The Dark World (2013),3.31,21,3.35,764561


In [1286]:
summary_df = summarize_character(df)
df_characters_vilains = pd.concat([df_characters_vilains, summary_df], ignore_index=True)

## 4 Competitions
### 4.1 Summary of characteres in competition
#### 4.1.1 Heroes

In [1287]:
df_characters_heroes

Unnamed: 0,character,movies_count,character_avg_rating,character_rating_count,character_imdb_rating,character_imdb_count
0,Batman,22,3.273393,619,3.585496,4573403
1,Superman,10,2.988051,195,3.189719,1654092
2,Sir James Bond,26,3.50513,807,3.509532,5650867
3,Harry Potter,8,3.808398,693,3.864679,7066326
4,Frodo,3,4.083608,571,4.451672,6180094
5,Ripley,4,3.730852,364,3.955242,2500248
6,John McClane,5,3.607949,395,3.738882,2484940
7,Neo,3,3.830618,453,4.039663,3427559


#### 4.1.1 Vilains

In [1288]:
df_characters_vilains

Unnamed: 0,character,movies_count,character_avg_rating,character_rating_count,character_imdb_rating,character_imdb_count
0,Darth Vader,4,4.19437,659,4.261532,4215406
1,Hannibal Lecter,3,3.959713,349,4.084598,2292006
2,The Joker,11,3.762364,368,4.356088,4029620
3,Norman Bates,4,3.748265,98,4.065144,874121
4,Chucky,5,2.292258,31,2.949838,364698
5,Cruella DeVil,4,3.195347,101,3.240408,368691
6,Michael Corleone,3,4.165082,366,4.476614,4080161
7,Loki,4,3.725556,144,3.75362,4121924


### 4.2 Auxiliary functions to deal with the "competition"

In [1289]:
# ---- Helper functions for scores ----

def total_ratings(row):
    """Total number of ratings (MovieLens + IMDb)."""
    return row["character_rating_count"] + row["character_imdb_count"]

def weighted_global_rating(row):
    """Weighted average of MovieLens and IMDb ratings."""
    ml_count = row["character_rating_count"]
    imdb_count = row["character_imdb_count"]
    ml_rating = row["character_avg_rating"]
    imdb_rating = row["character_imdb_rating"]
    
    total_count = ml_count + imdb_count
    if total_count == 0:
        return 0.0
    return (ml_rating * ml_count + imdb_rating * imdb_count) / total_count

def product_score(row):
    """Score based on rating * count for both MovieLens and IMDb."""
    ml_part = row["character_avg_rating"] * row["character_rating_count"]
    imdb_part = row["character_imdb_rating"] * row["character_imdb_count"]
    return ml_part + imdb_part


# ---- Function to play one knockout round ----

def play_round(df, score_func, round_name="Round", random_state=42, side_label=""):
    """
    Play a knockout round:
        - Shuffle contestants
        - Pair them 1 vs 1
        - Winner decided by score_func(row)
    Returns:
        winners_df, matches (list of dicts with match results)
    """
    # Shuffle to randomize matchups
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    winners = []
    matches = []

    # We assume an even number of rows (8, 4, 2, etc.)
    for i in range(0, len(shuffled), 2):
        p1 = shuffled.iloc[i]
        p2 = shuffled.iloc[i + 1]
        
        s1 = score_func(p1)
        s2 = score_func(p2)
        
        # Decide winner (tie goes to p1)
        winner = p1 if s1 >= s2 else p2
        
        matches.append({
            "round": round_name,
            "side": side_label,
            "character_1": p1["character"],
            "score_1": s1,
            "character_2": p2["character"],
            "score_2": s2,
            "winner": winner["character"],
        })
        
        winners.append(winner)

    winners_df = pd.DataFrame(winners).reset_index(drop=True)
    return winners_df, matches


# ---- Main tournament function ----

def run_tournament(df_characters_heroes, df_characters_vilains, base_seed=42):
    """
    Run the heroes vs villains tournament.
    
    Input:
        df_characters_heroes  - dataframe with 8 heroes
        df_characters_vilains - dataframe with 8 villains
        
    Returns:
        results: dict with
            - "matches": list of all match dicts
            - "hero_champion": row (Series) of hero champion
            - "villain_champion": row (Series) of villain champion
            - "grand_final": dict with final match info
    """
    all_matches = []
    
    # ---------- HEROES BRACKET ----------
    # Round 1: 8 -> 4 (by total ratings)
    heroes_r1_winners, matches = play_round(
        df_characters_heroes,
        score_func=total_ratings,
        round_name="Heroes R1 (total ratings)",
        random_state=base_seed,
        side_label="heroes"
    )
    all_matches.extend(matches)
    
    # Round 2: 4 -> 2 (by weighted global rating)
    heroes_r2_winners, matches = play_round(
        heroes_r1_winners,
        score_func=weighted_global_rating,
        round_name="Heroes R2 (weighted rating)",
        random_state=base_seed + 1,
        side_label="heroes"
    )
    all_matches.extend(matches)
    
    # Round 3: 2 -> 1 champion (by product score)
    heroes_champion_df, matches = play_round(
        heroes_r2_winners,
        score_func=product_score,
        round_name="Heroes Final (product score)",
        random_state=base_seed + 2,
        side_label="heroes"
    )
    all_matches.extend(matches)
    hero_champion = heroes_champion_df.iloc[0]
    
    # ---------- VILLAINS BRACKET ----------
    villains_r1_winners, matches = play_round(
        df_characters_vilains,
        score_func=total_ratings,
        round_name="Villains R1 (total ratings)",
        random_state=base_seed,
        side_label="villains"
    )
    all_matches.extend(matches)
    
    villains_r2_winners, matches = play_round(
        villains_r1_winners,
        score_func=weighted_global_rating,
        round_name="Villains R2 (weighted rating)",
        random_state=base_seed + 1,
        side_label="villains"
    )
    all_matches.extend(matches)
    
    villains_champion_df, matches = play_round(
        villains_r2_winners,
        score_func=product_score,
        round_name="Villains Final (product score)",
        random_state=base_seed + 2,
        side_label="villains"
    )
    all_matches.extend(matches)
    villain_champion = villains_champion_df.iloc[0]
    
    # ---------- GRAND FINAL ----------
    # Hero champion vs Villain champion using product_score again
    grand_final_contestants = pd.DataFrame([hero_champion, villain_champion]).reset_index(drop=True)
    gf_winners_df, gf_matches = play_round(
        grand_final_contestants,
        score_func=product_score,
        round_name="Grand Final (Hero vs Villain)",
        random_state=base_seed + 3,
        side_label="grand_final"
    )
    all_matches.extend(gf_matches)
    
    grand_final = gf_matches[0]  # only one match
    grand_champion = gf_winners_df.iloc[0]
    
    results = {
        "matches": all_matches,
        "hero_champion": hero_champion,
        "villain_champion": villain_champion,
        "grand_final": grand_final,
        "grand_champion": grand_champion,
    }
    
    return results


### 4.3 Figth!

In this chapter the competition is triggerd by the lauch of the of the sub "run_tournament"

In [1290]:
results = run_tournament(df_characters_heroes, df_characters_vilains, base_seed=42)

### 4.4 Competiton results

#### 4.4.1 All rounds

In [1291]:
# All matches as a DataFrame
matches_df = pd.DataFrame(results["matches"])
matches_df

Unnamed: 0,round,side,character_1,score_1,character_2,score_2,winner
0,Heroes R1 (total ratings),heroes,Superman,1654287.0,Ripley,2500612.0,Ripley
1,Heroes R1 (total ratings),heroes,Batman,4574022.0,Neo,3428012.0,Batman
2,Heroes R1 (total ratings),heroes,Sir James Bond,5651674.0,Frodo,6180665.0,Frodo
3,Heroes R1 (total ratings),heroes,Harry Potter,7067019.0,John McClane,2485335.0,Harry Potter
4,Heroes R2 (weighted rating),heroes,Frodo,4.451638,Batman,3.585453,Frodo
5,Heroes R2 (weighted rating),heroes,Harry Potter,3.864674,Ripley,3.955209,Ripley
6,Heroes Final (product score),heroes,Ripley,9890443.0,Frodo,27514090.0,Frodo
7,Villains R1 (total ratings),villains,Hannibal Lecter,2292355.0,Cruella DeVil,368792.0,Hannibal Lecter
8,Villains R1 (total ratings),villains,Darth Vader,4216065.0,Loki,4122068.0,Darth Vader
9,Villains R1 (total ratings),villains,The Joker,4029988.0,Chucky,364729.0,The Joker


#### 4.4.2 Results of the hero champion

In [1292]:
# Campeão dos heróis
results["hero_champion"]

character                    Frodo
movies_count                     3
character_avg_rating      4.083608
character_rating_count         571
character_imdb_rating     4.451672
character_imdb_count       6180094
Name: 0, dtype: object

#### 4.4.3 Results of the Vilain Champion

In [1293]:
# Campeão dos vilões
results["villain_champion"]

character                 Michael Corleone
movies_count                             3
character_avg_rating              4.165082
character_rating_count                 366
character_imdb_rating             4.476614
character_imdb_count               4080161
Name: 0, dtype: object

#### 4.4.4 Grand Finale

In [1294]:
# Finalíssima (detalhe do combate final)
results["grand_final"]

{'round': 'Grand Final (Hero vs Villain)',
 'side': 'grand_final',
 'character_1': 'Frodo',
 'score_1': np.float64(27514085.840000004),
 'character_2': 'Michael Corleone',
 'score_2': np.float64(18266830.22),
 'winner': 'Frodo'}

#### 4.4.5 Final result

In [1295]:
# Grande campeão absoluto
results["grand_champion"]

character                    Frodo
movies_count                     3
character_avg_rating      4.083608
character_rating_count         571
character_imdb_rating     4.451672
character_imdb_count       6180094
Name: 0, dtype: object

## 5 Close conection to duckdb

In [1296]:
con.close()
print("Ligação fechada.")

Ligação fechada.


## 6 GARBAGE TO DELETE

### 2.5 Mr. Incredible from the Incredibles

In [1297]:
#df= search_by_character("Mr. incredible")
#df

In [1298]:
#summary_df = summarize_character(df)
#df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)
#df_characters_heroes

### 2.6 Indiana Jones

In [1247]:
#df= search_by_character_and_actor("indiana Jones","Harrison Ford")
#df

In [None]:
#summary_df = summarize_character(df)
#df_characters_heroes = pd.concat([df_characters_heroes, summary_df], ignore_index=True)
#df_characters_heroes