In [2]:
import pandas as pd

# Read the CSV files
# df_stan = pd.read_csv('../results/results_stan.csv')
# df_nico = pd.read_csv('../results/results_nico.csv')

# # Rename stan columns before the merge: player_1_won20240324081115 to player_1_won
# df_stan.rename(columns={' player_1_won20240324081115': ' player_1_won'}, inplace=True)

# Merge the dataframes in a df DataFrame
# df = pd.concat([df_stan, df_nico])
df = pd.read_csv('../results.csv')
# Strip all whitespaces from the column names
df.columns = df.columns.str.strip()

print(f"df shape: {df.shape}")

# Print the DataFrame
df.head()

df shape: (311, 6)


Unnamed: 0,id,player_1_model,player_1_temperature,player_2_model,player_2_temperature,player_1_won
0,20240328172609,claude_3_haiku,0.0,cohere_command,0.0,True
1,20240328172713,ai21_mid,0.0,claude_3_haiku,0.0,True
2,20240328172748,ai21_ultra,0.0,ai21_ultra,0.0,True
3,20240328172841,titan_express,0.0,ai21_mid,0.0,False
4,20240328172927,claude_2_1,0.0,cohere_light,0.0,False


In [4]:
# Check the type of player_1_won
print(f"player_1_won type: {df['player_1_won'].dtype}")

for index, row in df.iterrows():
    print(type(row["player_1_won"]))
    break

# Check the value types
print(f"player_1_won value types: {df['player_1_won'].value_counts()}")

player_1_won type: bool
<class 'bool'>
player_1_won value types: False    167
True     144
Name: player_1_won, dtype: int64


In [5]:
# Get the number of wins for each model
player_nb_wins = {player_id: 0 for player_id in pd.concat([df['player_1_model'], df['player_2_model']]).unique()}

# Go over the rows of the DataFrame
for index, row in df.iterrows():
    if row["player_1_won"] == "True":
        player_nb_wins[row["player_1_model"]] += 1
    else:
        player_nb_wins[row["player_2_model"]] += 1

print("Number of wins for each model:")
print(player_nb_wins)

Number of wins for each model:
{'claude_3_haiku': 30, 'ai21_mid': 17, 'ai21_ultra': 28, 'titan_express': 18, 'claude_2_1': 11, 'claude_instant': 24, 'mistral_7b': 24, 'titan_lite': 25, 'mistral_8x7b': 25, 'cohere_light': 35, 'cohere_command': 19, 'claude_3_sonnet': 29, 'claude_2': 26}


## Get the relative scores

In [6]:
# Get the win rate of each model

# Get a list of each model name in player_1_model and player_2_model
model_names = pd.concat([df['player_1_model'], df['player_2_model']]).unique()

# Go over the rows of the DataFrame
model_wins = {model_name: 0 for model_name in model_names}

for index, row in df.iterrows():
    if row["player_1_won"] == "True":
        model_wins[row["player_1_model"]] += 1
    else:
        model_wins[row["player_2_model"]] += 1

print("Number of wins for each model:")
print(model_wins)

Number of wins for each model:
{'claude_3_haiku': 30, 'ai21_mid': 17, 'ai21_ultra': 28, 'titan_express': 18, 'claude_2_1': 11, 'claude_instant': 24, 'mistral_7b': 24, 'titan_lite': 25, 'mistral_8x7b': 25, 'cohere_light': 35, 'cohere_command': 19, 'claude_3_sonnet': 29, 'claude_2': 26}


In [7]:
# Get the number of fight for each model
model_fights = {model_name: 0 for model_name in model_names}

for index, row in df.iterrows():
    model_fights[row["player_1_model"]] += 1
    model_fights[row["player_2_model"]] += 1

print("Number of fights for each model:")
print(model_fights)

Number of fights for each model:
{'claude_3_haiku': 52, 'ai21_mid': 43, 'ai21_ultra': 51, 'titan_express': 43, 'claude_2_1': 38, 'claude_instant': 43, 'mistral_7b': 50, 'titan_lite': 51, 'mistral_8x7b': 51, 'cohere_light': 61, 'cohere_command': 38, 'claude_3_sonnet': 53, 'claude_2': 48}


In [8]:
# Get the win rate of each model
model_win_rate = {model_name: model_wins[model_name] / model_fights[model_name] for model_name in model_names}

# Make it a nice dataframe for visualization
df_win_rate = pd.DataFrame(model_win_rate.items(), columns=["model_name", "win_rate"])

# Order it by desceding win rate
df_win_rate = df_win_rate.sort_values("win_rate", ascending=False)

df_win_rate.head(n=len(model_names))

Unnamed: 0,model_name,win_rate
0,claude_3_haiku,0.576923
9,cohere_light,0.57377
5,claude_instant,0.55814
2,ai21_ultra,0.54902
11,claude_3_sonnet,0.54717
12,claude_2,0.541667
10,cohere_command,0.5
7,titan_lite,0.490196
8,mistral_8x7b,0.490196
6,mistral_7b,0.48


## Calculate the ELO rating for each model

In [9]:
# Get the model 
model_names = pd.concat([df['player_1_model'], df['player_2_model']]).unique()

# Initialize player ratings
player_ratings = {player_id: 1500 for player_id in model_names}

def elo_expected_score(rating_a, rating_b):
    return 1 / (1 + 10**((rating_b - rating_a) / 400))

def elo_update(winner_rating, loser_rating, k=32):
    expected_score_winner = elo_expected_score(winner_rating, loser_rating)
    expected_score_loser = 1 - expected_score_winner
    new_winner_rating = winner_rating + k * (1 - expected_score_winner)
    new_loser_rating = loser_rating + k * (0 - expected_score_loser)
    return new_winner_rating, new_loser_rating

# Iterate through matches to update ELO ratings
for index, row in df.iterrows():
    player1, player2, player1_won = row['player_1_model'], row['player_2_model'], row['player_1_won']

    # If it's a match against itself, skip
    if player1 == player2:
        print(f"Skipping match between {player1} and {player2}")
        continue
    
    if player1_won == " True":
        winner, loser = player1, player2
    else:
        winner, loser = player2, player1
    
    new_winner_rating, new_loser_rating = elo_update(player_ratings[winner], player_ratings[loser])
    
    player_ratings[winner] = new_winner_rating
    player_ratings[loser] = new_loser_rating

# Print updated ratings
print(player_ratings)

# Make it a DataFrame so we can have a nice display
ratings_df = pd.DataFrame(player_ratings.items(), columns=['Model', 'Rating'])

# Sort the DataFrame by rating
ratings_df = ratings_df.sort_values(by='Rating', ascending=False)

# Display the ratings
ratings_df.head(n=len(model_names))

Skipping match between ai21_ultra and ai21_ultra
Skipping match between claude_2_1 and claude_2_1
Skipping match between claude_2_1 and claude_2_1
Skipping match between ai21_ultra and ai21_ultra
Skipping match between claude_2 and claude_2
Skipping match between ai21_mid and ai21_mid
Skipping match between cohere_light and cohere_light
Skipping match between claude_3_sonnet and claude_3_sonnet
Skipping match between ai21_ultra and ai21_ultra
Skipping match between mistral_8x7b and mistral_8x7b
Skipping match between titan_lite and titan_lite
Skipping match between cohere_light and cohere_light
Skipping match between claude_3_sonnet and claude_3_sonnet
Skipping match between claude_3_sonnet and claude_3_sonnet
Skipping match between claude_2_1 and claude_2_1
Skipping match between titan_lite and titan_lite
Skipping match between cohere_command and cohere_command
Skipping match between mistral_7b and mistral_7b
Skipping match between claude_3_haiku and claude_3_haiku
Skipping match betw

Unnamed: 0,Model,Rating
0,claude_3_haiku,1599.134775
5,claude_instant,1562.479052
11,claude_3_sonnet,1557.251756
12,claude_2,1554.983541
9,cohere_light,1513.339247
10,cohere_command,1511.455126
3,titan_express,1502.56213
6,mistral_7b,1490.064134
2,ai21_ultra,1477.17917
8,mistral_8x7b,1463.812338


In [10]:
mdwn = ratings_df.to_markdown()

In [11]:
print(mdwn)

|    | Model           |   Rating |
|---:|:----------------|---------:|
|  0 | claude_3_haiku  |  1599.13 |
|  5 | claude_instant  |  1562.48 |
| 11 | claude_3_sonnet |  1557.25 |
| 12 | claude_2        |  1554.98 |
|  9 | cohere_light    |  1513.34 |
| 10 | cohere_command  |  1511.46 |
|  3 | titan_express   |  1502.56 |
|  6 | mistral_7b      |  1490.06 |
|  2 | ai21_ultra      |  1477.18 |
|  8 | mistral_8x7b    |  1463.81 |
|  7 | titan_lite      |  1443.51 |
|  1 | ai21_mid        |  1435.68 |
|  4 | claude_2_1      |  1388.54 |


In [14]:
# Get the model names
model_names = pd.concat([df['player_1_model'], df['player_2_model']]).unique()
row_size = 0

player_ratings = {}

# Initialize player ratings
player_ratings = {player_id: 1500 for player_id in model_names}

def elo_expected_score(rating_a, rating_b):
    return 1 / (1 + 10**((rating_b - rating_a) / 400))

def elo_update(winner_rating, loser_rating, k=32):
    expected_score_winner = elo_expected_score(winner_rating, loser_rating)
    expected_score_loser = 1 - expected_score_winner
    new_winner_rating = winner_rating + k * (1 - expected_score_winner)
    new_loser_rating = loser_rating + k * (0 - expected_score_loser)
    return new_winner_rating, new_loser_rating

# Iterate through matches to update ELO ratings
for index, row in df.iterrows():
    player1, player2, player1_won = row['player_1_model'], row['player_2_model'], row['player_1_won']

    print(f"-{player1_won}-")

    # If it's a match against itself, skip
    if player1 == player2:
        print(f"Skipping match between {player1} and {player2}")
        continue

    row_size += 1
    
    if player1_won == " True":
        print("player1_won")
        winner, loser = player1, player2
    else:
        winner, loser = player2, player1
    
    new_winner_rating, new_loser_rating = elo_update(player_ratings[winner], player_ratings[loser])
    
    player_ratings[winner] = new_winner_rating
    player_ratings[loser] = new_loser_rating

# Print updated ratings
print(player_ratings)

# Make it a DataFrame so we can have a nice display
ratings_df_stan = pd.DataFrame(player_ratings.items(), columns=['Model', 'Rating'])

# Sort the DataFrame by rating
ratings_df_stan = ratings_df_stan.sort_values(by='Rating', ascending=False)

# Display the ratings
ratings_df_stan.head(n=len(model_names))

-True-
-True-
-True-
Skipping match between ai21_ultra and ai21_ultra
-False-
-False-
-False-
Skipping match between claude_2_1 and claude_2_1
-True-
-True-
-False-
-False-
-True-
-True-
-True-
-False-
-True-
-False-
-True-
-False-
-True-
-False-
Skipping match between claude_2_1 and claude_2_1
-False-
-True-
-False-
Skipping match between ai21_ultra and ai21_ultra
-True-
-True-
-False-
-True-
-True-
-False-
-False-
-True-
-False-
-True-
-False-
Skipping match between claude_2 and claude_2
-False-
-False-
-False-
-True-
-False-
-True-
-True-
-True-
-True-
-True-
-False-
-True-
Skipping match between ai21_mid and ai21_mid
-True-
-False-
-True-
-True-
-False-
-False-
-True-
-False-
-True-
-False-
-False-
-False-
-True-
-False-
-True-
-False-
-False-
-True-
-True-
-True-
-False-
-False-
-False-
-False-
-False-
-False-
-False-
-True-
-False-
-True-
-True-
Skipping match between cohere_light and cohere_light
-True-
-False-
-False-
-True-
-True-
-False-
-True-
Skipping match between claude_3

Unnamed: 0,Model,Rating
0,claude_3_haiku,1599.134775
5,claude_instant,1562.479052
11,claude_3_sonnet,1557.251756
12,claude_2,1554.983541
9,cohere_light,1513.339247
10,cohere_command,1511.455126
3,titan_express,1502.56213
6,mistral_7b,1490.064134
2,ai21_ultra,1477.17917
8,mistral_8x7b,1463.812338
