# Imports and setup

In [1]:
# TODO: 
# 1. Construct API with pre-trained model (for ex: giving an activity.instance_id, return the winning team)
# 2. Pretty Github

In [7]:
import time
from pathlib import Path
import pandas as pd
import numpy as np
import os, sys
import seaborn as sns
import json

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

sys.path.extend([str(Path(os.getcwd()).parent.absolute())]) # include from top, pb with jupyter notebooks...
from db import MainDBHelper
from models.guardian import Guardian
from models.activity import Activity
from local_api import BungieAPI

In [3]:
ROOT_DATA_FOLDER = "../data"

In [4]:
db_helper = MainDBHelper(name="main.db", folder=ROOT_DATA_FOLDER)

In [5]:
def pad(l: list, n: int, value=0):
    return l + [value] * (n - len(l))

### Get activities

In [6]:
# Get activities
request = "SELECT * FROM main.activity WHERE (activity.mode=71 OR activity.mode=73) AND activity.period > date('2022-01-01')"
r = db_helper.execute(request, [])

In [7]:
activities = pd.DataFrame(data=[dict(row) for row in r])
activities

Unnamed: 0,instance_id,period,mode,is_private,win_score,loss_score,players
0,9868839001,2022-01-01T00:07:14Z,73,0,97,32,"[{""membership_id"": ""4611686018492356731"", ""mem..."
1,9868853647,2022-01-01T00:05:57Z,73,0,150,82,"[{""membership_id"": ""4611686018475284935"", ""mem..."
2,9868858383,2022-01-01T00:05:14Z,73,0,144,111,"[{""membership_id"": ""4611686018476438205"", ""mem..."
3,9868885277,2022-01-01T00:15:00Z,73,0,150,79,"[{""membership_id"": ""4611686018441517867"", ""mem..."
4,9868933563,2022-01-01T00:21:25Z,73,0,150,85,"[{""membership_id"": ""4611686018511307572"", ""mem..."
...,...,...,...,...,...,...,...
88329,11980093257,2022-11-29T12:45:19Z,73,0,150,104,"[{""membership_id"": ""4611686018467326754"", ""mem..."
88330,11980110261,2022-11-29T12:57:04Z,73,0,153,75,"[{""membership_id"": ""4611686018513125077"", ""mem..."
88331,11980119969,2022-11-29T13:06:36Z,73,0,97,36,"[{""membership_id"": ""4611686018475052461"", ""mem..."
88332,11980134517,2022-11-29T13:10:23Z,73,0,150,118,"[{""membership_id"": ""4611686018487627730"", ""mem..."


In [8]:
# Select activities from date interval (better do it in sql)
activities.period = pd.to_datetime(activities.period, format=BungieAPI.API_DATE_FORMAT)
activities = activities[activities.period > "2022"]
activities.reset_index(drop=True, inplace=True)
activities

Unnamed: 0,instance_id,period,mode,is_private,win_score,loss_score,players
0,9868826934,2022-01-01 00:01:15,84,0,5,3,"[{""membership_id"": ""4611686018429273020"", ""mem..."
1,9868839001,2022-01-01 00:07:14,73,0,97,32,"[{""membership_id"": ""4611686018492356731"", ""mem..."
2,9868851944,2022-01-01 00:08:20,84,0,5,2,"[{""membership_id"": ""4611686018485879325"", ""mem..."
3,9868853647,2022-01-01 00:05:57,73,0,150,82,"[{""membership_id"": ""4611686018475284935"", ""mem..."
4,9868858383,2022-01-01 00:05:14,73,0,144,111,"[{""membership_id"": ""4611686018476438205"", ""mem..."
...,...,...,...,...,...,...,...
118887,11968726555,2022-11-25 23:30:43,84,0,5,0,"[{""membership_id"": ""4611686018452635660"", ""mem..."
118888,11970327060,2022-11-26 12:22:32,37,0,4,0,"[{""membership_id"": ""4611686018447676806"", ""mem..."
118889,11970352533,2022-11-26 12:34:08,37,0,4,1,"[{""membership_id"": ""4611686018467772266"", ""mem..."
118890,11970377921,2022-11-26 12:49:45,37,0,4,1,"[{""membership_id"": ""4611686018438166973"", ""mem..."


### Setup train df - combat_rating, kills_pga, assists_pga, deaths_pga, score_pga, win_ratio and winning team

In [20]:
# To use different features, simply modify the STATS_NAME array and the next function get_guardian_stats accordingly. The rest of the cells should adapt nicely.
# STATS_NAME = ["combat_rating", "kills_pga", "assists_pga", "deaths_pga", "score_pga", "win_ratio"]  # best version 85% acc
# STATS_NAME = ["kills_pga", "assists_pga", "deaths_pga", "score_pga", "win_ratio"] # 82 % acc
# STATS_NAME = ["activities_entered", "activities_won", "assists", "kills", "seconds_played", "deaths", "average_lifespan", "score", "opponents_defeated", "precision_kills"] # 80% acc
STATS_NAME = ["combat_rating", "kills_pga", "assists_pga", "deaths_pga", "score_pga", "win_ratio", "kd", "kda"] # 85% acc
N_STATS = len(STATS_NAME)
PLAYERS_COLUMNS = np.array([[f"player_{i}_{stat}" for stat in STATS_NAME] for i in range(1, 13)]).reshape(12 * N_STATS)

In [21]:
def get_guardian_stats(membership_id, membership_type, character_id):
    guardian = db_helper.get_guardian_from_ids(
        Guardian(membership_id=membership_id, membership_type=membership_type, character_id=character_id))

    if guardian is None:
        return pad([], N_STATS, -1)

    n = guardian.activities_entered
    if n < 1:
        return pad([], N_STATS, -1)
    
    try:
        # default
        # stats = [guardian.combat_rating,
        #          guardian.kills / n,
        #          guardian.assists / n,
        #          guardian.deaths / n,
        #          guardian.score / n,
        #          guardian.activities_won / n]
    
        # without combat_rating
        # stats = [guardian.kills / n,
        #          guardian.assists / n,
        #          guardian.deaths / n,
        #          guardian.score / n,
        #          guardian.activities_won / n]
        
        # raw
        # stats = [guardian.activities_entered,
        #          guardian.activities_won,
        #          guardian.assists,
        #          guardian.kills,
        #          guardian.seconds_played,
        #          guardian.deaths,
        #          guardian.average_lifespan,
        #          guardian.score,
        #          guardian.opponents_defeated,
        #          guardian.precision_kills]
        
        # with kd and kda
        stats = [guardian.combat_rating,
                 guardian.kills / n,
                 guardian.assists / n,
                 guardian.deaths / n,
                 guardian.score / n,
                 guardian.activities_won / n,
                 guardian.kills / guardian.deaths,
                 (guardian.kills + guardian.assists) / guardian.deaths]
        
    except ZeroDivisionError as err:
        return pad([], N_STATS, -1)

    return np.array(stats)

In [22]:
def extract_players_stats(players_json):
    try:
        (players_json) = (players_json.replace("'", '"')
                          .replace("False", "false")
                          .replace("True", "true"))
        players = json.loads(players_json)
    except json.JSONDecodeError as err:
        return np.array([pad([], N_STATS, 0) for i in range(12)])

    winners = []
    losers = []
    for player in players:
        player_stats = get_guardian_stats(player["membership_id"], player["membership_type"], player["character_id"])
        if player["is_winner"]:
            winners.append(player_stats)
        else:
            losers.append(player_stats)

    return np.array(
        pad(winners, 6, pad([], N_STATS, 0))[:6] + pad(losers, 6, pad([], N_STATS, 0))[:6])  # return winners first

In [23]:
t = activities.players.apply(extract_players_stats)
stack = np.stack(t, axis=0)
stack = np.reshape(stack, (len(stack), 12 * N_STATS))

In [24]:
train = pd.DataFrame()
train[PLAYERS_COLUMNS] = pd.DataFrame(stack, columns=PLAYERS_COLUMNS)
train["winner"] = np.zeros(len(train))
train

Unnamed: 0,player_1_combat_rating,player_1_kills_pga,player_1_assists_pga,player_1_deaths_pga,player_1_score_pga,player_1_win_ratio,player_1_kd,player_1_kda,player_2_combat_rating,player_2_kills_pga,...,player_11_kda,player_12_combat_rating,player_12_kills_pga,player_12_assists_pga,player_12_deaths_pga,player_12_score_pga,player_12_win_ratio,player_12_kd,player_12_kda,winner
0,171.866448,15.488152,5.208531,10.516588,30.137441,0.563981,1.472735,1.968004,201.613244,18.811948,...,1.290753,109.085451,10.026667,3.109630,12.373827,16.361975,0.438025,0.810312,1.061620,0.0
1,144.329975,12.304066,4.525944,8.136786,22.689351,0.518006,1.512153,2.068385,104.250846,9.541262,...,0.857883,59.681777,5.270936,2.586207,10.221675,7.586207,0.399015,0.515663,0.768675,0.0
2,130.682790,12.390798,3.461270,7.910600,16.080082,0.502038,1.566354,2.003902,132.519180,12.817466,...,1.309036,84.624560,5.511628,3.348837,11.604651,13.651163,0.511628,0.474950,0.763527,0.0
3,172.773079,14.569876,3.473085,8.370600,20.961698,0.573499,1.740601,2.155516,144.702941,12.669959,...,1.361126,121.290846,11.846154,3.131222,10.610860,21.524887,0.443439,1.116418,1.411514,0.0
4,160.782433,16.085000,4.530000,8.740000,25.575000,0.490000,1.840389,2.358696,168.364910,13.360000,...,1.470331,139.976587,11.367798,5.058406,8.987372,24.823204,0.440410,1.264863,1.827698,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88329,151.835510,13.185556,3.380030,7.398956,18.242985,0.639983,1.782083,2.238909,168.818530,13.645445,...,1.506958,115.575588,9.879747,3.510549,7.845992,20.166667,0.443038,1.259209,1.706642,0.0
88330,168.818530,13.645445,3.984875,9.429828,34.108688,0.543088,1.447051,1.869633,151.835510,13.185556,...,1.185786,98.068328,8.000000,2.665044,8.201558,9.972736,0.543330,0.975424,1.300368,0.0
88331,125.366135,10.766718,4.420878,10.269270,17.177131,0.483665,1.048440,1.478936,67.565632,5.151515,...,1.844141,83.093245,7.388227,2.852832,9.560730,11.857675,0.292101,0.772768,1.071159,0.0
88332,114.896575,11.761329,2.908071,8.903323,16.892102,0.460509,1.321004,1.647632,170.366888,15.070395,...,0.855049,102.747137,8.598291,3.422748,8.274162,8.852728,0.491124,1.039174,1.452841,0.0


In [25]:
# Select only valid rows (better results without)
req = [f"(train['{col}'] > 0)" for col in PLAYERS_COLUMNS]
req = " & ".join(req)
train = train[eval(req)]  # hey, it's not stupid if it works
train.reset_index(inplace=True, drop=True)
train

Unnamed: 0,player_1_combat_rating,player_1_kills_pga,player_1_assists_pga,player_1_deaths_pga,player_1_score_pga,player_1_win_ratio,player_1_kd,player_1_kda,player_2_combat_rating,player_2_kills_pga,...,player_11_kda,player_12_combat_rating,player_12_kills_pga,player_12_assists_pga,player_12_deaths_pga,player_12_score_pga,player_12_win_ratio,player_12_kd,player_12_kda,winner
0,171.866448,15.488152,5.208531,10.516588,30.137441,0.563981,1.472735,1.968004,201.613244,18.811948,...,1.290753,109.085451,10.026667,3.109630,12.373827,16.361975,0.438025,0.810312,1.061620,0.0
1,144.329975,12.304066,4.525944,8.136786,22.689351,0.518006,1.512153,2.068385,104.250846,9.541262,...,0.857883,59.681777,5.270936,2.586207,10.221675,7.586207,0.399015,0.515663,0.768675,0.0
2,130.682790,12.390798,3.461270,7.910600,16.080082,0.502038,1.566354,2.003902,132.519180,12.817466,...,1.309036,84.624560,5.511628,3.348837,11.604651,13.651163,0.511628,0.474950,0.763527,0.0
3,172.773079,14.569876,3.473085,8.370600,20.961698,0.573499,1.740601,2.155516,144.702941,12.669959,...,1.361126,121.290846,11.846154,3.131222,10.610860,21.524887,0.443439,1.116418,1.411514,0.0
4,160.782433,16.085000,4.530000,8.740000,25.575000,0.490000,1.840389,2.358696,168.364910,13.360000,...,1.470331,139.976587,11.367798,5.058406,8.987372,24.823204,0.440410,1.264863,1.827698,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74470,151.835510,13.185556,3.380030,7.398956,18.242985,0.639983,1.782083,2.238909,168.818530,13.645445,...,1.506958,115.575588,9.879747,3.510549,7.845992,20.166667,0.443038,1.259209,1.706642,0.0
74471,168.818530,13.645445,3.984875,9.429828,34.108688,0.543088,1.447051,1.869633,151.835510,13.185556,...,1.185786,98.068328,8.000000,2.665044,8.201558,9.972736,0.543330,0.975424,1.300368,0.0
74472,125.366135,10.766718,4.420878,10.269270,17.177131,0.483665,1.048440,1.478936,67.565632,5.151515,...,1.844141,83.093245,7.388227,2.852832,9.560730,11.857675,0.292101,0.772768,1.071159,0.0
74473,114.896575,11.761329,2.908071,8.903323,16.892102,0.460509,1.321004,1.647632,170.366888,15.070395,...,0.855049,102.747137,8.598291,3.422748,8.274162,8.852728,0.491124,1.039174,1.452841,0.0


In [26]:
# Shuffle teams so that half of the activities were won
def get_cols_name(player_range_start: int, player_range_end: int):
    return np.array(
        [[f"player_{i}_{stat}" for stat in STATS_NAME] for i in range(player_range_start, player_range_end)]).reshape(
        (player_range_end - player_range_start) * N_STATS)

# Not optimal - lot of memory used, prefer renaming columns (current draft not working, must select subset)
# tmp_cols = [f"tmp_{i}" for i in range(6*N_STATS)]
# train[tmp_cols] = train[get_cols_name(1, 7)]
# train[get_cols_name(1, 7)] = train[get_cols_name(7, 13)]
# train[get_cols_name(7, 13)] = train[tmp_cols]
# train.drop(tmp_cols, axis=1, inplace=True)

# Not better - renaming columns but the rvalue consume memory as well
mid = len(train) // 2

player_cols_team_A = get_cols_name(1, 7)
player_cols_team_B = get_cols_name(7, 13)
tmp_cols = [f"tmp_{i}" for i in range(6*N_STATS)]

# False positive warning (see https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas)
train.loc[:mid] = (train.loc[:mid].rename(columns={player_cols_team_A[i]: tmp_cols[i] for i in range(len(player_cols_team_A))})
                                  .rename(columns={player_cols_team_B[i]: player_cols_team_A[i] for i in range(len(player_cols_team_A))})
                                  .rename(columns={tmp_cols[i]: player_cols_team_B[i] for i in range(len(player_cols_team_A))}))\
                                  [list(PLAYERS_COLUMNS) + ["winner"]]  # reorder columns
train.loc[:mid, "winner"] = [1 for i in range(mid + 1)]
train

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.loc[:mid] = (train.loc[:mid].rename(columns={player_cols_team_A[i]: tmp_cols[i] for i in range(len(player_cols_team_A))})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.loc[:mid, "winner"] = [1 for i in range(mid + 1)]


Unnamed: 0,player_1_combat_rating,player_1_kills_pga,player_1_assists_pga,player_1_deaths_pga,player_1_score_pga,player_1_win_ratio,player_1_kd,player_1_kda,player_2_combat_rating,player_2_kills_pga,...,player_11_kda,player_12_combat_rating,player_12_kills_pga,player_12_assists_pga,player_12_deaths_pga,player_12_score_pga,player_12_win_ratio,player_12_kd,player_12_kda,winner
0,154.918285,14.597444,4.210863,10.217252,29.795527,0.501597,1.428705,1.840838,169.595771,15.483871,...,1.537792,102.101709,7.812903,3.825806,10.064516,17.683871,0.425806,0.776282,1.156410,1.0
1,121.735362,11.464286,2.769793,10.471601,22.005594,0.466867,1.094798,1.359303,148.546626,11.564039,...,1.348943,107.831604,10.505824,3.575707,10.895175,18.885191,0.434276,0.964264,1.292456,1.0
2,182.747287,16.055660,6.420263,10.163852,34.513446,0.484053,1.579683,2.211359,122.641862,9.546075,...,1.081999,85.596959,4.898438,3.460938,9.592187,15.303125,0.468750,0.510669,0.871477,1.0
3,103.733618,7.773810,5.011905,11.011905,16.559524,0.404762,0.705946,1.161081,159.931058,13.592593,...,2.744592,86.356314,4.855319,3.042553,11.625532,15.293617,0.455319,0.417643,0.679356,1.0
4,175.309352,14.604278,4.836007,10.524064,30.843137,0.508021,1.387703,1.847222,152.829824,11.286429,...,1.325027,88.312201,5.327103,3.495327,11.009346,16.476636,0.411215,0.483871,0.801358,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74470,151.835510,13.185556,3.380030,7.398956,18.242985,0.639983,1.782083,2.238909,168.818530,13.645445,...,1.506958,115.575588,9.879747,3.510549,7.845992,20.166667,0.443038,1.259209,1.706642,0.0
74471,168.818530,13.645445,3.984875,9.429828,34.108688,0.543088,1.447051,1.869633,151.835510,13.185556,...,1.185786,98.068328,8.000000,2.665044,8.201558,9.972736,0.543330,0.975424,1.300368,0.0
74472,125.366135,10.766718,4.420878,10.269270,17.177131,0.483665,1.048440,1.478936,67.565632,5.151515,...,1.844141,83.093245,7.388227,2.852832,9.560730,11.857675,0.292101,0.772768,1.071159,0.0
74473,114.896575,11.761329,2.908071,8.903323,16.892102,0.460509,1.321004,1.647632,170.366888,15.070395,...,0.855049,102.747137,8.598291,3.422748,8.274162,8.852728,0.491124,1.039174,1.452841,0.0


### SKlearn basic models with train v2

In [27]:
X = train[PLAYERS_COLUMNS]
Y = train["winner"]
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)
X

Unnamed: 0,player_1_combat_rating,player_1_kills_pga,player_1_assists_pga,player_1_deaths_pga,player_1_score_pga,player_1_win_ratio,player_1_kd,player_1_kda,player_2_combat_rating,player_2_kills_pga,...,player_11_kd,player_11_kda,player_12_combat_rating,player_12_kills_pga,player_12_assists_pga,player_12_deaths_pga,player_12_score_pga,player_12_win_ratio,player_12_kd,player_12_kda
0,154.918285,14.597444,4.210863,10.217252,29.795527,0.501597,1.428705,1.840838,169.595771,15.483871,...,1.215391,1.537792,102.101709,7.812903,3.825806,10.064516,17.683871,0.425806,0.776282,1.156410
1,121.735362,11.464286,2.769793,10.471601,22.005594,0.466867,1.094798,1.359303,148.546626,11.564039,...,0.971903,1.348943,107.831604,10.505824,3.575707,10.895175,18.885191,0.434276,0.964264,1.292456
2,182.747287,16.055660,6.420263,10.163852,34.513446,0.484053,1.579683,2.211359,122.641862,9.546075,...,0.625475,1.081999,85.596959,4.898438,3.460938,9.592187,15.303125,0.468750,0.510669,0.871477
3,103.733618,7.773810,5.011905,11.011905,16.559524,0.404762,0.705946,1.161081,159.931058,13.592593,...,1.987924,2.744592,86.356314,4.855319,3.042553,11.625532,15.293617,0.455319,0.417643,0.679356
4,175.309352,14.604278,4.836007,10.524064,30.843137,0.508021,1.387703,1.847222,152.829824,11.286429,...,0.959855,1.325027,88.312201,5.327103,3.495327,11.009346,16.476636,0.411215,0.483871,0.801358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74470,151.835510,13.185556,3.380030,7.398956,18.242985,0.639983,1.782083,2.238909,168.818530,13.645445,...,1.191425,1.506958,115.575588,9.879747,3.510549,7.845992,20.166667,0.443038,1.259209,1.706642
74471,168.818530,13.645445,3.984875,9.429828,34.108688,0.543088,1.447051,1.869633,151.835510,13.185556,...,0.811721,1.185786,98.068328,8.000000,2.665044,8.201558,9.972736,0.543330,0.975424,1.300368
74472,125.366135,10.766718,4.420878,10.269270,17.177131,0.483665,1.048440,1.478936,67.565632,5.151515,...,1.429352,1.844141,83.093245,7.388227,2.852832,9.560730,11.857675,0.292101,0.772768,1.071159
74473,114.896575,11.761329,2.908071,8.903323,16.892102,0.460509,1.321004,1.647632,170.366888,15.070395,...,0.647457,0.855049,102.747137,8.598291,3.422748,8.274162,8.852728,0.491124,1.039174,1.452841


In [19]:
# Mean of every stat per team -> Decrease performance (79% acc)
TEAM_COLUMNS = [f"team_A_{stat}" for stat in STATS_NAME] + [f"team_B_{stat}" for stat in STATS_NAME]
for i in range(len(STATS_NAME)):
    col = TEAM_COLUMNS[i]
    stat = STATS_NAME[i]
    sel = [f"player_{k}_{stat}" for k in range(1, 7)]
    X[col] = X.loc[:, sel].mean(axis=1)
for i in range(len(STATS_NAME)):
    col = TEAM_COLUMNS[i+6]
    stat = STATS_NAME[i]
    sel = [f"player_{k}_{stat}" for k in range(7, 13)]
    X[col] = X.loc[:, sel].mean(axis=1)
X.drop(columns=PLAYERS_COLUMNS, inplace=True)
X

Unnamed: 0,team_A_combat_rating,team_A_kills_pga,team_A_assists_pga,team_A_deaths_pga,team_A_score_pga,team_A_win_ratio,team_A_kd,team_A_kda,team_B_combat_rating,team_B_kills_pga,team_B_assists_pga,team_B_deaths_pga,team_B_score_pga,team_B_win_ratio
0,113.060126,8.068427,3.438515,9.351107,15.760801,0.434917,104.333605,7.632033,3.185438,9.173066,15.045864,0.456978,0.851435,1.211079
1,145.938238,11.010223,3.825794,8.099489,17.927997,0.531433,151.851179,11.400594,3.769069,8.781801,20.011913,0.538099,1.338476,1.779117
2,120.329533,9.812137,3.539109,10.194328,17.140939,0.390322,123.341654,9.614351,3.711304,9.620290,18.674679,0.447699,1.003197,1.390126
3,128.721577,9.889536,3.748398,9.801619,18.249298,0.481573,128.830919,10.258719,4.036704,9.195230,19.272906,0.523155,1.119120,1.558877
4,121.834889,9.508596,3.524176,10.904994,18.163205,0.473125,131.409405,10.581667,3.953761,9.004254,17.860801,0.520287,1.179598,1.617527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112758,124.579855,10.542732,4.017803,10.803507,22.917658,0.432988,118.056240,9.332051,3.947713,11.043002,20.703011,0.438001,0.843833,1.202667
112759,122.805618,9.363540,3.904330,11.317119,24.280477,0.626596,110.968929,9.029639,3.735980,10.700439,19.245703,0.441516,0.844396,1.197644
112760,114.248576,9.585313,3.827678,9.545315,18.958146,0.409039,133.597176,11.543925,4.037514,10.504738,22.039461,0.461644,1.147245,1.546002
112761,135.329443,10.794326,4.130578,9.382815,22.181775,0.499960,139.235930,11.316749,4.031736,9.479446,21.308441,0.485789,1.237723,1.681636


In [22]:
pipeline = make_pipeline(StandardScaler(), SVC())
acc_scores = cross_val_score(pipeline, X, Y, cv=4, scoring="accuracy", n_jobs=-1)
print(f"Mean acc = {acc_scores.mean()}, Std acc = {acc_scores.std()}")
pipeline

Mean acc = 0.7821262006777389, Std acc = 0.06417286907833572


Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())])

In [23]:
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())
acc_scores = cross_val_score(pipeline, X, Y, cv=4, scoring="accuracy", n_jobs=-1)
print(f"Mean acc = {acc_scores.mean()}, Std acc = {acc_scores.std()}")
print(pipeline)

Mean acc = 0.7168123494715248, Std acc = 0.05309508461203796
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier', RandomForestClassifier())])


In [124]:
forest = RandomForestClassifier()
forest.fit(X, Y)
importances = {
    "combat_rating": forest.feature_importances_[[i for i in range(0, 12*N_STATS, N_STATS)]].sum(),
    "kills_pga": forest.feature_importances_[[i for i in range(1, 12*N_STATS, N_STATS)]].sum(),
    "assists_pga": forest.feature_importances_[[i for i in range(2, 12*N_STATS, N_STATS)]].sum(),
    "deaths_pga": forest.feature_importances_[[i for i in range(3, 12*N_STATS, N_STATS)]].sum(),
    "score_pga": forest.feature_importances_[[i for i in range(4, 12*N_STATS, N_STATS)]].sum(),
    "win_ratio": forest.feature_importances_[[i for i in range(5, 12*N_STATS, N_STATS)]].sum()
}
pd.DataFrame([v for v in importances.values()], index=[k for k in importances], columns=["value"]).sort_values(by=["value"], ascending=False)



Unnamed: 0,value
deaths_pga,0.093478
assists_pga,0.084786
kills_pga,0.082987
score_pga,0.08234
win_ratio,0.082321
combat_rating,0.081257


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)

clf = CatBoostClassifier(verbose=False)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(clf)

probs = clf.predict_proba(X_test).max(axis=1)
confusion_matrix_mean_prob = np.array([[probs[np.where((y_test == y_pred) & (y_test == 0))].mean(), probs[np.where((y_pred == 1) & (y_test == 0))].mean()],
                                       [probs[np.where((y_pred == 0) & (y_test == 1))].mean(), probs[np.where((y_test == y_pred) & (y_test == 1))].mean()]])
print("Mean proba for each section of the confusion matrix:")
print(confusion_matrix_mean_prob)

Accuracy: 0.7892404780020588
F1-score: 0.7922530551021308
[[8655 2516]
 [2193 8979]]
<catboost.core.CatBoostClassifier object at 0x000002670D3B62B0>
Mean proba for each section of the confusion matrix:
[[0.80647651 0.66993496]
 [0.66332882 0.79536047]]


In [29]:
clf.save_model("model_quickplay.json", format="json")

# Test on competitive (3v3)
Result: need 2 models, one for quickplay (6v6) and one for competitive (at least for osiris) (3v3)

In [31]:
# Get activities
request = "SELECT * FROM main.activity WHERE activity.mode=84"
r = db_helper.execute(request, [])
activities = pd.DataFrame(data=[dict(row) for row in r])
activities

Unnamed: 0,instance_id,period,mode,is_private,win_score,loss_score,players
0,9868826934,2022-01-01T00:01:15Z,84,0,5,3,"[{""membership_id"": ""4611686018429273020"", ""mem..."
1,9868851944,2022-01-01T00:08:20Z,84,0,5,2,"[{""membership_id"": ""4611686018485879325"", ""mem..."
2,9868863696,2022-01-01T00:11:17Z,84,0,5,1,"[{""membership_id"": ""4611686018430058840"", ""mem..."
3,9868888247,2022-01-01T00:16:11Z,84,0,5,2,"[{""membership_id"": ""4611686018431697058"", ""mem..."
4,9868894831,2022-01-01T00:18:44Z,84,0,5,0,"[{""membership_id"": ""4611686018428440611"", ""mem..."
...,...,...,...,...,...,...,...
25032,11968655176,2022-11-25T23:01:08Z,84,0,5,2,"[{""membership_id"": ""4611686018429021853"", ""mem..."
25033,11968667869,2022-11-25T23:08:27Z,84,0,5,0,"[{""membership_id"": ""4611686018428638688"", ""mem..."
25034,11968684496,2022-11-25T23:13:47Z,84,0,5,2,"[{""membership_id"": ""4611686018428423357"", ""mem..."
25035,11968711243,2022-11-25T23:24:12Z,84,0,5,0,"[{""membership_id"": ""4611686018456907962"", ""mem..."


In [32]:
t = activities.players.apply(extract_players_stats)
stack2 = np.stack(t, axis=0)
stack2 = np.reshape(stack2, (len(stack2), 12 * N_STATS))

In [51]:
train_comp = pd.DataFrame()
train_comp[PLAYERS_COLUMNS] = pd.DataFrame(stack2, columns=PLAYERS_COLUMNS)
train_comp["winner"] = np.zeros(len(train_comp))
train_comp

Unnamed: 0,player_1_combat_rating,player_1_kills_pga,player_1_assists_pga,player_1_deaths_pga,player_1_score_pga,player_1_win_ratio,player_1_kd,player_1_kda,player_2_combat_rating,player_2_kills_pga,...,player_11_kda,player_12_combat_rating,player_12_kills_pga,player_12_assists_pga,player_12_deaths_pga,player_12_score_pga,player_12_win_ratio,player_12_kd,player_12_kda,winner
0,130.110954,11.506887,3.008264,9.904500,10.358127,0.474747,1.161784,1.465511,146.102935,11.198822,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,193.190578,18.354146,5.396098,10.070244,32.717073,0.520976,1.822612,2.358458,140.919078,11.837428,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,111.135054,6.916300,3.687225,9.810573,18.356828,0.458150,0.704984,1.080826,127.620833,10.906051,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,126.847280,13.431384,3.364454,8.537120,13.847582,0.510124,1.573292,1.967389,177.763064,19.648368,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,159.039104,14.026588,3.694239,11.502216,28.506647,0.500739,1.219468,1.540645,127.620833,10.906051,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25032,150.178377,12.552256,4.720973,8.864941,22.594400,0.575166,1.415943,1.948488,0.000000,6.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25033,149.463505,11.095960,5.419192,13.058081,24.146465,0.424242,0.849739,1.264746,137.492195,10.796270,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25034,198.235609,17.878617,4.018414,8.514656,34.931229,0.588876,2.099746,2.571687,158.247174,14.123250,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25035,156.639293,11.218408,2.836142,6.241997,15.604044,0.655855,1.797247,2.251611,150.178377,12.552256,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Shuffle winning team
mid = len(train_comp) // 2

player_cols_team_A = get_cols_name(1, 7)
player_cols_team_B = get_cols_name(7, 13)
tmp_cols = [f"tmp_{i}" for i in range(6*N_STATS)]

# False positive warning (see https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas)
train_comp.loc[:mid] = (train_comp.loc[:mid].rename(columns={player_cols_team_A[i]: tmp_cols[i] for i in range(len(player_cols_team_A))})
                                            .rename(columns={player_cols_team_B[i]: player_cols_team_A[i] for i in range(len(player_cols_team_A))})
                                            .rename(columns={tmp_cols[i]: player_cols_team_B[i] for i in range(len(player_cols_team_A))}))\
                                            [list(PLAYERS_COLUMNS) + ["winner"]]  # reorder columns
train_comp.loc[:mid, "winner"] = [1 for i in range(mid + 1)]
train_comp = train_comp.sample(frac=1.0).reset_index(drop=True)
train_comp

Unnamed: 0,player_1_combat_rating,player_1_kills_pga,player_1_assists_pga,player_1_deaths_pga,player_1_score_pga,player_1_win_ratio,player_1_kd,player_1_kda,player_2_combat_rating,player_2_kills_pga,...,player_11_kda,player_12_combat_rating,player_12_kills_pga,player_12_assists_pga,player_12_deaths_pga,player_12_score_pga,player_12_win_ratio,player_12_kd,player_12_kda,winner
0,95.910761,9.209150,2.389978,10.235294,13.435730,0.444444,0.899745,1.133248,113.134274,9.086331,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,106.892533,8.507407,3.366667,8.718519,7.781481,0.603704,0.975786,1.361937,114.584205,10.938983,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,87.947342,7.654267,2.757112,8.048140,17.010941,0.498906,0.951060,1.293638,141.391354,12.434084,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,135.327363,11.603383,4.292293,8.535714,23.344925,0.411654,1.359392,1.862255,111.799705,12.456746,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,158.041529,13.086037,4.587512,9.130285,25.128810,0.568338,1.433256,1.935706,183.626286,10.580054,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25032,110.368156,9.432633,3.193655,9.671611,12.336218,0.500206,0.975291,1.305500,99.064171,9.226101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25033,64.684944,6.666667,3.331267,10.447658,10.758264,0.457300,0.638102,0.956955,132.285745,12.422655,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25034,131.981020,9.013680,2.911739,7.484996,13.619594,0.648720,1.204233,1.593243,134.112921,10.735471,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25035,185.031909,16.884868,5.029605,9.552632,28.309211,0.648026,1.767562,2.294077,150.178377,12.552256,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
X, Y = train_comp[PLAYERS_COLUMNS], train_comp["winner"]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)

clf_comp = CatBoostClassifier(verbose=False)
clf_comp.fit(X_train, y_train)

y_pred = clf_comp.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(clf)

probs = clf.predict_proba(X_test).max(axis=1)
confusion_matrix_mean_prob = np.array([[probs[np.where((y_test == y_pred) & (y_test == 0))].mean(), probs[np.where((y_pred == 1) & (y_test == 0))].mean()],
                                       [probs[np.where((y_pred == 0) & (y_test == 1))].mean(), probs[np.where((y_test == y_pred) & (y_test == 1))].mean()]])
print("Mean proba for each section of the confusion matrix:")
print(confusion_matrix_mean_prob)

Accuracy: 0.8463791267305645
F1-score: 0.8463791267305645
[[3179  577]
 [ 577 3179]]
<catboost.core.CatBoostClassifier object at 0x000001DDA1912250>
Mean proba for each section of the confusion matrix:
[[0.81300895 0.92122624]
 [0.86917212 0.94464431]]


### Compare time sql and pandas

In [42]:
membership_id = 4611686018489429030
membership_type = 3
character_id = 2305843009503194358

# Retrieve guardian with sql
start_time = time.time()
for i in range(1000):
    g = db_helper.get_guardian_from_ids(
        Guardian(membership_id=membership_id, membership_type=membership_type, character_id=character_id))
print(f"SQL query took {time.time() - start_time}s to complete.")

# Retrieve guardian with pandas
start_time = time.time()
for i in range(1000):
    g = guardians.loc[(guardians.membership_id == membership_id) & (guardians.membership_type == membership_type) & (
                guardians.character_id == character_id)]
print(f"Pandas query took {time.time() - start_time}s to complete.")

SQL query took 0.030026674270629883s to complete.
Pandas query took 0.9188318252563477s to complete.
