In [58]:
# Install pyswarms, the particle-swarm optimisation package imported below as `ps`.
!pip install pyswarms

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


This code segment imports essential Python libraries for data manipulation, numerical computation, optimization, and interaction with Amazon S3. It initializes an S3 file system connection for working with S3 files. These tools are essential for a range of data analysis, machine learning, and optimization tasks involving data stored in an S3 bucket.

In [37]:
import pandas as pd
import numpy as np
from scipy.optimize import basinhopping, shgo
import pickle
from sklearn.model_selection import train_test_split
import pyswarms as ps
from s3fs.core import S3FileSystem

# Shared S3 filesystem handle; later cells use it to open the pickled models
# and to write the result files.
s3_file = S3FileSystem()

The code defines a class named EloRatingSystem that implements the Elo rating system for evaluating and adjusting player ratings in a competitive game. It allows adding players with initial ratings, calculating the expected score of a matchup, updating ratings after a result (with each side's update scaled by a performance factor), and retrieving player ratings. This simplifies the management of ratings in a competitive environment.

In [3]:
class EloRatingSystem:
    """Elo rating table whose update step is scaled by per-side performance factors.

    Ratings are kept in ``self.ratings`` (name -> rating). Names that were never
    added are treated as having ``DEFAULT_RATING``.
    """

    # Rating assumed for any participant that has not been rated yet.
    DEFAULT_RATING = 1200

    def __init__(self, k_factor=32):
        """Create an empty table; ``k_factor`` is the base step size of every update."""
        self.k_factor = k_factor
        self.ratings = {}

    def add_player(self, player_name, rating=DEFAULT_RATING):
        """Register ``player_name`` with ``rating``; an existing entry is left untouched."""
        if player_name not in self.ratings:
            self.ratings[player_name] = rating

    def get_rating(self, player_name):
        """Return the stored rating, or ``DEFAULT_RATING`` for an unknown name."""
        return self.ratings.get(player_name, self.DEFAULT_RATING)

    def get_all_ratings(self):
        """Print and return all ``(name, rating)`` pairs, highest rating first.

        The list is returned in addition to being printed, so that callers can
        use the ranking instead of receiving ``None``.
        """
        ranking = sorted(self.ratings.items(), key=lambda x: x[1], reverse=True)
        print(ranking)
        return ranking

    def calculate_expected_score(self, player_a, player_b):
        """Return the Elo expected score of ``player_a`` against ``player_b`` (0..1)."""
        rating_a = self.get_rating(player_a)
        rating_b = self.get_rating(player_b)
        expected_score_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
        return expected_score_a

    def update_ratings(self, winner, loser, performance_factor_winner, performance_factor_loser):
        """Apply one match result to both participants.

        Each side moves by ``k_factor * performance_factor * (actual - expected)``,
        with actual = 1 for the winner and 0 for the loser. Because the two
        performance factors may differ, the winner's gain and the loser's loss
        are not necessarily equal.
        """
        expected_score_winner = self.calculate_expected_score(winner, loser)
        expected_score_loser = 1 - expected_score_winner

        rating_winner = self.get_rating(winner)
        rating_loser = self.get_rating(loser)

        new_rating_winner = rating_winner + self.k_factor * performance_factor_winner * (1 - expected_score_winner)
        new_rating_loser = rating_loser + self.k_factor * performance_factor_loser * (0 - expected_score_loser)

        self.ratings[winner] = new_rating_winner
        self.ratings[loser] = new_rating_loser

The PerformanceCalculator class calculates performance metrics using two models for team and player performance. It offers methods to compute overall performance, team performance, and player performance. It provides flexibility in performance calculation based on input data, winner information, and performance weights.

In [18]:
class PerformanceCalculator:
    """Turns match statistics into performance scores using two fitted classifiers.

    Both models only need a ``predict_proba`` method; the probability at
    position 1 of its output is used as the "chance of winning".
    """

    def __init__(self,team_model,player_model):
        """Store the team-level and player-level models."""
        self.team_performance_calculator = team_model
        self.player_performance_calculator = player_model

    @staticmethod
    def _scaled_win_chances(model, winner_data, loser_data, winner_scale, loser_scale):
        """Score both sides with ``model`` and scale each win probability by its factor."""
        chance_for_winner = model.predict_proba([winner_data])[0][1]
        chance_for_loser = model.predict_proba([loser_data])[0][1]
        return chance_for_winner * winner_scale, chance_for_loser * loser_scale

    def calculate_performance(self,match_data,team_perf_weight,player_perf_weight):
        """Return the weighted, cubed team-model probability of the actual winner.

        ``match_data`` must provide 'team_stats', 'players_stats' and 'winner'.
        The player model is still evaluated, but its term is not part of the
        result, so ``player_perf_weight`` currently has no effect.
        """
        team_features = match_data['team_stats']
        player_features = match_data['players_stats']
        winning_side = match_data['winner']

        team_probs = self.team_performance_calculator.predict_proba([team_features])[0]
        player_probs = self.player_performance_calculator.predict_proba([player_features])[0]

        team_term = team_probs[winning_side] ** 3
        _player_term = player_probs[winning_side] ** 3  # computed but not added to the score
        return team_perf_weight * team_term

    def calculate_team_performance(self,winner_data,loser_data,winner,winner_performance_factor,loser_performance_factor):
        """Return (winner, loser) team-model win chances times their factors.

        ``winner`` is accepted for signature compatibility and is not used.
        """
        return self._scaled_win_chances(
            self.team_performance_calculator,
            winner_data,
            loser_data,
            winner_performance_factor,
            loser_performance_factor,
        )

    def calculate_players_performance(self,winner_data,loser_data,winner,winner_performance_factor,loser_performance_factor):
        """Return (winner, loser) player-model win chances times their factors.

        ``winner`` is accepted for signature compatibility and is not used.
        """
        return self._scaled_win_chances(
            self.player_performance_calculator,
            winner_data,
            loser_data,
            winner_performance_factor,
            loser_performance_factor,
        )

    def calculate_performance_classic(self,teams_performances_data,winner,weights):
        """Return the weighted sum of absolute per-statistic gaps between the two sides.

        ``teams_performances_data`` is read as interleaved values: even positions
        go to one side and odd positions to the other; ``winner == 0`` selects
        the even positions as the winner's values.
        """
        even_slots = teams_performances_data[::2]
        odd_slots = teams_performances_data[1::2]
        if winner == 0:
            winner_data, loser_data = even_slots, odd_slots
        else:
            winner_data, loser_data = odd_slots, even_slots

        gaps = []
        for won_value, lost_value in zip(winner_data, loser_data):
            gaps.append(np.sqrt((won_value - lost_value)**2))
        return np.dot(gaps, weights)

These lines define the S3 locations for the input and output data. Both live in the "elo-rating-data" bucket: the input data for the Elo ratings is read from the "data" prefix, and the calculated results are saved under the "output" prefix.

In [43]:
# S3 location the input CSVs and the pickled models are read from.
bucket = 's3://elo-rating-data/data'
# S3 location the optimisation and evaluation results are written to.
output_bucket = 's3://elo-rating-data/output'

The code reads data from CSV files in an S3 bucket. It processes player and team statistics data using regular expressions to filter out specific columns that match the pattern "ID" or "Champion" from the column names. Player data is sorted by the 'platformGameID' column, while team data is sorted as well but not filtered based on the regex pattern.

In [14]:
# Matches any column name that contains "ID" or "Champion".
regex_pattern = r'.*(ID|Champion).*'

# Player statistics: sort by game first, then keep only the columns whose name
# does NOT match regex_pattern (negative lookahead). The sort has to happen
# before the filter because 'platformGameID' itself contains "ID" and is
# therefore dropped from norm_players_df.
norm_players_stats = pd.read_csv(f'{bucket}/norm_players_stats.csv', sep=';')
norm_players_all_df = norm_players_stats.sort_values(by='platformGameID',ascending=True)
norm_players_df = norm_players_all_df.filter(regex=f'^(?!{regex_pattern}).*$', axis=1)

# Team statistics: sorted the same way but left unfiltered, so the ID columns
# (e.g. 'teamOnlineID_100' / 'teamOnlineID_200', read by get_elo_rating) stay available.
norm_teams_stats = pd.read_csv(f'{bucket}/norm_teams_stats.csv', sep=';')
norm_teams_df = norm_teams_stats.sort_values(by='platformGameID',ascending=True)
# Disabled on purpose: applying the same column filter to the team frame would remove the team IDs.
# norm_teams_df = norm_teams_all_df.filter(regex=f'^(?!{regex_pattern}).*$', axis=1)

The provided code includes two functions. The first function, create_rows_per_team, processes a DataFrame by filtering its columns based on regular expression patterns and adds a 'winner' column derived from the 'winningTeam' values. The second function, get_datasets, prepares data for machine learning by splitting it into features and labels. It offers the flexibility to create a validation dataset if needed. These functions are essential for data preprocessing and dataset splitting within a machine learning workflow.

In [8]:
def create_rows_per_team(df, regex_pattern_100, regex_pattern_200, players_data = False):
    """Reshape one-row-per-match data into one row per side of each match.

    Columns are selected per side with the two regex patterns (each must also
    select 'winningTeam'). A 'winner' column replaces 'winningTeam': it is 1 for
    side 100 when ``winningTeam == 0`` and equals ``winningTeam`` for side 200.
    Column names are then aligned so both sides share the same names:
    with ``players_data`` the slot suffixes _6.._10 of side 200 become _1.._5,
    otherwise the '_100' / '_200' markers are removed.

    Returns the side-100 rows followed by the side-200 rows (original index
    labels are kept, so each label appears twice).
    """
    def _won_as_side_100(winning_team):
        return 1 if winning_team == 0 else 0

    side_100 = df.filter(regex=regex_pattern_100, axis=1).copy()
    side_200 = df.filter(regex=regex_pattern_200, axis=1).copy()

    side_100['winner'] = side_100['winningTeam'].apply(_won_as_side_100)
    side_200['winner'] = side_200['winningTeam']
    side_100 = side_100.drop(columns=['winningTeam'])
    side_200 = side_200.drop(columns=['winningTeam'])

    if players_data:
        # Applied in this order so '_10' is rewritten before any single-digit suffix.
        slot_renames = (('_10', '_5'), ('_9', '_4'), ('_8', '_3'), ('_7', '_2'), ('_6', '_1'))
        aligned_names = []
        for name in side_200.columns:
            for old_suffix, new_suffix in slot_renames:
                name = name.replace(old_suffix, new_suffix)
            aligned_names.append(name)
        side_200.columns = aligned_names
    else:
        side_100.columns = [name.replace('_100','') for name in side_100.columns]
        side_200.columns = [name.replace('_200','') for name in side_200.columns]

    return pd.concat([side_100, side_200])

def get_datasets(df, key_column, validation_dataset=False):
    """Split ``df`` into feature/label arrays for training and evaluation.

    ``key_column`` is the label; every other column is a feature. 30% of the
    rows are held out (random_state=40). With ``validation_dataset`` the
    held-out part is halved again (random_state=0) into test and validation.

    Returns ``X_train, X_test, y_train, y_test`` and, when
    ``validation_dataset`` is true, additionally ``X_val, y_val``.
    """
    features = df.drop(columns=[key_column]).values
    labels = df[key_column].values

    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, labels, test_size=0.30, random_state=40
    )
    if not validation_dataset:
        return X_train, X_holdout, y_train, y_holdout

    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=0.50, random_state=0
    )
    return X_train, X_test, y_train, y_test, X_val, y_val

The code splits two DataFrames, norm_teams_df and norm_players_df, into training and test sets based on a 70/30 split. This is a common practice for preparing data for machine learning or statistical analysis, where one part is used to train models, and the other part is reserved for evaluating the model's performance.

In [47]:
# Ordered 70/30 split: the first ~70% of the rows (in platformGameID order)
# train, the rest test.
# Both frames were reordered by sort_values, so their index labels are not
# guaranteed to be 0..n-1 in row order. The split therefore works on row
# POSITIONS (.iloc) instead of index LABELS (.loc / index[-1]); for a frame
# whose index already is 0..n-1 in order this yields exactly the same rows.
split_index = round((len(norm_teams_df) - 1) * 0.7)  # position of the last training row

teams_train_df = norm_teams_df.iloc[:split_index + 1]
teams_test_df = norm_teams_df.iloc[split_index + 1:]

# The player frame is cut at the same position so team and player rows stay paired.
players_train_df = norm_players_df.iloc[:split_index + 1]
players_test_df = norm_players_df.iloc[split_index + 1:]

The code loads pre-trained machine learning models for team and player performance, then creates a PerformanceCalculator instance that uses these models for performance calculations.

In [19]:
# NOTE: pickle.load can execute arbitrary code from the file it reads; only load
# model files from a bucket you control.
# The S3 file objects are opened in a `with` block so they are closed again
# once the model has been read.
with s3_file.open('{}/{}'.format(bucket, 'team_independent_tm_normalized_teams_stats.sav')) as teams_model_file:
    teams_model = pickle.load(teams_model_file)
with s3_file.open('{}/{}'.format(bucket, 'team_independent_tm_normalized_player_stats.sav')) as players_model_file:
    players_model = pickle.load(players_model_file)

# Shared calculator used by get_elo_rating to score each side of a match.
performance_calculator = PerformanceCalculator(teams_model,players_model)

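This function replays the matches in norm_teams_df and norm_players_df in row order through an Elo rating system configured by the given parameters. Before each match, the current ratings are used to predict the winner; afterwards the ratings are updated, with the size of each side's update scaled by the team and player performance scores produced by performance_calculator. The returned value is the share of matches in which the eventual winner was not the pre-match favourite (matches whose expected score is exactly 0.5 are skipped, and 1 is returned if no prediction could be made). Because the predictions are binary, this error rate equals the mean squared error (MSE) of the predictions, which is why the code calls it mse.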

In [55]:
def get_elo_rating(x,norm_teams_df,norm_players_df):
    """Replay the matches through an Elo system and return its prediction error.

    Parameters
    ----------
    x : sequence of 5 numbers
        ``x[0] * 10`` is the Elo K-factor; ``x[1]`` / ``x[2]`` scale the team-model
        score of the winner / loser; ``x[3]`` / ``x[4]`` scale the player-model
        score of the winner / loser.
    norm_teams_df, norm_players_df : DataFrame
        One row per match; the two frames are walked in parallel, so row i of
        both must describe the same match. The team frame must contain
        'winningTeam', 'teamOnlineID_100' and 'teamOnlineID_200'.

    Returns
    -------
    float or int
        Share of recorded predictions in which the actual winner was not the
        pre-match Elo favourite (matches with an expected score of exactly 0.5
        are not recorded); 1 when nothing was recorded.

    Uses the module-level ``performance_calculator``.
    """
    # Column selectors are the same for every match, so they are defined once.
    team_cols_100 = r'.*(_100|winningTeam)'
    team_cols_200 = r'.*(_200|winningTeam)'
    player_cols_100 = r'.*(_[1-5](?![0-9])|winningTeam)'
    player_cols_200 = r'.*(_6|_7|_8|_9|_10|winningTeam)'

    def _side_features(row, pattern_100, pattern_200, use_side_100, n_trailing):
        # Select one side's columns and cut the last n_trailing selected values
        # (presumably the non-feature columns such as 'winningTeam' -- TODO confirm).
        pattern = pattern_100 if use_side_100 else pattern_200
        return row.filter(regex=pattern).values[:-n_trailing].astype(float)

    rating_system = EloRatingSystem(k_factor=x[0]*10)
    favourite_won = []

    paired_rows = zip(norm_teams_df.iterrows(), norm_players_df.iterrows())
    for (_, team_row), (_, players_row) in paired_rows:
        winner = team_row['winningTeam']

        # winningTeam == 0 -> side 100 won, winningTeam == 1 -> side 200 won.
        if winner == 0:
            winner_team = team_row['teamOnlineID_100']
        else:
            winner_team = team_row['teamOnlineID_200']
        if winner == 1:
            loser_team = team_row['teamOnlineID_100']
        else:
            loser_team = team_row['teamOnlineID_200']

        # Team-level performance of both sides.
        winner_data = _side_features(team_row, team_cols_100, team_cols_200, winner == 0, 2)
        loser_data = _side_features(team_row, team_cols_100, team_cols_200, winner == 1, 2)
        team_perf_winner, team_perf_loser = performance_calculator.calculate_team_performance(
            winner_data, loser_data, winner, x[1], x[2]
        )

        # Player-level performance of both sides.
        winner_players_data = _side_features(players_row, player_cols_100, player_cols_200, winner == 0, 1)
        loser_players_data = _side_features(players_row, player_cols_100, player_cols_200, winner == 1, 1)
        players_perf_winner, players_perf_loser = performance_calculator.calculate_players_performance(
            winner_players_data, loser_players_data, winner, x[3], x[4]
        )

        winning_side_performance = team_perf_winner + players_perf_winner
        losing_side_performance = team_perf_loser + players_perf_loser

        rating_system.add_player(winner_team)
        rating_system.add_player(loser_team)

        # Predict from the ratings BEFORE this match is applied to them.
        expected_score_winner = rating_system.calculate_expected_score(winner_team, loser_team)
        if expected_score_winner != 0.5:
            favourite_won.append(1 if expected_score_winner > 0.5 else 0)

        rating_system.update_ratings(winner_team, loser_team, winning_side_performance, losing_side_performance)

    if len(favourite_won) > 0:
        return (1/len(favourite_won))*(favourite_won.count(0))
    return 1

This function is the objective function handed to the optimizer. It receives a collection of candidate parameter sets, computes the error (MSE) of each one with the get_elo_rating function, prints the resulting values, and returns them as a list. The optimizer uses these values to find the parameter set with the lowest MSE, indicating the best Elo rating configuration.

In [56]:
def optimize_elo_rating(x_all,norm_teams_df,norm_players_df):
    """Objective function for the swarm: score every candidate parameter set.

    ``x_all`` is iterated row by row; each entry is passed to ``get_elo_rating``
    together with the two data frames. The costs are printed (as an array) for
    progress monitoring and returned as a list in the same order as ``x_all``.
    """
    particle_costs = [
        get_elo_rating(candidate, norm_teams_df, norm_players_df)
        for candidate in x_all
    ]
    print(np.array(particle_costs))
    return particle_costs

This code sets up bounds for the parameter values (each of the five parameters is searched between 0 and 5) and specifies the options for Particle Swarm Optimization (PSO). It then creates an optimizer with 10 particles and runs it for 100 iterations, using the optimize_elo_rating function as the objective on the training data for teams and players. The goal is to minimize the Mean Squared Error (MSE) and find optimal parameter values for the Elo rating system.

In [None]:
# Search box: each of the 5 parameters lies in [0, 5]. get_elo_rating multiplies
# the first one by 10, so the K-factor is searched in [0, 50].
max_bound = 5 * np.ones(5)
min_bound = 0 * np.ones(5)
bounds = (min_bound, max_bound)

# PSO hyper-parameters: c1 = cognitive, c2 = social coefficient, w = inertia weight.
options = {'c1': 1.4, 'c2': 1.4, 'w': 0.7}

# Fix the random state so a re-run reproduces the same swarm.
# NOTE(review): relies on pyswarms drawing from NumPy's global RNG -- confirm for
# the installed pyswarms version.
np.random.seed(42)

# Call instance of PSO with bounds argument
optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=5, options=options, bounds=bounds)

# Perform optimization
cost, pos = optimizer.optimize(optimize_elo_rating, iters=100, norm_teams_df=teams_train_df, norm_players_df=players_train_df)

The code prints the optimization results, namely the cost (MSE = 0.361489040) and the optimized parameter values (pos = [0.3094478 , 3.58049065, 3.44078367, 3.57998156, 3.47146142] ), and stores them in a file named 'found_result_1.txt' under the S3 location given by output_bucket. Note that the file is written with pickle, so despite its .txt extension it has to be read back with pickle rather than opened as plain text. This keeps the results available for future reference or analysis.

In [None]:
# Bundle the best cost and the best position returned by the optimiser.
found_solution = {'cost':cost,'pos':pos}
print(found_solution)

# The dict is stored as pickle bytes; the '.txt' name notwithstanding, the file
# must be read back with pickle.load.
with s3_file.open('{}/{}'.format(output_bucket, 'found_result_1.txt'), 'wb') as s3_f:
    pickle.dump(found_solution, s3_f)

In [72]:
# Parameter values reported for the optimisation run above, hard-coded so the
# evaluation below can run without repeating the optimisation. This overwrites
# any `pos` produced by the optimiser cell.
pos = [0.3094478 , 3.58049065, 3.44078367, 3.57998156, 3.47146142]

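The code calculates the Elo rating error for both the training and the test dataset using the optimized parameters, and it saves the resulting report under the S3 location given by output_bucket as 'results_1.txt' (written with pickle, so it has to be read back with pickle).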

In [73]:
# Score the chosen parameters on the training rows and on the held-out rows.
# Each call starts from a fresh Elo table, so the test run does not reuse the
# ratings built up during the training run.
rating_error_train = get_elo_rating(pos,teams_train_df,players_train_df)
rating_error_test = get_elo_rating(pos,teams_test_df,players_test_df)

# Human-readable summary of both error rates.
output_txt = f"""
Elo rating error (training data): {rating_error_train},
Elo rating error (test data): {rating_error_test},
"""

# NOTE(review): the summary string is pickled, so 'results_1.txt' is not a plain
# text file; confirm whether readers expect pickle before switching to raw text.
with s3_file.open('{}/{}'.format(output_bucket, 'results_1.txt'), 'wb') as s3_f:
    pickle.dump(output_txt, s3_f)

In [71]:
# Show the summary text that was written to S3.
output_txt

'\nElo rating error (training data): 0.3614890400604686,\nElo rating error (test data): 0.38564422648238966,\n'