In [30]:
import pandas as pd
import numpy as np
import os

import requests
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)

In [31]:
# Downloads the updated csv for the 2023 season
import gdown

def update_df():
    '''
    Download most recent csv file, concat with rest of data, return the full df
    '''
    output = "lol_2023.csv" #What to save the downloaded file as
    id = "1XXk2LO0CsNADBB1LRGOV5rUpyZdEZ8s2" #The id from the google drive file
    gdown.download(id=id, output=output, quiet=False)
    
    df_2021 = pd.read_csv('lol_2021.csv')
    df_2022 = pd.read_csv('lol_2022.csv')
    df_2023 = pd.read_csv('lol_2023.csv')
    df = pd.concat([df_2021,df_2022,df_2023])
    return df

def get_wiki():
    '''
    Returns chart from wikipedia containing info for Tier 1 and Tier 2 leagues
    Returns tier1, tier2
    '''
    wiki = pd.read_html('https://en.wikipedia.org/wiki/List_of_League_of_Legends_leagues_and_tournaments')
    return wiki[1], wiki[3]

def add_opp_name(df): #tup = list of tuples
    # Create an 'opp_name' column for each row
    evens = range(0,df.shape[0],2)
    odds = range(1,df.shape[0],2)
    tup = [(a,b) for a,b in zip(evens,odds)] # list of tuples
    
    for t in tup: #iterate through list of tuples
        a,b= t #unpack each tuple into two values
        df.loc[a,'opp_name']=df.teamname.loc[b] #create new column w/opp_name
        df.loc[b,'opp_name']=df.teamname.loc[a]
    return df

def add_opp_elo(df): #tup = list of tuples
    # Create an 'opp_name' column for each row
    evens = range(0,df.shape[0],2)
    odds = range(1,df.shape[0],2)
    tup = [(a,b) for a,b in zip(evens,odds)] # list of tuples
    
    for t in tup: #iterate through list of tuples
        a,b= t #unpack each tuple into two values
        df.loc[a,'opp_name']=df.elo.loc[b] #create new column w/opp_name
        df.loc[b,'opp_name']=df.elo.loc[a]
    return df

def win_percent(elo_a,elo_b): #Calculate odds to win based off of elo rankings
    return 1/(1+10**((elo_b-elo_a)/400)) #elo_a is who you're calculating for, elo_b is opponent

def win_prob(x): # x is the American odds (-110,110,etc.) Calculates probability of winning
    if x < 0 :
        x = x*-1
        return x / (x + 100)
    else: 
        return 100 / (x + 100)

def gain_elo(elo,opp_elo,k=32): #Gain elo after a win, k=24, expected = 1
    return int(elo+k*(1-win_percent(elo,opp_elo)))

def lose_elo(elo,opp_elo,k=32): #Lose elo after a loss, k=24, expected = 0
    return int(elo+k*(0-win_percent(elo,opp_elo)))

def tie_elo(elo,opp_elo,k=32): #Lose elo after a loss, k=24, expected = .5
    return int(elo+k*(.5-win_percent(elo,opp_elo)))


def wrangle_df(df):
    leagues = ['LCK','LPL','LEC','LCS','PCS','VCS','CBLOL','LJL','LLA','UL','SL','LFL','LCO','CBLOLA'] # These are my 9 tier 1 leagues that I'll keep
    df = df[df.league.isin(leagues)] #Grab leagues of interest
    df = df[df.position=='team'] #Remove individual player stats
    
    mapping = {'Excel Esports':'Excel','EDward Gaming':'Edward Gaming','KaBuM! Esports':'KaBuM! e-Sports',
     'BISONS ECLUB':'BISONS Eclub','exeed':'Exeed','Grypciocraft Esports':'Grypciocraft',
     'Komil&amp;Friends':'Komil&Friends','IZI Dream':'Izi Dream','Team BDS Academy':'Team BDS.A',
     'FURIA Academy':'FURIA.A','Fluxo Academy':'Fluxo.A','INTZ Academy':'INTZ.A','KaBuM! Academy':'KaBuM! e-Sports.A',
     'LOUD Academy':'LOUD.A','Liberty Academy':'Liberty.A','Los Grandes Academy':'Los Grandes.A',
     'RED Academy':'RED Canids.A','Vivo Keyd Stars Academy':'Vivo Keyd Stars.A','paiN Gaming Academy':'paiN Gaming.A',
     'MAMMOTH':'Mammoth'}

    df.teamname.replace(mapping,inplace=True)
    
    df.split = df.split.str.replace('Split 1','Spring').str.replace('Split 2','Summer') #Rename 'split' names
    df.split = df.split.str.replace('Opening','Spring').str.replace('Closing','Summer')

    cols = ['teamname','league','split','date', 'side', 'gamelength','game', 'result', 'teamkills', 
            'teamdeaths', 'firstblood', 'position', 'dragons', 'barons', 'opp_barons','towers', 'opp_towers', 
            'inhibitors', 'opp_inhibitors', 'damagetochampions', 'damagetakenperminute', 'wardsplaced', 'wardskilled', 
            'controlwardsbought', 'totalgold', 'gspd'] #Columns to keep

    df = df[cols] #Remove unwanted columns
    df = df.dropna() #Drop nan values
    
    df.date = pd.to_datetime(df.date,infer_datetime_format=True) #Change to datetime object
    del df['position'] # Delete 'position' column
    df = df.sort_values('date') #Sort by date
    df = df.reset_index(drop=True) #Reset index
    df.side = np.where(df.side=='Blue',1,0) #Add 'side' column for 'blue' or 'red'
    df.rename(columns={'side':'blue_side'},inplace = True) #Change 'side' to 'blue_side'
    
    df['old_elo']=np.NaN #create new elo column
    df['new_elo']=np.NaN
    df['opp_elo']=np.NaN
    df.loc[df[~df.teamname.duplicated()].index,'old_elo']=1200 #set elo for first game to 1200 for each team

    df = add_opp_name(df) #adds opponents' name
    
    for i in range(0,df.shape[0]):
        opp_name = df.loc[i,'opp_name']
        df.loc[i,'opp_elo'] = df[(df.teamname==opp_name)&(~df.old_elo.isna())]['old_elo'].iloc[-1]
        if df.loc[i,'result'] == 1:
            df.loc[i,'new_elo'] = gain_elo(df.loc[i,'old_elo'],df.loc[i,'opp_elo'])
        else:
            df.loc[i,'new_elo'] = lose_elo(df.loc[i,'old_elo'],df.loc[i,'opp_elo'])

        team_name = df.loc[i,'teamname']
        try:
            next_game_index = df[(df.teamname==team_name)&df.old_elo.isna()]['old_elo'].index[0]
            df.loc[next_game_index,'old_elo'] = df[(df.teamname==team_name)&(~df.new_elo.isna())]['new_elo'].iloc[-1]
        except:
            pass
    
    df.to_csv('final.csv') #Save to csv file
    
    return df

    

# Update and wrangle professional games for Spring and Summer splits

In [None]:
df = update_df()
df.to_csv('raw.csv')
df = wrangle_df(df)

Downloading...
From: https://drive.google.com/uc?id=1XXk2LO0CsNADBB1LRGOV5rUpyZdEZ8s2
To: /Users/thegootch/codeup-data-science/league/lol_2023.csv
100%|██████████████████████████████████████| 37.4M/37.4M [00:03<00:00, 10.3MB/s]
  df = update_df()


In [None]:
df[['teamname','league','date','result','opp_name']].tail()

# Sports Betting Section

In [5]:
def find_differing_values(series1,series2): #Takes 2 pd Series with string values and returns values that aren't in both
    # Find values in series1 but not in series2
    values_in_series1 = series1[~series1.isin(series2)]

    # Find values in series2 but not in series1
    values_in_series2 = series2[~series2.isin(series1)]

    # Print the results
    print("Values in series1 but not in series2:")
    print(values_in_series1)

    print("Values in series2 but not in series1:")
    print(values_in_series2)

def get_league(df, league_name): #Returns a league ("LCS,LPL,etc.") sorted by latest elo
    '''
    pass in 2 parameters:
    df, league_name
    '''
    return df[df.league==league_name].sort_values('new_elo',ascending=False)

def get_team(df, team,how_many):
    '''
    pass in 3 parameters:
    df, teamname, how many results you want
    '''
    return df[df.teamname==team][['teamname','opp_name','date','result','old_elo','opp_elo','new_elo']].\
sort_values(by='date',ascending = False).head(how_many)

def single_game_odds(df, teams, opponents, bet_odds):
    """
    Returns a DataFrame with 'home' and 'away' teams with elos and implied odds.
    """
    team_list = []
    for team, opponent, odds in zip(teams, opponents, bet_odds):
        row1 = df[df.teamname==team][['teamname', 'new_elo']]
        row1['odds'] = odds[0]
        row2 = df[df.teamname==opponent][['teamname', 'new_elo']]
        row2['odds'] = odds[1]
        row3 = pd.concat([row1, row2])
        row3.columns = ['teamname', 'elo', 'odds']
        row3['opponent'] = [row2.teamname.iloc[0], row1.teamname.iloc[0]]
        row3['next_opp_elo'] = [row2.new_elo.iloc[0], row1.new_elo.iloc[0]]
        team_list.append(row3)
    temp = pd.concat(team_list)
    temp = temp[['teamname', 'elo', 'opponent', 'next_opp_elo', 'odds']]
    temp['implied_odds'] = temp.odds.apply(win_prob)
    temp['elo_odds'] = temp.apply(lambda row: win_percent(row['elo'], row['next_opp_elo']), axis=1)
    temp['odds_diff'] = temp.elo_odds - temp.implied_odds
    temp.sort_values('odds_diff', ascending=False, inplace=True)
    return temp.reset_index(drop=True)

import math

def series_3(probability):
    num_wins_required = 2
    num_games_required = (num_wins_required * 2) - 1

    # Calculate the probability of winning a single game
    p_win = probability

    # Calculate the probability of losing a single game
    p_loss = 1 - p_win

    # Calculate the odds of winning a best-of-3 series
    odds = 0

    for wins in range(num_wins_required, num_games_required + 1):
        # Calculate the number of combinations to achieve the current number of wins
        combinations = math.comb(num_games_required, wins)

        # Calculate the probability of achieving the current number of wins
        p_current_wins = p_win ** wins * p_loss ** (num_games_required - wins)

        odds += combinations * p_current_wins

    return odds


def series_5(probability):
    num_wins_required = 3
    num_games_required = (num_wins_required * 2) - 1

    # Calculate the probability of winning a single game
    p_win = probability

    # Calculate the probability of losing a single game
    p_loss = 1 - p_win

    # Calculate the odds of winning a best-of-5 series
    odds = 0

    for wins in range(num_wins_required, num_games_required + 1):
        # Calculate the number of combinations to achieve the current number of wins
        combinations = math.comb(num_games_required, wins)

        # Calculate the probability of achieving the current number of wins
        p_current_wins = p_win ** wins * p_loss ** (num_games_required - wins)

        odds += combinations * p_current_wins

    return odds


def best_of_3_odds(df, teams, opponents, bet_odds):
    """
    Returns a DataFrame with 'home' and 'away' teams with elos and implied odds.
    """
    team_list = []
    for team, opponent, odds in zip(teams, opponents, bet_odds):
        row1 = df[df.teamname==team][['teamname', 'new_elo']]
        row1['odds'] = odds[0]
        row2 = df[df.teamname==opponent][['teamname', 'new_elo']]
        row2['odds'] = odds[1]
        row3 = pd.concat([row1, row2])
        row3.columns = ['teamname', 'elo', 'odds']
        row3['opponent'] = [row2.teamname.iloc[0], row1.teamname.iloc[0]]
        row3['next_opp_elo'] = [row2.new_elo.iloc[0], row1.new_elo.iloc[0]]
        team_list.append(row3)
    temp = pd.concat(team_list)
    temp = temp[['teamname', 'elo', 'opponent', 'next_opp_elo', 'odds']]
    temp['implied_odds'] = temp.odds.apply(win_prob)
    temp['elo_odds'] = temp.apply(lambda row: win_percent(row['elo'], row['next_opp_elo']), axis=1)
    temp['series_odds'] = temp.apply(lambda row: series_3(row['elo_odds']), axis=1)
    temp['odds_diff'] = temp.series_odds - temp.implied_odds
    temp.sort_values('odds_diff', ascending=False, inplace=True)
    return temp.reset_index(drop=True)

In [6]:
def calc_odds_diff(df):
    df['implied_odds'] = df.odds.apply(win_prob)
    df['elo_odds'] = df.apply(lambda row: win_percent(row['new_elo'], row['opp_elo']), axis=1)
    df['series_odds_3'] = df.apply(lambda row: series_3(row['elo_odds']), axis=1)
    df['series_odds_5'] = df.apply(lambda row: series_5(row['elo_odds']), axis=1)
    df['odds_diff'] = df.elo_odds - df.implied_odds
    df['odds_diff_3'] = df.series_odds_3 - df.implied_odds
    df['odds_diff_5'] = df.series_odds_5 - df.implied_odds
    return df

In [7]:
#Read in df and create 'current_elo' df
df = pd.read_csv('final.csv',index_col=0) 
raw = pd.read_csv('raw.csv',index_col=0)

#'current_elo' contains every team and their latest elo
current_elo = df[~df.teamname.duplicated(keep='last')]\
[['teamname','league','opp_name','old_elo','opp_elo','new_elo']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
get_league(current_elo,"LFL")

Unnamed: 0,teamname,league,opp_name,old_elo,opp_elo,new_elo
20136,Aegis,LFL,Vitality.Bee,1305.0,1212.0,1316.0
20133,LDLC OL,LFL,Karmine Corp,1318.0,1173.0,1295.0
20129,Team GO,LFL,Team BDS.A,1271.0,1252.0,1286.0
20128,Team BDS.A,LFL,Team GO,1252.0,1271.0,1236.0
20124,BK ROG Esports,LFL,Izi Dream,1208.0,1018.0,1216.0
20137,Vitality.Bee,LFL,Aegis,1212.0,1305.0,1200.0
20132,Karmine Corp,LFL,LDLC OL,1173.0,1318.0,1195.0
14416,Misfits Premier,LFL,Vitality.Bee,1204.0,1295.0,1192.0
20140,Solary,LFL,GameWard,1157.0,1153.0,1172.0
20141,GameWard,LFL,Solary,1153.0,1157.0,1137.0


In [9]:
# #Input home and away teams with their odds and get back a df with the difference between betting odds and elo odds
# home = ['Team GO','Vitality.Bee','Karmine Corp','Aegis','IZI Dream']
# away = ['LDLC OL','BK ROG Esports','Team BDS Academy','Solary','GameWard']
# odds = [(110,-150),(-275,200),(-188,133),(-125,-110),(150,-200)]
# single = single_game_odds(current_elo,home,away,odds)
# # series_odds = [(-250,175),(-163,120),(333,-500),(500,-900),(175,-250)]
# # series = best_of_3_odds(current_elo,home,away,series_odds)

In [10]:
get_team(df,'Rare Atom',4)

Unnamed: 0,teamname,opp_name,date,result,old_elo,opp_elo,new_elo
20234,Rare Atom,LGD Gaming,2023-06-11 10:58:51,1,1012.0,981.0,1026.0
20231,Rare Atom,LGD Gaming,2023-06-11 10:07:17,1,997.0,981.0,1012.0
20224,Rare Atom,LGD Gaming,2023-06-11 09:15:39,0,1015.0,981.0,997.0
20104,Rare Atom,FunPlus Phoenix,2023-06-08 08:03:46,1,1000.0,994.0,1015.0


# Open html file and parse with BeautifulSoup


In [18]:
import pandas as pd
from bs4 import BeautifulSoup
df = pd.read_csv('final.csv')

CBLOL - https://www.co.bet365.com/#/AC/B151/C20889769/D48/E1510001/F10/  
CBLOLA - https://www.co.bet365.com/#/AC/B151/C20890093/D48/E1510001/F10/  
LCK - https://www.co.bet365.com/#/AC/B151/C20889854/D48/E1510001/F10/  
LCO - https://www.co.bet365.com/#/AC/B151/C20890087/D48/E1510001/F10/  
LEC - https://www.co.bet365.com/#/AC/B151/C20890316/D48/E1510001/F10/  
LFL - https://www.co.bet365.com/#/AC/B151/C20890217/D48/E1510001/F10/  
LPL - https://www.co.bet365.com/#/AC/B151/C20889805/D48/E1510001/F10/  
SL - https://www.co.bet365.com/#/AC/B151/C20890120/D48/E1510001/F10/   
UL - https://www.co.bet365.com/?_h=stk6mbH5dnUh1pzKwtn2RQ%3D%3D#/AC/B151/C20889870/D48/E1510001/F10/
LCS - 
VCS, LJL, LLA, LCO



In [13]:
#Parses html files for each league to pull updated odds from bet365
html_files =['lec.html','cblola.html','cblol.html','lck.html','lco.html','lfl.html','lpl.html','sl.html','ul.html']
league_name = ['lec','cblola','cblol','lck','lco','lfl','lpl','sl','ul']

temp = []
for html, name in zip(html_files,league_name):
    file_path = html  # Specify the path to the HTML file in the local directory

    # Read the HTML content from the file
    with open(file_path, 'r', encoding='utf-8') as f:
        html = f.read()

    soup = BeautifulSoup(html,'html.parser')
    
    # Get list of teams
    team_html = soup.select('div.ses-ParticipantFixtureDetailsHigherEsports_Team')
    team_list = [i.text for i in team_html]

    # Get list of odds for each team
    span_element = soup.find_all('span', class_='src-ParticipantOddsOnly50_Odds')
    odds_list = [int(i.text) for i in span_element]
    
    df = pd.DataFrame({'teamname':team_list,'odds':odds_list})
    df = add_opp_name(df)
    df['league'] = name
    temp.append(df)
    
bet = pd.concat(temp)

In [14]:
current_elo.head()

Unnamed: 0,teamname,league,opp_name,old_elo,opp_elo,new_elo
2057,7more7 Pompa Team,UL,K1CK,1240.0,1191.0,1221.0
2688,Cruzeiro eSports,CBLOL,KaBuM! e-Sports,1116.0,1205.0,1136.0
2795,Cruzeiro Academy,CBLOLA,KaBuM! e-Sports.A,1101.0,1139.0,1086.0
3105,eStar,LPL,Suning,985.0,1272.0,979.0
5842,PDW,UL,Komil&Friends,1210.0,1058.0,1219.0


In [15]:
# Create df with implied odds, elo odds, and the difference for all upcoming games
temp = pd.merge(bet,current_elo[['teamname','new_elo']],on='teamname')
monies = pd.merge(temp,current_elo[['teamname','new_elo']],left_on='opp_name',right_on='teamname')
del monies['teamname_y']
monies.columns = ['teamname','odds','opp_name','league','new_elo','opp_elo']
monies = calc_odds_diff(monies)
monies1 =monies[['teamname','league','odds','opp_name','new_elo','opp_elo','odds_diff']].sort_values(['league','odds_diff'],ascending=False)
monies1 = monies1[monies1.odds_diff>.08]
monies3 =monies[['teamname','league','odds','opp_name','new_elo','opp_elo','odds_diff_3']].sort_values(['league','odds_diff_3'],ascending=False)
monies3 = monies3[monies3.odds_diff_3>.08]

In [29]:
get_team(df,"Anyone's Legend",20)

Unnamed: 0,teamname,opp_name,date,result,old_elo,opp_elo,new_elo
20147,Anyone's Legend,Ninjas in Pyjamas,2023-06-09 08:18:50,0,957.0,1051.0,945.0
20142,Anyone's Legend,Ninjas in Pyjamas,2023-06-09 07:18:29,0,970.0,1039.0,957.0
20003,Anyone's Legend,ThunderTalk Gaming,2023-06-05 13:02:30,1,952.0,1012.0,970.0
20000,Anyone's Legend,ThunderTalk Gaming,2023-06-05 12:04:43,1,931.0,1053.0,952.0
19997,Anyone's Legend,ThunderTalk Gaming,2023-06-05 11:15:54,0,943.0,1053.0,931.0
19940,Anyone's Legend,Team WE,2023-06-02 08:18:30,0,955.0,1066.0,943.0
19938,Anyone's Legend,Team WE,2023-06-02 07:20:38,0,968.0,1055.0,955.0
19826,Anyone's Legend,Ultra Prime,2023-05-29 11:01:22,0,988.0,900.0,968.0
19824,Anyone's Legend,Ultra Prime,2023-05-29 10:16:12,0,1010.0,880.0,988.0
19823,Anyone's Legend,Ultra Prime,2023-05-29 09:20:41,1,1000.0,880.0,1010.0


In [24]:
monies1[~(monies1.league=='lck')&~(monies1.league=='lpl')]

Unnamed: 0,teamname,league,odds,opp_name,new_elo,opp_elo,odds_diff
158,Grypciocraft,ul,333,Alior Bank Team,1199.0,1160.0,0.324944
172,Illuminar Gaming,ul,162,Orbit Anonymo,1191.0,1205.0,0.098184
160,Grypciocraft,ul,500,Zero Tenacity,1199.0,1379.0,0.095224
154,Giants,sl,137,Barça eSports,1212.0,1206.0,0.086693
113,Aegis,lfl,150,Team GO,1316.0,1286.0,0.143066
103,Solary,lfl,-138,Izi Dream,1172.0,1009.0,0.138925
117,GameWard,lfl,250,Karmine Corp,1137.0,1195.0,0.131584
102,Team BDS.A,lfl,-200,Izi Dream,1236.0,1009.0,0.120297
116,Vitality.Bee,lfl,150,Karmine Corp,1200.0,1195.0,0.107195
1,Team Heretics,lec,250,Team Vitality,1113.0,1161.0,0.145644


In [25]:
monies3[(monies3.league=='lck')|(monies3.league=='lpl')]

Unnamed: 0,teamname,league,odds,opp_name,new_elo,opp_elo,odds_diff_3
88,OKSavingsBank BRION,lck,700,Dplus KIA,1152.0,1161.0,0.355581
87,OKSavingsBank BRION,lck,800,T1,1152.0,1309.0,0.090287


In [28]:
monies[monies.league=='lpl'].sort_values(['league','odds_diff_3'],ascending=False)

Unnamed: 0,teamname,odds,opp_name,league,new_elo,opp_elo,implied_odds,elo_odds,series_odds_3,series_odds_5,odds_diff,odds_diff_3,odds_diff_5
143,Rare Atom,600,LNG Esports,lpl,1026.0,1179.0,0.142857,0.293025,0.20727,0.153975,0.150168,0.064413,0.011118
127,JD Gaming,-1600,Anyone's Legend,lpl,1350.0,945.0,0.941176,0.911442,0.977861,0.993945,-0.029735,0.036685,0.052768
123,Bilibili Gaming,-700,ThunderTalk Gaming,lpl,1269.0,1013.0,0.875,0.813612,0.908729,0.952002,-0.061388,0.033729,0.077002
131,JD Gaming,-700,Royal Never Give Up,lpl,1350.0,1109.0,0.875,0.800162,0.896156,0.942204,-0.074838,0.021156,0.067204
133,Royal Never Give Up,-275,FunPlus Phoenix,lpl,1109.0,985.0,0.733333,0.671241,0.746818,0.796853,-0.062093,0.013485,0.063519
126,Bilibili Gaming,-1600,Anyone's Legend,lpl,1269.0,945.0,0.941176,0.86589,0.950867,0.980471,-0.075287,0.009691,0.039295
119,Ninjas in Pyjamas,-334,Ultra Prime,lpl,1062.0,928.0,0.769585,0.683816,0.763303,0.81486,-0.085769,-0.006283,0.045275
125,Oh My God,-275,Ninjas in Pyjamas,lpl,1174.0,1062.0,0.733333,0.655821,0.726164,0.773798,-0.077513,-0.007169,0.040465
134,Top Esports,-800,FunPlus Phoenix,lpl,1202.0,985.0,0.888889,0.777153,0.873151,0.923028,-0.111736,-0.015738,0.034139
138,LGD Gaming,700,Top Esports,lpl,966.0,1202.0,0.125,0.20448,0.108337,0.061419,0.07948,-0.016663,-0.063581


# Don't go past here yet

In [16]:
numerical = train.select_dtypes(['int','float']).columns

NameError: name 'train' is not defined

In [None]:
def create_target(groupby):
    groupby['target']=groupby['result'].shift(-1)
    return groupby

In [None]:
def add_target(df):
    df = df.groupby('teamname').apply(create_target)
    df.loc[pd.isnull(df.target),'target'] =2
    df.target = df.target.astype(int,errors='ignore')
    return df

In [None]:
df = add_target(df)

In [None]:
from sklearn.preprocessing import MinMaxScaler #scale all numerical columns

removed_columns = ['teamname','league','date','target','opp_name']
selected_columns = df.columns[~df.columns.isin(removed_columns)]

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])


In [None]:
# Create rolling averages for columns, concat as new columns to df


def rolling(team):
    rolling = team.rolling(10).mean()
    return rolling

def add_rolling(df):
    cols = ['gamelength','teamkills','teamdeaths','firstblood','dragons','barons','opp_barons','towers','opp_towers',\
       'inhibitors','opp_inhibitors','damagetochampions','damagetakenperminute','wardsplaced','wardskilled',\
       'controlwardsbought','totalgold','gspd']

    df_rolling=df[list(cols)+['teamname']]
    
    
    df_rolling = df_rolling.groupby('teamname',group_keys=False)[cols].apply(rolling)

    rolling_cols = [f'{col}_rolling' for col in df_rolling.columns]
    df_rolling.columns = rolling_cols
    df = pd.concat([df,df_rolling],axis=1)
    return df.dropna()

In [None]:
def next_opp(team):
    team['next_opp'] = team['opp_name'].shift(-1)
    return team
def add_opp(df):
    df = df.groupby('teamname').apply(next_opp)
    df.loc[df.next_opp.isnull(),'next_opp'] = 2
    return df

In [None]:
add_opp(df)

In [None]:
def next_side(team):
    team['next_blue'] = team['blue_side'].shift(-1)
    return team

def add_next_side(df):
    df = df.groupby('teamname').apply(next_side)
    df.loc[df.next_blue.isnull(),'next_blue']=2
    df.next_blue = df.next_blue.astype(int,errors='ignore')
    return df

In [None]:
def next_date(team):
    team['next_date'] = team['date'].shift(-1)
    return team

def add_next_date(df):
    df = df.groupby('teamname').apply(next_date)
    df.loc[df.next_date.isnull(),'next_date']=2
    return df

In [None]:
full = df.merge(df[rolling_cols + ["next_opp", "next_date", "teamname"]], left_on=["teamname", "next_date"], \
                right_on=["next_opp", "next_date"])

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier

def create_objects():
    rr = RidgeClassifier(solver ='sag',normalize=False,)
    split = TimeSeriesSplit(n_splits=3)
    sfs = SequentialFeatureSelector(rr, n_features_to_select=14,direction='backward',cv=split,n_jobs=-1)

create_objects()

In [None]:
removed_columns = list(full.columns[full.dtypes=='object']) + removed_columns
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [None]:
sfs.fit(full[selected_columns],full['target'])

In [None]:
selectors = selected_columns[sfs.get_support()]

In [None]:
predictions = backtest(full,rr,selectors,'target')

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(predictions.actual,predictions.prediction)

.5665 'forward', rr

In [None]:
def near_split(x, num_bins): #Split my df into equal splits to perform backtesting
    quotient, remainder = divmod(x, num_bins)
    bins = [quotient + 1] * remainder + [quotient] * (num_bins - remainder)
    count = 0
    new_list = []
    for b in bins:
        count += b
        new_list.append(count)
    return new_list

splits = near_split(df.shape[0],5)
last_split = splits[4]-splits[3] #Difference between last two values for final 'test' set

In [None]:
def backtest(data,model,predictors,target):
    all_predictions= []
    
    for i in range(0,len(splits)-1):
        train = data.loc[:splits[i]]
        test = data.loc[splits[i]:splits[i]+last_split]
        
        model.fit(train[predictors],train[target])
        preds = model.predict(test[predictors])
        preds = pd.Series(preds,index=test.index)
        combined = pd.concat([test[target],preds],axis=1)
        combined.columns = ['actual','prediction']
        
        all_predictions.append(combined)
        
    return pd.concat(all_predictions)
        
        
        
        


In [None]:
predictions = backtest(full,rr,selectors,'target')

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(predictions.actual,predictions.prediction)

### optimize ridge regression

In [None]:
from sklearn import decomposition
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score

In [None]:
X = full[selectors]
y = full['target']

In [None]:
pca = decomposition.PCA()
ridge = linear_model.Ridge()

In [None]:
pipe = Pipeline(steps=[("pca", pca),
                        ("ridge", ridge)])

In [None]:
n_components = list(range(1,X.shape[1]+1,1))
normalize = [True, False]
solver = ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
parameters = dict(pca__n_components=n_components,
                      ridge__normalize=normalize,
                      ridge__solver=solver)

In [None]:
clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(X, y)

In [None]:
clf_GS.best_params_

In [None]:
print("Best Number Of Components:", clf_GS.best_estimator_.get_params()["pca__n_components"])
print(); print(clf_GS.best_estimator_.get_params()["ridge"])

