In [191]:
import pandas as pd
import numpy as np
import os

import requests
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)

In [167]:
# Downloads the updated csv for the 2023 season
import gdown

def update_df():
    '''
    Download most recent csv file, concat with rest of data, return the full df
    '''
    output = "lol_2023.csv" #What to save the downloaded file as
    id = "1XXk2LO0CsNADBB1LRGOV5rUpyZdEZ8s2" #The id from the google drive file
    gdown.download(id=id, output=output, quiet=False)
    
    df_2021 = pd.read_csv('lol_2021.csv')
    df_2022 = pd.read_csv('lol_2022.csv')
    df_2023 = pd.read_csv('lol_2023.csv')
    df = pd.concat([df_2021,df_2022,df_2023])
    return df

def get_wiki():
    '''
    Returns chart from wikipedia containing info for Tier 1 and Tier 2 leagues
    Returns tier1, tier2
    '''
    wiki = pd.read_html('https://en.wikipedia.org/wiki/List_of_League_of_Legends_leagues_and_tournaments')
    return wiki[1], wiki[3]

def add_opp_name(df): #tup = list of tuples
    # Create an 'opp_name' column for each row
    evens = range(0,df.shape[0],2)
    odds = range(1,df.shape[0],2)
    tup = [(a,b) for a,b in zip(evens,odds)] # list of tuples
    
    for t in tup: #iterate through list of tuples
        a,b= t #unpack each tuple into two values
        df.loc[a,'opp_name']=df.teamname.loc[b] #create new column w/opp_name
        df.loc[b,'opp_name']=df.teamname.loc[a]
    return df

def win_percent(elo_a,elo_b): #Calculate odds to win based off of elo rankings
    return 1/(1+10**((elo_b-elo_a)/400)) #elo_a is who you're calculating for, elo_b is opponent

def win_prob(x): # x is the American odds (-110,110,etc.) Calculates probability of winning
    if x < 0 :
        x = x*-1
        return x / (x + 100)
    else: 
        return 100 / (x + 100)

def gain_elo(elo,opp_elo,k=32): #Gain elo after a win, k=24, expected = 1
    return int(elo+k*(1-win_percent(elo,opp_elo)))

def lose_elo(elo,opp_elo,k=32): #Lose elo after a loss, k=24, expected = 0
    return int(elo+k*(0-win_percent(elo,opp_elo)))

def tie_elo(elo,opp_elo,k=32): #Lose elo after a loss, k=24, expected = .5
    return int(elo+k*(.5-win_percent(elo,opp_elo)))


def wrangle_df(df):
    leagues = ['LCK','LPL','LEC','LCS','PCS','VCS','CBLOL','LJL','LLA','UL','SL','LFL','LCO','CBLOLA'] # These are my 9 tier 1 leagues that I'll keep
    df = df[df.league.isin(leagues)] #Filter out non Tier-1 leagues
    df = df[df.position=='team'] #Remove individual player stats
    
    df.split = df.split.str.replace('Split 1','Spring').str.replace('Split 2','Summer') #Rename 'split' names
    df.split = df.split.str.replace('Opening','Spring').str.replace('Closing','Summer')

    cols = ['teamname','league','split','date', 'side', 'gamelength','game', 'result', 'teamkills', 
            'teamdeaths', 'firstblood', 'position', 'dragons', 'barons', 'opp_barons','towers', 'opp_towers', 
            'inhibitors', 'opp_inhibitors', 'damagetochampions', 'damagetakenperminute', 'wardsplaced', 'wardskilled', 
            'controlwardsbought', 'totalgold', 'gspd'] #Columns to keep

    df = df[cols] #Remove unwanted columns
    df = df.dropna() #Drop nan values
    
    df.date = pd.to_datetime(df.date,infer_datetime_format=True) #Change to datetime object
    del df['position'] # Delete 'position' column
    df = df.sort_values('date') #Sort by date
    df = df.reset_index(drop=True) #Reset index
    df.side = np.where(df.side=='Blue',1,0) #Add 'side' column for 'blue' or 'red'
    df.rename(columns={'side':'blue_side'},inplace = True) #Change 'side' to 'blue_side'
    
    df['old_elo']=np.NaN #create new elo column
    df['new_elo']=np.NaN
    df['opp_elo']=np.NaN
    df.loc[df[~df.teamname.duplicated()].index,'old_elo']=1200 #set elo for first game to 1200 for each team

    df = add_opp_name(df) #adds opponents' name
    
    for i in range(0,df.shape[0]):
        opp_name = df.loc[i,'opp_name']
        df.loc[i,'opp_elo'] = df[(df.teamname==opp_name)&(~df.old_elo.isna())]['old_elo'].iloc[-1]
        if df.loc[i,'result'] == 1:
            df.loc[i,'new_elo'] = gain_elo(df.loc[i,'old_elo'],df.loc[i,'opp_elo'])
        else:
            df.loc[i,'new_elo'] = lose_elo(df.loc[i,'old_elo'],df.loc[i,'opp_elo'])

        team_name = df.loc[i,'teamname']
        try:
            next_game_index = df[(df.teamname==team_name)&df.old_elo.isna()]['old_elo'].index[0]
            df.loc[next_game_index,'old_elo'] = df[(df.teamname==team_name)&(~df.new_elo.isna())]['new_elo'].iloc[-1]
        except:
            pass
    
    df.to_csv('final.csv') #Save to csv file
    
    return df

    

# Update and wrangle professional games for Spring and Summer splits

In [168]:
df = update_df()
df.to_csv('raw.csv')
df = wrangle_df(df)

Downloading...
From: https://drive.google.com/uc?id=1XXk2LO0CsNADBB1LRGOV5rUpyZdEZ8s2
To: /Users/thegootch/codeup-data-science/league/lol_2023.csv
100%|██████████████████████████████████████| 34.1M/34.1M [00:06<00:00, 5.57MB/s]
  df = update_df()


In [169]:
df.tail()

Unnamed: 0,teamname,league,split,date,blue_side,gamelength,game,result,teamkills,teamdeaths,firstblood,dragons,barons,opp_barons,towers,opp_towers,inhibitors,opp_inhibitors,damagetochampions,damagetakenperminute,wardsplaced,wardskilled,controlwardsbought,totalgold,gspd,old_elo,new_elo,opp_elo,opp_name
19847,Los Heretics,SL,Summer,2023-05-29 19:02:26,0,1883,1.0,1,20,13,1.0,4.0,2.0,0.0,9.0,3.0,2.0,0.0,79420.0,2828.4121,82.0,46.0,33.0,62736,0.099598,1260.0,1279.0,1331.0,Movistar Riders
19848,Team ESCA Gaming,UL,Summer,2023-05-29 19:23:19,1,1701,1.0,0,3,11,0.0,1.0,0.0,2.0,1.0,11.0,0.0,1.0,39415.0,2887.9718,99.0,36.0,25.0,43288,-0.163263,1056.0,1046.0,1196.0,Illuminar Gaming
19849,Illuminar Gaming,UL,Summer,2023-05-29 19:23:19,0,1701,1.0,1,11,3,1.0,3.0,2.0,0.0,11.0,1.0,1.0,0.0,63789.0,2425.2557,83.0,44.0,27.0,55731,0.163263,1196.0,1205.0,1056.0,Team ESCA Gaming
19850,Giants,SL,Summer,2023-05-29 20:00:17,1,2042,1.0,1,22,3,1.0,4.0,1.0,0.0,11.0,1.0,3.0,0.0,96098.0,2368.2958,91.0,35.0,35.0,67497,0.231077,1207.0,1225.0,1259.0,Fnatic TQ
19851,Fnatic TQ,SL,Summer,2023-05-29 20:00:17,0,2042,1.0,0,3,22,0.0,1.0,0.0,1.0,1.0,11.0,0.0,3.0,45895.0,3564.7405,88.0,35.0,27.0,51651,-0.231077,1259.0,1240.0,1207.0,Giants


# Sports Betting Section

In [170]:
def get_league(team): #Returns a league ("LCS,LPL,etc.") sorted by latest elo
    return current_elo[current_elo.league==team].sort_values('new_elo',ascending=False)

def get_team(df, team):
    return df[df.teamname==team].sort_values(by='date',ascending = False)

def single_game_odds(df, teams, opponents, bet_odds):
    """
    Returns a DataFrame with 'home' and 'away' teams with elos and implied odds.
    """
    team_list = []
    for team, opponent, odds in zip(teams, opponents, bet_odds):
        row1 = df[df.teamname==team][['teamname', 'new_elo']]
        row1['odds'] = odds[0]
        row2 = df[df.teamname==opponent][['teamname', 'new_elo']]
        row2['odds'] = odds[1]
        row3 = pd.concat([row1, row2])
        row3.columns = ['teamname', 'elo', 'odds']
        row3['opponent'] = [row2.teamname.iloc[0], row1.teamname.iloc[0]]
        row3['next_opp_elo'] = [row2.new_elo.iloc[0], row1.new_elo.iloc[0]]
        team_list.append(row3)
    temp = pd.concat(team_list)
    temp = temp[['teamname', 'elo', 'opponent', 'next_opp_elo', 'odds']]
    temp['implied_odds'] = temp.odds.apply(win_prob)
    temp['elo_odds'] = temp.apply(lambda row: win_percent(row['elo'], row['next_opp_elo']), axis=1)
    temp['odds_diff'] = temp.elo_odds - temp.implied_odds
    temp.sort_values('odds_diff', ascending=False, inplace=True)
    return temp.reset_index(drop=True)

import math

def series_3(probability):
    num_wins_required = 2
    num_games_required = (num_wins_required * 2) - 1

    # Calculate the probability of winning a single game
    p_win = probability

    # Calculate the probability of losing a single game
    p_loss = 1 - p_win

    # Calculate the odds of winning a best-of-3 series
    odds = 0

    for wins in range(num_wins_required, num_games_required + 1):
        # Calculate the number of combinations to achieve the current number of wins
        combinations = math.comb(num_games_required, wins)

        # Calculate the probability of achieving the current number of wins
        p_current_wins = p_win ** wins * p_loss ** (num_games_required - wins)

        odds += combinations * p_current_wins

    return odds


def series_5(probability):
    num_wins_required = 3
    num_games_required = (num_wins_required * 2) - 1

    # Calculate the probability of winning a single game
    p_win = probability

    # Calculate the probability of losing a single game
    p_loss = 1 - p_win

    # Calculate the odds of winning a best-of-5 series
    odds = 0

    for wins in range(num_wins_required, num_games_required + 1):
        # Calculate the number of combinations to achieve the current number of wins
        combinations = math.comb(num_games_required, wins)

        # Calculate the probability of achieving the current number of wins
        p_current_wins = p_win ** wins * p_loss ** (num_games_required - wins)

        odds += combinations * p_current_wins

    return odds


def best_of_3_odds(df, teams, opponents, bet_odds):
    """
    Returns a DataFrame with 'home' and 'away' teams with elos and implied odds.
    """
    team_list = []
    for team, opponent, odds in zip(teams, opponents, bet_odds):
        row1 = df[df.teamname==team][['teamname', 'new_elo']]
        row1['odds'] = odds[0]
        row2 = df[df.teamname==opponent][['teamname', 'new_elo']]
        row2['odds'] = odds[1]
        row3 = pd.concat([row1, row2])
        row3.columns = ['teamname', 'elo', 'odds']
        row3['opponent'] = [row2.teamname.iloc[0], row1.teamname.iloc[0]]
        row3['next_opp_elo'] = [row2.new_elo.iloc[0], row1.new_elo.iloc[0]]
        team_list.append(row3)
    temp = pd.concat(team_list)
    temp = temp[['teamname', 'elo', 'opponent', 'next_opp_elo', 'odds']]
    temp['implied_odds'] = temp.odds.apply(win_prob)
    temp['elo_odds'] = temp.apply(lambda row: win_percent(row['elo'], row['next_opp_elo']), axis=1)
    temp['series_odds'] = temp.apply(lambda row: series_3(row['elo_odds']), axis=1)
    temp['odds_diff'] = temp.series_odds - temp.implied_odds
    temp.sort_values('odds_diff', ascending=False, inplace=True)
    return temp.reset_index(drop=True)

In [171]:
#Read in df and create 'current_elo' df
df = pd.read_csv('final.csv',index_col=0) 
raw = pd.read_csv('raw.csv',index_col=0)

#'current_elo' contains every team and their latest elo
current_elo = df[~df.teamname.duplicated(keep='last')]\
[['teamname','league','opp_name','old_elo','opp_elo','new_elo']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [172]:
get_team(df,"FunPlus Phoenix")

Unnamed: 0,teamname,league,split,date,blue_side,gamelength,game,result,teamkills,teamdeaths,firstblood,dragons,barons,opp_barons,towers,opp_towers,inhibitors,opp_inhibitors,damagetochampions,damagetakenperminute,wardsplaced,wardskilled,controlwardsbought,totalgold,gspd,old_elo,new_elo,opp_elo,opp_name
19830,FunPlus Phoenix,LPL,Summer,2023-05-29 13:08:22,1,1474,2.0,1,5,4,0.0,0.0,1.0,0.0,8.0,2.0,1.0,0.0,40663.0,2593.1479,79.0,30.0,31.0,44565,0.030699,1039.0,1055.0,1051.0,ThunderTalk Gaming
19828,FunPlus Phoenix,LPL,Summer,2023-05-29 12:20:29,1,1717,1.0,1,20,5,1.0,3.0,1.0,0.0,7.0,3.0,1.0,0.0,71318.0,2858.1596,81.0,42.0,37.0,56630,0.108616,1021.0,1039.0,1069.0,ThunderTalk Gaming
18979,FunPlus Phoenix,LPL,Spring,2023-03-23 13:00:33,0,2304,2.0,0,14,21,0.0,1.0,1.0,0.0,3.0,8.0,0.0,1.0,86278.0,3757.4740,117.0,61.0,47.0,66084,-0.043261,1032.0,1021.0,1164.0,Top Esports
18975,FunPlus Phoenix,LPL,Spring,2023-03-23 12:07:03,1,2033,1.0,0,6,16,0.0,1.0,0.0,1.0,2.0,9.0,0.0,1.0,47774.0,2887.4373,116.0,48.0,40.0,55100,-0.111945,1044.0,1032.0,1154.0,Top Esports
18816,FunPlus Phoenix,LPL,Spring,2023-03-19 11:04:43,1,1652,3.0,1,20,4,1.0,3.0,0.0,1.0,10.0,2.0,2.0,0.0,51322.0,2271.9734,84.0,36.0,33.0,55614,0.174932,1019.0,1044.0,1271.0,EDward Gaming
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,FunPlus Phoenix,LPL,Spring,2021-01-16 13:03:57,0,2090,3.0,0,8,10,1.0,2.0,0.0,1.0,3.0,8.0,0.0,1.0,46143.0,2614.3062,127.0,42.0,52.0,57924,-0.029732,1226.0,1210.0,1226.0,EDward Gaming
116,FunPlus Phoenix,LPL,Spring,2021-01-16 12:09:11,1,2132,2.0,1,17,10,0.0,3.0,1.0,1.0,8.0,3.0,1.0,0.0,87172.0,2587.2326,145.0,45.0,63.0,65475,0.007791,1210.0,1226.0,1227.0,EDward Gaming
112,FunPlus Phoenix,LPL,Spring,2021-01-16 11:15:50,0,2041,1.0,0,2,11,0.0,1.0,1.0,0.0,3.0,7.0,0.0,1.0,54330.0,2225.0857,134.0,67.0,56.0,54841,-0.020819,1227.0,1210.0,1212.0,EDward Gaming
28,FunPlus Phoenix,LPL,Spring,2021-01-12 10:00:34,1,1850,2.0,1,19,11,1.0,4.0,1.0,0.0,10.0,2.0,2.0,0.0,72230.0,2912.5297,107.0,56.0,44.0,63997,0.176049,1214.0,1227.0,1167.0,Oh My God


In [181]:
get_league("SL")

Unnamed: 0,teamname,league,opp_name,old_elo,opp_elo,new_elo
19846,Movistar Riders,SL,Los Heretics,1331.0,1260.0,1311.0
19847,Los Heretics,SL,Movistar Riders,1260.0,1331.0,1279.0
19842,Rebels Gaming,SL,UCAM Tokiers,1233.0,1079.0,1242.0
19851,Fnatic TQ,SL,Giants,1259.0,1207.0,1240.0
6503,Cream Real Betis,SL,Giants,1247.0,1205.0,1229.0
19850,Giants,SL,Fnatic TQ,1207.0,1259.0,1225.0
7229,Astralis SB,SL,eMonkeyz,1214.0,988.0,1220.0
19838,Barça eSports,SL,Guasones,1204.0,1055.0,1213.0
14346,G2 Arctic,SL,KOI,1204.0,1208.0,1188.0
19835,Finetwork KOI,SL,BISONS ECLUB,1138.0,1148.0,1154.0


In [182]:
#Input home and away teams with their odds and get back a df with the difference between betting odds and elo odds
home = ['Guasones','BISONS ECLUB','UCAM Tokiers','Barça eSports']
away = ['Finetwork KOI','Rebels Gaming','Giants','Los Heretics']
odds = [(200,-275),(333,-500),(175,-250),(300,-450)]
single = single_game_odds(current_elo,home,away,odds)
series_odds = [(-250,175),(-163,120),(333,-500),(500,-900),(175,-250)]
series = best_of_3_odds(current_elo,home,away,series_odds)

In [183]:
single

Unnamed: 0,teamname,elo,opponent,next_opp_elo,odds,implied_odds,elo_odds,odds_diff
0,Barça eSports,1213.0,Los Heretics,1279.0,300,0.25,0.406145,0.156145
1,BISONS ECLUB,1131.0,Rebels Gaming,1242.0,333,0.230947,0.34548,0.114533
2,Guasones,1045.0,Finetwork KOI,1154.0,200,0.333333,0.348088,0.014755
3,Giants,1225.0,UCAM Tokiers,1069.0,-250,0.714286,0.71054,-0.003746
4,UCAM Tokiers,1069.0,Giants,1225.0,175,0.363636,0.28946,-0.074176
5,Finetwork KOI,1154.0,Guasones,1045.0,-275,0.733333,0.651912,-0.081421
6,Rebels Gaming,1242.0,BISONS ECLUB,1131.0,-500,0.833333,0.65452,-0.178813
7,Los Heretics,1279.0,Barça eSports,1213.0,-450,0.818182,0.593855,-0.224326


In [177]:
series

Unnamed: 0,teamname,elo,opponent,next_opp_elo,odds,implied_odds,elo_odds,series_odds,odds_diff
0,Grypciocraft Esports,1287.0,exeed,1114.0,-163,0.619772,0.730245,0.820956,0.201184
1,Alior Bank Team,1165.0,Team ESCA Gaming,1046.0,-250,0.714286,0.664858,0.738326,0.02404
2,Forsaken,1118.0,Orbit Anonymo,1171.0,175,0.363636,0.424313,0.387337,0.0237
3,Illuminar Gaming,1205.0,Komil&amp;Friends,1022.0,-500,0.833333,0.741434,0.834004,0.000671
4,Zero Tenacity,1387.0,Iron Wolves,1145.0,-900,0.9,0.801081,0.897036,-0.002964
5,Iron Wolves,1145.0,Zero Tenacity,1387.0,500,0.166667,0.198919,0.102964,-0.063702
6,Komil&amp;Friends,1022.0,Illuminar Gaming,1205.0,333,0.230947,0.258566,0.165996,-0.064951
7,Orbit Anonymo,1171.0,Forsaken,1118.0,-250,0.714286,0.575687,0.612663,-0.101622
8,Team ESCA Gaming,1046.0,Alior Bank Team,1165.0,175,0.363636,0.335142,0.261674,-0.101962
9,exeed,1114.0,Grypciocraft Esports,1287.0,120,0.454545,0.269755,0.179044,-0.275502


In [None]:
get_league('LCS')

In [None]:
home =['Cloud9','Team Liquid','Immortals','FlyQuest']
away = ['Golden Guardians','TSM','Dignitas','100 Thieves']
odds = [(-225,162),(-250,175),(110,-150),(-225,162)]
lcs_series = best_of_3_odds(current_elo,home,away,odds)

In [None]:
lcs_series

In [None]:
df[df.teamname=='Rare Atom'][['teamname','opp_name','date','result']].tail(20)


print('sports_book: ', round(win_prob(-110), 2)) #based on betting odds
print('elo_odds: ', round(win_percent(1073, 980), 2)) #based on elo

# Open html file and parse with BeautifulSoup


In [223]:
file_path = 'test.html'  # Specify the path to the HTML file in the local directory

# Read the HTML content from the file
with open(file_path, 'r', encoding='utf-8') as f:
    html = f.read()
    
soup = BeautifulSoup(html,'html.parser')

In [228]:
# Get list of teams
team_html = soup.select('div.ses-ParticipantFixtureDetailsEsports_Team')
team_list = [i.text for i in team_html]

# Get list of odds for each team
span_element = soup.find_all('span', class_='sgl-ParticipantOddsOnly80_Odds')
odds_list = [int(i.text) for i in span_element]

In [231]:
pd.DataFrame(team_list,odds_list)

Unnamed: 0,0
350,Iron Wolves
200,Zero Tenacity
-138,Team ESCA Gaming
100,Alior Bank Team
120,Team ESCA Gaming
...,...
-300,MAD Lions
150,Fnatic
-334,G2 Esports
-400,Team Vitality


# Don't go past here yet

In [None]:
numerical = train.select_dtypes(['int','float']).columns

In [None]:
def create_target(groupby):
    groupby['target']=groupby['result'].shift(-1)
    return groupby

In [None]:
def add_target(df):
    df = df.groupby('teamname').apply(create_target)
    df.loc[pd.isnull(df.target),'target'] =2
    df.target = df.target.astype(int,errors='ignore')
    return df

In [None]:
df = add_target(df)

In [None]:
from sklearn.preprocessing import MinMaxScaler #scale all numerical columns

removed_columns = ['teamname','league','date','target','opp_name']
selected_columns = df.columns[~df.columns.isin(removed_columns)]

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])


In [None]:
# Create rolling averages for columns, concat as new columns to df


def rolling(team):
    rolling = team.rolling(10).mean()
    return rolling

def add_rolling(df):
    cols = ['gamelength','teamkills','teamdeaths','firstblood','dragons','barons','opp_barons','towers','opp_towers',\
       'inhibitors','opp_inhibitors','damagetochampions','damagetakenperminute','wardsplaced','wardskilled',\
       'controlwardsbought','totalgold','gspd']

    df_rolling=df[list(cols)+['teamname']]
    
    
    df_rolling = df_rolling.groupby('teamname',group_keys=False)[cols].apply(rolling)

    rolling_cols = [f'{col}_rolling' for col in df_rolling.columns]
    df_rolling.columns = rolling_cols
    df = pd.concat([df,df_rolling],axis=1)
    return df.dropna()

In [None]:
def next_opp(team):
    team['next_opp'] = team['opp_name'].shift(-1)
    return team
def add_opp(df):
    df = df.groupby('teamname').apply(next_opp)
    df.loc[df.next_opp.isnull(),'next_opp'] = 2
    return df

In [None]:
add_opp(df)

In [None]:
def next_side(team):
    team['next_blue'] = team['blue_side'].shift(-1)
    return team

def add_next_side(df):
    df = df.groupby('teamname').apply(next_side)
    df.loc[df.next_blue.isnull(),'next_blue']=2
    df.next_blue = df.next_blue.astype(int,errors='ignore')
    return df

In [None]:
def next_date(team):
    team['next_date'] = team['date'].shift(-1)
    return team

def add_next_date(df):
    df = df.groupby('teamname').apply(next_date)
    df.loc[df.next_date.isnull(),'next_date']=2
    return df

In [None]:
full = df.merge(df[rolling_cols + ["next_opp", "next_date", "teamname"]], left_on=["teamname", "next_date"], \
                right_on=["next_opp", "next_date"])

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier

def create_objects():
    rr = RidgeClassifier(solver ='sag',normalize=False,)
    split = TimeSeriesSplit(n_splits=3)
    sfs = SequentialFeatureSelector(rr, n_features_to_select=14,direction='backward',cv=split,n_jobs=-1)

create_objects()

In [None]:
removed_columns = list(full.columns[full.dtypes=='object']) + removed_columns
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [None]:
sfs.fit(full[selected_columns],full['target'])

In [None]:
selectors = selected_columns[sfs.get_support()]

In [None]:
predictions = backtest(full,rr,selectors,'target')

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(predictions.actual,predictions.prediction)

.5665 'forward', rr

In [None]:
def near_split(x, num_bins): #Split my df into equal splits to perform backtesting
    quotient, remainder = divmod(x, num_bins)
    bins = [quotient + 1] * remainder + [quotient] * (num_bins - remainder)
    count = 0
    new_list = []
    for b in bins:
        count += b
        new_list.append(count)
    return new_list

splits = near_split(df.shape[0],5)
last_split = splits[4]-splits[3] #Difference between last two values for final 'test' set

In [None]:
def backtest(data,model,predictors,target):
    all_predictions= []
    
    for i in range(0,len(splits)-1):
        train = data.loc[:splits[i]]
        test = data.loc[splits[i]:splits[i]+last_split]
        
        model.fit(train[predictors],train[target])
        preds = model.predict(test[predictors])
        preds = pd.Series(preds,index=test.index)
        combined = pd.concat([test[target],preds],axis=1)
        combined.columns = ['actual','prediction']
        
        all_predictions.append(combined)
        
    return pd.concat(all_predictions)
        
        
        
        


In [None]:
predictions = backtest(full,rr,selectors,'target')

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(predictions.actual,predictions.prediction)

### optimize ridge regression

In [None]:
from sklearn import decomposition
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score

In [None]:
X = full[selectors]
y = full['target']

In [None]:
pca = decomposition.PCA()
ridge = linear_model.Ridge()

In [None]:
pipe = Pipeline(steps=[("pca", pca),
                        ("ridge", ridge)])

In [None]:
n_components = list(range(1,X.shape[1]+1,1))
normalize = [True, False]
solver = ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
parameters = dict(pca__n_components=n_components,
                      ridge__normalize=normalize,
                      ridge__solver=solver)

In [None]:
clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(X, y)

In [None]:
clf_GS.best_params_

In [None]:
print("Best Number Of Components:", clf_GS.best_estimator_.get_params()["pca__n_components"])
print(); print(clf_GS.best_estimator_.get_params()["ridge"])

