In [135]:
import pandas as pd
import numpy as np
import os
import requests

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)

In [136]:
# Downloads the updated csv for the 2023 season
import gdown

def update_df():
    '''
    Download most recent csv file, concat with rest of data, return the full df
    '''
    output = "lol_2023.csv" #What to save the downloaded file as
    id = "1XXk2LO0CsNADBB1LRGOV5rUpyZdEZ8s2" #The id from the google drive file
    gdown.download(id=id, output=output, quiet=False)
    
    df_2021 = pd.read_csv('lol_2021.csv')
    df_2022 = pd.read_csv('lol_2022.csv')
    df_2023 = pd.read_csv('lol_2023.csv')
    df = pd.concat([df_2021,df_2022,df_2023])
    return df

def get_wiki():
    '''
    Returns chart from wikipedia containing info for Tier 1 and Tier 2 leagues
    Returns tier1, tier2
    '''
    wiki = pd.read_html('https://en.wikipedia.org/wiki/List_of_League_of_Legends_leagues_and_tournaments')
    return wiki[1], wiki[3]

def add_opp_name(df): #tup = list of tuples
    # Create an 'opp_name' column for each row
    evens = range(0,df.shape[0],2)
    odds = range(1,df.shape[0],2)
    tup = [(a,b) for a,b in zip(evens,odds)] # list of tuples
    
    for t in tup: #iterate through list of tuples
        a,b= t #unpack each tuple into two values
        df.loc[a,'opp_name']=df.teamname.loc[b] #create new column w/opp_name
        df.loc[b,'opp_name']=df.teamname.loc[a]
    return df

def win_percent(elo_a,elo_b): #Calculate odds to win based off of elo rankings
    return 1/(1+10**((elo_b-elo_a)/400)) #elo_a is who you're calculating for, elo_b is opponent

def win_prob(x): # x is the American odds (-110,110,etc.) Calculates probability of winning
    if x < 0 :
        x = x*-1
        return x / (x + 100)
    else: 
        return 100 / (x + 100)

def gain_elo(elo,opp_elo,k=32): #Gain elo after a win, k=24, expected = 1
    return int(elo+k*(1-win_percent(elo,opp_elo)))

def lose_elo(elo,opp_elo,k=32): #Lose elo after a loss, k=24, expected = 0
    return int(elo+k*(0-win_percent(elo,opp_elo)))

def tie_elo(elo,opp_elo,k=32): #Lose elo after a loss, k=24, expected = .5
    return int(elo+k*(.5-win_percent(elo,opp_elo)))


def wrangle_df(df):
    leagues = ['LCK','LPL','LEC','LCS','PCS','VCS','CBLOL','LJL','LLA'] # These are my 9 tier 1 leagues that I'll keep
    
    df = df[df.league.isin(leagues)] #Filter out non Tier-1 leagues
    df = df[df.position=='team'] #Remove individual player stats
    
    df.split = df.split.str.replace('Split 1','Spring').str.replace('Split 2','Summer') #Rename 'split' names
    df.split = df.split.str.replace('Opening','Spring').str.replace('Closing','Summer')

    cols = ['teamname','league','split','date', 'side', 'gamelength','game', 'result', 'teamkills', 
            'teamdeaths', 'firstblood', 'position', 'dragons', 'barons', 'opp_barons','towers', 'opp_towers', 
            'inhibitors', 'opp_inhibitors', 'damagetochampions', 'damagetakenperminute', 'wardsplaced', 'wardskilled', 
            'controlwardsbought', 'totalgold', 'gspd'] #Columns to keep

    df = df[cols] #Remove unwanted columns
    df = df.dropna() #Drop nan values
    
    df.date = pd.to_datetime(df.date,infer_datetime_format=True) #Change to datetime object
    del df['position'] # Delete 'position' column
    df = df.sort_values('date') #Sort by date
    df = df.reset_index(drop=True) #Reset index
    df.side = np.where(df.side=='Blue',1,0) #Add 'side' column for 'blue' or 'red'
    df.rename(columns={'side':'blue_side'},inplace = True) #Change 'side' to 'blue_side'
    
    df['old_elo']=np.NaN #create new elo column
    df['new_elo']=np.NaN
    df['opp_elo']=np.NaN
    df.loc[df[~df.teamname.duplicated()].index,'old_elo']=1200 #set elo for first game to 1200 for each team

    df = add_opp_name(df) #adds opponents' name
    
    for i in range(0,df.shape[0]):
        opp_name = df.loc[i,'opp_name']
        df.loc[i,'opp_elo'] = df[(df.teamname==opp_name)&(~df.old_elo.isna())]['old_elo'].iloc[-1]
        if df.loc[i,'result'] == 1:
            df.loc[i,'new_elo'] = gain_elo(df.loc[i,'old_elo'],df.loc[i,'opp_elo'])
        else:
            df.loc[i,'new_elo'] = lose_elo(df.loc[i,'old_elo'],df.loc[i,'opp_elo'])

        team_name = df.loc[i,'teamname']
        try:
            next_game_index = df[(df.teamname==team_name)&df.old_elo.isna()]['old_elo'].index[0]
            df.loc[next_game_index,'old_elo'] = df[(df.teamname==team_name)&(~df.new_elo.isna())]['new_elo'].iloc[-1]
        except:
            pass
    
    df.to_csv('final.csv') #Save to csv file
    
    return df

    

# Update and wrangle professional games for Spring and Summer splits

In [137]:
df = update_df()
df.to_csv('raw.csv')
df = wrangle_df(df)

Downloading...
From: https://drive.google.com/uc?id=1XXk2LO0CsNADBB1LRGOV5rUpyZdEZ8s2
To: /Users/thegootch/codeup-data-science/league/lol_2023.csv
100%|██████████████████████████████████████| 34.0M/34.0M [00:00<00:00, 35.7MB/s]
  df = update_df()


In [138]:
df.tail()

Unnamed: 0,teamname,league,split,date,blue_side,gamelength,game,result,teamkills,teamdeaths,firstblood,dragons,barons,opp_barons,towers,opp_towers,inhibitors,opp_inhibitors,damagetochampions,damagetakenperminute,wardsplaced,wardskilled,controlwardsbought,totalgold,gspd,old_elo,new_elo,opp_elo,opp_name
14823,Team BDS,LEC,Spring,2023-04-23 18:05:35,0,1761,3.0,0,5,17,1.0,2.0,0.0,1.0,3.0,9.0,0.0,1.0,46484.0,2835.0937,83.0,36.0,28.0,46212,-0.166147,1280.0,1261.0,1222.0,MAD Lions
14824,Team BDS,LEC,Spring,2023-04-23 18:55:24,1,1431,4.0,0,5,15,0.0,0.0,0.0,1.0,1.0,11.0,0.0,4.0,24920.0,2670.3983,68.0,24.0,21.0,37484,-0.283815,1261.0,1243.0,1222.0,MAD Lions
14825,MAD Lions,LEC,Spring,2023-04-23 18:55:24,0,1431,4.0,1,15,5,1.0,3.0,1.0,0.0,11.0,1.0,4.0,0.0,48158.0,2158.4486,73.0,30.0,20.0,51477,0.283815,1222.0,1238.0,1243.0,Team BDS
14826,Team BDS,LEC,Spring,2023-04-23 19:40:17,1,1683,5.0,0,0,14,0.0,1.0,0.0,1.0,1.0,11.0,0.0,3.0,34005.0,2773.6898,76.0,29.0,31.0,42204,-0.240636,1243.0,1226.0,1238.0,MAD Lions
14827,MAD Lions,LEC,Spring,2023-04-23 19:40:17,0,1683,5.0,1,14,0,1.0,3.0,1.0,0.0,11.0,1.0,3.0,0.0,56143.0,2569.1622,80.0,39.0,36.0,55917,0.240636,1238.0,1254.0,1243.0,Team BDS


# Sports Betting Section

In [139]:
series_3(.6)

0.6479999999999999

In [143]:
df.head()

Unnamed: 0,teamname,league,split,date,blue_side,gamelength,game,result,teamkills,teamdeaths,firstblood,dragons,barons,opp_barons,towers,opp_towers,inhibitors,opp_inhibitors,damagetochampions,damagetakenperminute,wardsplaced,wardskilled,controlwardsbought,totalgold,gspd,old_elo,new_elo,opp_elo,opp_name
0,Top Esports,LPL,Spring,2021-01-09 09:22:04,1,2050,1.0,0,3,9,1.0,2.0,0.0,1.0,3.0,6.0,0.0,1.0,63315.0,2559.0146,101.0,83.0,40.0,54617,-0.000467,1200.0,1184.0,1200.0,Suning
1,Suning,LPL,Spring,2021-01-09 09:22:04,0,2050,1.0,1,9,3,0.0,2.0,1.0,0.0,6.0,3.0,1.0,0.0,58406.0,2940.439,161.0,61.0,46.0,60520,0.000467,1200.0,1215.0,1184.0,Top Esports
2,Top Esports,LPL,Spring,2021-01-09 10:16:20,1,2241,2.0,0,16,27,0.0,3.0,2.0,0.0,6.0,7.0,0.0,2.0,84516.0,3790.415,121.0,73.0,42.0,69692,-0.015474,1184.0,1169.0,1215.0,Suning
3,Suning,LPL,Spring,2021-01-09 10:16:20,0,2241,2.0,1,27,17,1.0,2.0,0.0,2.0,7.0,6.0,2.0,0.0,97504.0,3192.6372,155.0,55.0,49.0,70269,0.015474,1215.0,1228.0,1169.0,Top Esports
4,Oh My God,LPL,Spring,2021-01-09 11:30:25,1,1887,1.0,0,4,15,1.0,1.0,0.0,1.0,3.0,9.0,0.0,1.0,37351.0,2777.9332,104.0,47.0,32.0,51145,-0.028532,1200.0,1184.0,1200.0,EDward Gaming


In [146]:
def get_league(team): #Returns a league ("LCS,LPL,etc.") sorted by latest elo
    return current_elo[current_elo.league==team].sort_values('new_elo',ascending=False)

def get_team(df, team):
    return df[df.teamname==team].sort_values(by='date',ascending = False)

def single_game_odds(df, teams, opponents, bet_odds):
    """
    Returns a DataFrame with 'home' and 'away' teams with elos and implied odds.
    """
    team_list = []
    for team, opponent, odds in zip(teams, opponents, bet_odds):
        row1 = df[df.teamname==team][['teamname', 'new_elo']]
        row1['odds'] = odds[0]
        row2 = df[df.teamname==opponent][['teamname', 'new_elo']]
        row2['odds'] = odds[1]
        row3 = pd.concat([row1, row2])
        row3.columns = ['teamname', 'elo', 'odds']
        row3['opponent'] = [row2.teamname.iloc[0], row1.teamname.iloc[0]]
        row3['next_opp_elo'] = [row2.new_elo.iloc[0], row1.new_elo.iloc[0]]
        team_list.append(row3)
    temp = pd.concat(team_list)
    temp = temp[['teamname', 'elo', 'opponent', 'next_opp_elo', 'odds']]
    temp['implied_odds'] = temp.odds.apply(win_prob)
    temp['elo_odds'] = temp.apply(lambda row: win_percent(row['elo'], row['next_opp_elo']), axis=1)
    temp['odds_diff'] = temp.elo_odds - temp.implied_odds
    temp.sort_values('odds_diff', ascending=False, inplace=True)
    return temp.reset_index(drop=True)

import math

def series_3(probability):
    num_wins_required = 2
    num_games_required = (num_wins_required * 2) - 1

    # Calculate the probability of winning a single game
    p_win = probability

    # Calculate the probability of losing a single game
    p_loss = 1 - p_win

    # Calculate the odds of winning a best-of-3 series
    odds = 0

    for wins in range(num_wins_required, num_games_required + 1):
        # Calculate the number of combinations to achieve the current number of wins
        combinations = math.comb(num_games_required, wins)

        # Calculate the probability of achieving the current number of wins
        p_current_wins = p_win ** wins * p_loss ** (num_games_required - wins)

        odds += combinations * p_current_wins

    return odds


def series_5(probability):
    num_wins_required = 3
    num_games_required = (num_wins_required * 2) - 1

    # Calculate the probability of winning a single game
    p_win = probability

    # Calculate the probability of losing a single game
    p_loss = 1 - p_win

    # Calculate the odds of winning a best-of-5 series
    odds = 0

    for wins in range(num_wins_required, num_games_required + 1):
        # Calculate the number of combinations to achieve the current number of wins
        combinations = math.comb(num_games_required, wins)

        # Calculate the probability of achieving the current number of wins
        p_current_wins = p_win ** wins * p_loss ** (num_games_required - wins)

        odds += combinations * p_current_wins

    return odds


def best_of_3_odds(df, teams, opponents, bet_odds):
    """
    Returns a DataFrame with 'home' and 'away' teams with elos and implied odds.
    """
    team_list = []
    for team, opponent, odds in zip(teams, opponents, bet_odds):
        row1 = df[df.teamname==team][['teamname', 'new_elo']]
        row1['odds'] = odds[0]
        row2 = df[df.teamname==opponent][['teamname', 'new_elo']]
        row2['odds'] = odds[1]
        row3 = pd.concat([row1, row2])
        row3.columns = ['teamname', 'elo', 'odds']
        row3['opponent'] = [row2.teamname.iloc[0], row1.teamname.iloc[0]]
        row3['next_opp_elo'] = [row2.new_elo.iloc[0], row1.new_elo.iloc[0]]
        team_list.append(row3)
    temp = pd.concat(team_list)
    temp = temp[['teamname', 'elo', 'opponent', 'next_opp_elo', 'odds']]
    temp['implied_odds'] = temp.odds.apply(win_prob)
    temp['elo_odds'] = temp.apply(lambda row: win_percent(row['elo'], row['next_opp_elo']), axis=1)
    temp['series_odds'] = temp.apply(lambda row: series_3(row['elo_odds']), axis=1)
    temp['odds_diff'] = temp.series_odds - temp.implied_odds
    temp.sort_values('odds_diff', ascending=False, inplace=True)
    return temp.reset_index(drop=True)

In [141]:
#Read in df and create 'current_elo' df
df = pd.read_csv('final.csv',index_col=0) 
raw = pd.read_csv('raw.csv',index_col=0)

#'current_elo' contains every team and their latest elo
current_elo = df[~df.teamname.duplicated(keep='last')]\
[['teamname','league','opp_name','old_elo','opp_elo','new_elo']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [147]:
get_team(df,"Anyone's Legend")

Unnamed: 0,teamname,league,split,date,blue_side,gamelength,game,result,teamkills,teamdeaths,firstblood,dragons,barons,opp_barons,towers,opp_towers,inhibitors,opp_inhibitors,damagetochampions,damagetakenperminute,wardsplaced,wardskilled,controlwardsbought,totalgold,gspd,old_elo,new_elo,opp_elo,opp_name
14237,Anyone's Legend,LPL,Spring,2023-03-26 10:18:01,0,2522,2.0,0,14,21,1.0,2.0,1.0,2.0,4.0,10.0,0.0,3.0,85073.0,3354.3854,176.0,124.0,72.0,74395,-0.006745,1014.0,1001.0,1102.0,ThunderTalk Gaming
14228,Anyone's Legend,LPL,Spring,2023-03-26 09:14:17,1,2604,1.0,0,11,28,1.0,2.0,1.0,2.0,4.0,10.0,0.0,2.0,91113.0,3235.3687,148.0,98.0,53.0,75145,-0.061405,1028.0,1014.0,1077.0,ThunderTalk Gaming
13817,Anyone's Legend,LPL,Spring,2023-03-17 10:19:47,0,1827,2.0,1,16,7,1.0,3.0,0.0,1.0,6.0,6.0,1.0,0.0,66936.0,2836.1576,108.0,71.0,42.0,56015,-0.010890,1013.0,1028.0,998.0,FunPlus Phoenix
13810,Anyone's Legend,LPL,Spring,2023-03-17 09:22:01,1,2269,1.0,1,24,12,0.0,3.0,1.0,1.0,9.0,3.0,1.0,0.0,106179.0,2379.8766,157.0,66.0,58.0,73620,0.082333,996.0,1013.0,1032.0,FunPlus Phoenix
13745,Anyone's Legend,LPL,Spring,2023-03-15 12:57:40,0,1632,3.0,0,7,24,0.0,1.0,0.0,1.0,1.0,10.0,0.0,2.0,53504.0,3345.7721,99.0,34.0,33.0,43424,-0.221632,1003.0,996.0,1242.0,LNG Esports
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5825,Anyone's Legend,LPL,Spring,2022-01-14 10:16:19,0,2352,2.0,0,24,25,1.0,4.0,1.0,1.0,3.0,7.0,0.0,1.0,91459.0,3919.1327,145.0,63.0,58.0,71221,0.021907,1183.0,1166.0,1164.0,Oh My God
5820,Anyone's Legend,LPL,Spring,2022-01-14 09:23:25,1,1929,1.0,0,11,23,0.0,1.0,0.0,1.0,2.0,8.0,0.0,1.0,51211.0,2733.7481,88.0,54.0,41.0,52338,-0.124754,1203.0,1183.0,1129.0,Oh My God
5777,Anyone's Legend,LPL,Spring,2022-01-11 13:08:29,0,1846,3.0,1,18,8,1.0,4.0,1.0,0.0,7.0,4.0,1.0,0.0,50508.0,2593.5536,81.0,51.0,38.0,59331,-0.014797,1192.0,1203.0,1089.0,LGD Gaming
5775,Anyone's Legend,LPL,Spring,2022-01-11 12:19:37,1,1712,2.0,1,15,5,1.0,2.0,1.0,0.0,8.0,2.0,1.0,0.0,43662.0,2285.8879,84.0,38.0,35.0,54385,0.041760,1180.0,1192.0,1101.0,LGD Gaming


In [142]:
get_league("LPL")

Unnamed: 0,teamname,league,opp_name,old_elo,opp_elo,new_elo
14728,JD Gaming,LPL,Bilibili Gaming,1378.0,1260.0,1388.0
11294,Victory Five,LPL,LNG Esports,1278.0,1199.0,1258.0
14729,Bilibili Gaming,LPL,JD Gaming,1260.0,1378.0,1249.0
14701,EDward Gaming,LPL,Bilibili Gaming,1249.0,1258.0,1233.0
5022,Suning,LPL,LNG Esports,1239.0,1180.0,1220.0
14489,LNG Esports,LPL,Oh My God,1210.0,1222.0,1194.0
14592,Oh My God,LPL,Bilibili Gaming,1204.0,1231.0,1189.0
14481,Weibo Gaming,LPL,Bilibili Gaming,1174.0,1239.0,1160.0
14458,Top Esports,LPL,Oh My God,1168.0,1177.0,1152.0
14407,Royal Never Give Up,LPL,Bilibili Gaming,1117.0,1196.0,1104.0


In [None]:
#Input home and away teams with their odds and get back a df with the difference between betting odds and elo odds
home = ['Ultra Prime','ThunderTalk Gaming','LGD Gaming','Ninjas in Pyjamas','Royal Never Give Up','LNG Esports']
away = ["Anyone's Legend",'FunPlus Phoenix','Invictus Gaming','EDward Gaming','Rare Atom','Top Esports']
odds = [(-120,-120),(-200,150),(150,-200),(275,-400),(-163,120),(-163,120)]
single = single_game_odds(current_elo,home,away,odds)
odds = [(-120,-120),(-275,200),(200,-275),(450,-800),(-188,137),(-188,137)]
series = best_of_3_odds(current_elo,home,away,odds)

In [None]:
single

In [None]:
series

In [None]:
get_league('LCS')

In [None]:
home =['Cloud9','Team Liquid','Immortals','FlyQuest']
away = ['Golden Guardians','TSM','Dignitas','100 Thieves']
odds = [(-225,162),(-250,175),(110,-150),(-225,162)]
lcs_series = best_of_3_odds(current_elo,home,away,odds)

In [None]:
lcs_series

In [None]:
df[df.teamname=='Rare Atom'][['teamname','opp_name','date','result']].tail(20)


print('sports_book: ', round(win_prob(-110), 2)) #based on betting odds
print('elo_odds: ', round(win_percent(1073, 980), 2)) #based on elo

# API Test


In [134]:
import asyncio
from playwright.async_api import async_playwright


async def download_html_with_playwright():
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch()
        context = await browser.new_context()
        page = await context.new_page()

        # Navigate to the website
        await page.goto("https://www.co.bet365.com/")

        # Wait for the page to load completely
        await page.wait_for_load_state("networkidle")

        # Get the HTML content of the page
        html = await page.content()

        # Save the HTML to a file
        with open("bet365.html", "w", encoding="utf-8") as file:
            file.write(html)

        # Close the browser
        await context.close()
        await browser.close()

    print("HTML downloaded successfully!")


await download_html_with_playwright()

HTML downloaded successfully!


# Don't go past here yet

In [None]:
numerical = train.select_dtypes(['int','float']).columns

In [None]:
def create_target(groupby):
    groupby['target']=groupby['result'].shift(-1)
    return groupby

In [None]:
def add_target(df):
    df = df.groupby('teamname').apply(create_target)
    df.loc[pd.isnull(df.target),'target'] =2
    df.target = df.target.astype(int,errors='ignore')
    return df

In [None]:
df = add_target(df)

In [None]:
from sklearn.preprocessing import MinMaxScaler #scale all numerical columns

removed_columns = ['teamname','league','date','target','opp_name']
selected_columns = df.columns[~df.columns.isin(removed_columns)]

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])


In [None]:
# Create rolling averages for columns, concat as new columns to df


def rolling(team):
    rolling = team.rolling(10).mean()
    return rolling

def add_rolling(df):
    cols = ['gamelength','teamkills','teamdeaths','firstblood','dragons','barons','opp_barons','towers','opp_towers',\
       'inhibitors','opp_inhibitors','damagetochampions','damagetakenperminute','wardsplaced','wardskilled',\
       'controlwardsbought','totalgold','gspd']

    df_rolling=df[list(cols)+['teamname']]
    
    
    df_rolling = df_rolling.groupby('teamname',group_keys=False)[cols].apply(rolling)

    rolling_cols = [f'{col}_rolling' for col in df_rolling.columns]
    df_rolling.columns = rolling_cols
    df = pd.concat([df,df_rolling],axis=1)
    return df.dropna()

In [None]:
def next_opp(team):
    team['next_opp'] = team['opp_name'].shift(-1)
    return team
def add_opp(df):
    df = df.groupby('teamname').apply(next_opp)
    df.loc[df.next_opp.isnull(),'next_opp'] = 2
    return df

In [None]:
add_opp(df)

In [None]:
def next_side(team):
    team['next_blue'] = team['blue_side'].shift(-1)
    return team

def add_next_side(df):
    df = df.groupby('teamname').apply(next_side)
    df.loc[df.next_blue.isnull(),'next_blue']=2
    df.next_blue = df.next_blue.astype(int,errors='ignore')
    return df

In [None]:
def next_date(team):
    team['next_date'] = team['date'].shift(-1)
    return team

def add_next_date(df):
    df = df.groupby('teamname').apply(next_date)
    df.loc[df.next_date.isnull(),'next_date']=2
    return df

In [None]:
full = df.merge(df[rolling_cols + ["next_opp", "next_date", "teamname"]], left_on=["teamname", "next_date"], \
                right_on=["next_opp", "next_date"])

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier

def create_objects():
    rr = RidgeClassifier(solver ='sag',normalize=False,)
    split = TimeSeriesSplit(n_splits=3)
    sfs = SequentialFeatureSelector(rr, n_features_to_select=14,direction='backward',cv=split,n_jobs=-1)

create_objects()

In [None]:
removed_columns = list(full.columns[full.dtypes=='object']) + removed_columns
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [None]:
sfs.fit(full[selected_columns],full['target'])

In [None]:
selectors = selected_columns[sfs.get_support()]

In [None]:
predictions = backtest(full,rr,selectors,'target')

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(predictions.actual,predictions.prediction)

.5665 'forward', rr

In [None]:
def near_split(x, num_bins): #Split my df into equal splits to perform backtesting
    quotient, remainder = divmod(x, num_bins)
    bins = [quotient + 1] * remainder + [quotient] * (num_bins - remainder)
    count = 0
    new_list = []
    for b in bins:
        count += b
        new_list.append(count)
    return new_list

splits = near_split(df.shape[0],5)
last_split = splits[4]-splits[3] #Difference between last two values for final 'test' set

In [None]:
def backtest(data,model,predictors,target):
    all_predictions= []
    
    for i in range(0,len(splits)-1):
        train = data.loc[:splits[i]]
        test = data.loc[splits[i]:splits[i]+last_split]
        
        model.fit(train[predictors],train[target])
        preds = model.predict(test[predictors])
        preds = pd.Series(preds,index=test.index)
        combined = pd.concat([test[target],preds],axis=1)
        combined.columns = ['actual','prediction']
        
        all_predictions.append(combined)
        
    return pd.concat(all_predictions)
        
        
        
        


In [None]:
predictions = backtest(full,rr,selectors,'target')

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(predictions.actual,predictions.prediction)

### optimize ridge regression

In [None]:
from sklearn import decomposition
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score

In [None]:
X = full[selectors]
y = full['target']

In [None]:
pca = decomposition.PCA()
ridge = linear_model.Ridge()

In [None]:
pipe = Pipeline(steps=[("pca", pca),
                        ("ridge", ridge)])

In [None]:
n_components = list(range(1,X.shape[1]+1,1))
normalize = [True, False]
solver = ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
parameters = dict(pca__n_components=n_components,
                      ridge__normalize=normalize,
                      ridge__solver=solver)

In [None]:
clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(X, y)

In [None]:
clf_GS.best_params_

In [None]:
print("Best Number Of Components:", clf_GS.best_estimator_.get_params()["pca__n_components"])
print(); print(clf_GS.best_estimator_.get_params()["ridge"])

