In [1]:
import pandas as pd
import datetime as dt
import numpy as np

pd.options.display.max_rows = 200
pd.options.display.max_columns = 100
pd.options.mode.chained_assignment = None

from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer
from sklearn.utils.validation import check_array

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn import tree
from xgboost import XGBClassifier, XGBRegressor
from itertools import chain
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import l1_min_c    


The function cleans the bets data; returns a reliable bettor list.

In [3]:
def find_bookies_to_keep(start_date, end_date, ratio):
    bets = pd.read_csv("data/bets.zip")
    matches = pd.read_csv("data/matches.zip")

    # Converting epoch column to datetime
    matches['timestamp'] = matches['epoch'].apply(
        lambda x: dt.datetime.fromtimestamp(x))
    bets['timestamp'] = bets['odd_epoch'].apply(
        lambda x: dt.datetime.fromtimestamp(x))

    matches = matches[(matches['timestamp'] > start_date) &
                      (matches['timestamp'] < end_date) &
                      (matches['league_id'] == 148)]

    matches = matches.dropna(
        subset=['match_status', 'match_hometeam_score',
                'match_awayteam_score'])

    match_ids = list(matches.match_id)
    bets = bets[bets['match_id'].isin(match_ids)]
    bets = bets[bets['value'] > 1]
    bets = bets[bets['variable'].isin(['odd_1', 'odd_x', 'odd_2'])]

    bets = bets.pivot_table(index=['match_id', 'odd_bookmakers', 'timestamp'],
                            columns='variable',
                            values='value').reset_index()
    bets = bets[['match_id', 'odd_bookmakers',
                 'odd_1', 'odd_x', 'odd_2', 'timestamp']].dropna()

    final_bets = bets.groupby(['match_id', 'odd_bookmakers'],
                              as_index=False).last()

    bookies = final_bets.groupby('odd_bookmakers').count()[['match_id']].reset_index()
    bookies['total_matches'] = final_bets.match_id.nunique()
    bookies['ratio'] = bookies['match_id'] / bookies['total_matches']
    bookies.sort_values('ratio', ascending=False, inplace=True)
    bookies.reset_index(drop=True, inplace=True)
    bookies_to_keep = bookies[bookies['ratio'] > ratio]
    return list(bookies_to_keep.odd_bookmakers)

In [4]:
def week_converter(timestamp):
    """
    year is 2019 for dates between 2019-07 and 2020-06, 
    22nd week just random splitter, 
    there might be better representation
    
    is_national is True for Friday, Saturday, Sunday, Monday 
    False otherwise
    """
    year, week, day = timestamp.isocalendar()
    season = year - 1 if week < 27 else year
    is_weekend = day >= 5 or day == 1  
    return [timestamp, season, year, week, is_weekend]


For the generation of bet related features, we first selected which bookmakers to use. We chose a starting point, and we looked for the overall odd availability levels for all bookmakers. If a bookmaker announced their odds at least %97.5 of the matches from the England Premiere league, we selected them. And after converting their bets to implied probabilities by normalizing them, we looked for the availability of odds within bookmakers. Most available bet types are 'odd_1', 'odd_x', 'odd_2', 'bts_yes', 'bts_no', 'o+1.5', 'u+1.5', 'o+2.5', 'u+2.5', 'o+3.5', 'u+3.5', 'o+4.5', 'u+4.5' and these are all related directly or indirectly with total goals scored. For all these categories we looked for their minimum, maximum, first, last and mean values (for each bookmakers), bigger implied probability (lower odds) represents higher expectation. So let's if first probability (and hence min probability) is low for a match, and then it increased (hence max and last values are high) this corresponds for something has changed in favor of that team, so might affect the total goals scored. This example is valid for all above bet types. If a match has very high mean probabilty of o+3.5, this might reveal the high probability of having more total scores than usual. And even selecting the most "available" bookmaker and their mostly announced bets, there might be some NA values, we replaced them with the mean values for that same type of bet from the remaining bookmakers. If all bookmakers skip that bet type for a match, we simply remove those matches from our history.

In [5]:
def generate_bet_features(bookies_to_keep, na_ratio=0.15):
    bets = pd.read_csv("data/bets.zip")
    bets = bets[bets["odd_bookmakers"].isin(bookies_to_keep)]

    bets['timestamp'] = bets['odd_epoch'].apply(
        lambda x: dt.datetime.fromtimestamp(x))

    bets = bets[bets['value'] > 1]
    bet_groups = [['odd_1', 'odd_x', 'odd_2'],
                    ['bts_yes', 'bts_no'],
                    ['o+1.5', 'u+1.5'],
                    ['o+2.5', 'u+2.5'],
                    ['o+3.5', 'u+3.5'],
                    ['o+4.5', 'u+4.5']]

    bets = bets[bets['variable'].isin(list(chain.from_iterable(bet_groups)))]

    bets = bets.pivot_table(index=['match_id', 'odd_bookmakers', 'timestamp'],
                            columns='variable',
                            values='value').reset_index()

    for bet_type in bet_groups:
        bets[bet_type] = bets[bet_type].rdiv(1)
        bets[bet_type] = bets[bet_type].div(bets[bet_type].sum(axis=1),
                                            axis=0)

    bets = bets.sort_values(
        ['timestamp', 'match_id', 'odd_bookmakers']).reset_index(drop=True)

    standart_bets = bet_groups.pop(0)
    new_bets = list(chain.from_iterable(bet_groups))
    # the reason i did in this way, we may want to use different stats for
    # odd1x2 types and over under types
    bets_features = bets.groupby(['match_id', 'odd_bookmakers']).agg({
        **{i: ['min', 'max', 'first', 'last', 'mean'] 
            for i in standart_bets},
        **{i: ['min', 'max', 'first', 'last', 'mean'] 
            for i in new_bets}})

    bets_features.columns = bets_features.columns.map('{0[0]}_{0[1]}'.format)
    # bets_features.fillna(
    #     value={i: 0 for i in bets_features.columns if i.endswith('var')},
    #     inplace=True)    
    mean_bets_features = bets_features.groupby('match_id').mean()

    bets_features_pivoted = bets_features.pivot_table(
        index=["match_id"],
        columns= ["odd_bookmakers"])

    bets_features_pivoted.columns = bets_features_pivoted.columns.map('{0[1]}_{0[0]}'.format)

    na_cols = bets_features_pivoted.isnull().sum()
    keep = na_cols < (na_ratio * len(bets_features_pivoted))
    cols_to_keep = keep[keep == True].index.values.tolist()
    bets_features_pivoted = bets_features_pivoted[cols_to_keep]

    for cols in bets_features_pivoted:
        mean_col = '_'.join(cols.split('_')[1:])
        bets_features_pivoted[cols] = bets_features_pivoted[cols].combine_first(
            mean_bets_features[mean_col])

    return bets_features_pivoted.dropna().reset_index()

This function generates features about matches. We are getting the historic and rolling (last 1/5 matches) performances (using mean) for the teams. How many goals on average the team concedes/scores is an important feature. We are also finding performance based on points and the performance difference between the teams.

We are also checking on average how many matches of a team are under 2.5 in different dimensions. We also have features based on home/away. We are looking at the performance of home team when they are playing at home (the features ending with _pos).

exp_goal5 and exp_goal1 might be used as naive predictors. First one looks at the expected goals in a match based on the last 5 match of the teams.

point5diff gives us the performance difference between the teams based on last 5 matches.

We also have clean sheet and scoring rates per team. It is clearly related to scoring. For the project we already worked on win - draw - lose ratios; so we put them on too. It is not directly related to scoring but they might end up being useful.


In [6]:
def generate_match_features():
    matches = pd.read_csv("data/matches.zip")
    matches['timestamp'] = matches['epoch'].apply(
        lambda x: dt.datetime.fromtimestamp(x))
    matches[['date', 'season', 'year', 'week', 'is_weekend']] = \
        pd.DataFrame(matches.timestamp.apply(week_converter).values.tolist(), 
                        index=matches.index)
    matches = matches.sort_values("date")

    away_side = matches[["match_awayteam_id", "match_awayteam_name", 
                            "match_id", "season", "date", 
                            "match_awayteam_score", "match_hometeam_score"]]
    home_side = matches[["match_hometeam_id", "match_hometeam_name", 
                            "match_id", "season", "date", 
                            "match_hometeam_score", "match_awayteam_score"]]
    away_side["HomeAway"] = "Away"
    home_side["HomeAway"] = "Home"
    away_side.columns = ['team_id', 'team_name', "match_id", 
                            "season", "date", "scored", "conceded", "home_away"]
    home_side.columns = ['team_id', 'team_name', "match_id", 
                            "season", "date", "scored", "conceded", "home_away"]
    team_match = pd.concat([away_side, home_side])
    team_match = team_match.sort_values("date").reset_index(drop=True)

    team_match['point'] = np.nan
    team_match.loc[team_match['scored'] > team_match['conceded'], 'point'] = 3
    team_match.loc[team_match['scored'] == team_match['conceded'], 'point'] = 1
    team_match.loc[team_match['scored'] < team_match['conceded'], 'point'] = 0

    team_match['won'] = 0
    team_match.loc[team_match['point'] == 3, 'won'] = 1
    team_match['draw'] = 0
    team_match.loc[team_match['point'] == 1, 'draw'] = 1
    team_match['lost'] = 0
    team_match.loc[team_match['point'] == 0, 'lost'] = 1
    team_match['clean_sheet'] = 0
    team_match.loc[team_match['conceded'] == 0, 'clean_sheet'] = 1
    team_match['has_scored'] = 0
    team_match.loc[team_match['scored'] > 0, 'has_scored'] = 0
    team_match['over25'] = 0
    team_match.loc[team_match['scored'] + team_match['conceded'] > 2, 'over25'] = 1
    team_match['under25'] = 0
    team_match.loc[team_match['scored'] + team_match['conceded'] < 3, 'under25'] = 1

    roll1 = lambda x: x.rolling(1).mean().shift()
    roll5 = lambda x: x.rolling(5, min_periods = 1).mean().shift()
    historic = lambda x: x.expanding().mean().shift()

    team_match["point1"] = team_match.groupby(
        ["season", "team_id"]).point.apply(roll1).reset_index(0,drop=True)
    team_match["goal_scored1"] = team_match.groupby(
        ["season", "team_id"]).scored.apply(roll1).reset_index(0,drop=True)
    team_match["goal_conceded1"] = team_match.groupby(
        ["season", "team_id"]).conceded.apply(roll1).reset_index(0,drop=True)
    team_match["total_goals1"] = team_match["goal_conceded1"] + team_match["goal_scored1"]

    team_match["point5"] = team_match.groupby(
        ["season", "team_id"]).point.apply(roll5).reset_index(0,drop=True)
    team_match["goal_scored5"] = team_match.groupby(
        ["season", "team_id"]).scored.apply(roll5).reset_index(0,drop=True)
    team_match["goal_conceded5"] = team_match.groupby(
        ["season", "team_id"]).conceded.apply(roll5).reset_index(0,drop=True)
    team_match["clean_sheet5"] = team_match.groupby(
        ["season", "team_id"]).clean_sheet.apply(roll5).reset_index(0,drop=True)
    team_match["over25_ratio5"] = team_match.groupby(
        ["season", "team_id"]).over25.apply(roll5).reset_index(0,drop=True)
    team_match["under25_ratio5"] = team_match.groupby(
        ["season", "team_id"]).under25.apply(roll5).reset_index(0,drop=True)
    team_match["total_goals5"] = team_match["goal_conceded5"] + team_match["goal_scored5"]

    team_match["point1_pos"] = team_match.groupby(
        ["season", "team_id", "home_away"]).point.apply(roll1).reset_index(0,drop=True)
    team_match["goal_scored1_pos"] = team_match.groupby(
        ["season", "team_id", "home_away"]).scored.apply(roll1).reset_index(0,drop=True)
    team_match["goal_conceded1_pos"] = team_match.groupby(
        ["season", "team_id", "home_away"]).conceded.apply(roll1).reset_index(0,drop=True)

    team_match["performance_season"] = team_match.groupby(
        ["season", "team_id"]).point.apply(historic).reset_index(0,drop=True)

    team_match["draw_ratio"] = team_match.groupby(
        ["team_id"]).draw.apply(historic).reset_index(0,drop=True)
    team_match["win_ratio"] = team_match.groupby(
        ["team_id"]).won.apply(historic).reset_index(0,drop=True)
    team_match["lost_ratio"] = team_match.groupby(
        ["team_id"]).lost.apply(historic).reset_index(0,drop=True)
    team_match["over25_ratio"] = team_match.groupby(
        ["season", "team_id"]).over25.apply(historic).reset_index(0,drop=True)
    team_match["under25_ratio"] = team_match.groupby(
        ["season", "team_id"]).under25.apply(historic).reset_index(0,drop=True)

    team_match["draw_ratio_season"] = team_match.groupby(
        ["season", "team_id"]).draw.apply(historic).reset_index(0,drop=True)
    team_match["win_ratio_season"] = team_match.groupby(
        ["season", "team_id"]).won.apply(historic).reset_index(0,drop=True)
    team_match["lost_ratio_season"] = team_match.groupby(
        ["season", "team_id"]).lost.apply(historic).reset_index(0,drop=True)
    team_match["over25_ratio_season"] = team_match.groupby(
        ["season", "team_id"]).over25.apply(historic).reset_index(0,drop=True)
    team_match["under25_ratio_season"] = team_match.groupby(
        ["season", "team_id"]).under25.apply(historic).reset_index(0,drop=True)

    team_match["draw_ratio_pos"] = team_match.groupby(
        ["team_id", "home_away"]).draw.apply(historic).reset_index(0,drop=True)
    team_match["win_ratio_pos"] = team_match.groupby(
        ["team_id", "home_away"]).won.apply(historic).reset_index(0,drop=True)
    team_match["lost_ratio_pos"] = team_match.groupby(
        ["team_id", "home_away"]).lost.apply(historic).reset_index(0,drop=True)

    team_match["draw_ratio_season_pos"] = team_match.groupby(
        ["season", "team_id", "home_away"]).draw.apply(historic).reset_index(0,drop=True)
    team_match["win_ratio_season_pos"] = team_match.groupby(
        ["season", "team_id", "home_away"]).won.apply(historic).reset_index(0,drop=True)
    team_match["lost_ratio_season_pos"] = team_match.groupby(
        ["season", "team_id", "home_away"]).lost.apply(historic).reset_index(0,drop=True)
    team_match["over25_ratio_season_pos"] = team_match.groupby(
        ["season", "team_id"]).over25.apply(historic).reset_index(0,drop=True)
    team_match["under25_ratio_season_pos"] = team_match.groupby(
        ["season", "team_id"]).under25.apply(historic).reset_index(0,drop=True)

    match_id_pos = team_match.columns.get_loc("match_id")
    point1_pos = team_match.columns.get_loc("point1")
    len_cols = len(team_match.columns)
    
    cols = list(range(match_id_pos,match_id_pos+1)) + list(range(point1_pos, len_cols))
    home = team_match[team_match["home_away"] == 'Home'].iloc[:, cols]
    away = team_match[team_match["home_away"] == 'Away'].iloc[:, cols]
    team_stats = home.merge(away, on='match_id', how='inner', suffixes=('_home', '_away'))
    
    team_stats["point5_diff"] = team_stats["point5_home"] - team_stats["point5_away"]
    team_stats["point1_diff"] = team_stats["point1_home"] - team_stats["point1_away"]

    team_stats["performance_season_diff"] = team_stats["performance_season_home"] - team_stats["performance_season_away"]
    team_stats["exp_goal5"] = (team_stats["total_goals5_home"] + team_stats["total_goals5_away"])/2
    team_stats["exp_goal1"] = (team_stats["total_goals1_home"] + team_stats["total_goals1_away"])/2
    
    team_stats = team_stats.dropna()

    return team_stats

In [7]:
def data_prepare(bets_df, matches_df):
    final_df = bets_df.merge(matches_df, on='match_id').dropna()
    matches = pd.read_csv("data/matches.zip")
    common_cols = ["match_id", "match_hometeam_score", "match_awayteam_score"]
    matches = matches[common_cols]

    matches["total_score"] = matches["match_hometeam_score"] + \
        matches["match_awayteam_score"]
    matches["over_under"] = np.nan
    matches.loc[matches.total_score >= 3, "over_under"] = "over"
    matches.loc[matches.total_score < 3, "over_under"] = "under"

    final_df = final_df.merge(matches, on="match_id")
    final_df = final_df.drop(["match_hometeam_score", "match_awayteam_score"], 
                             axis=1)
    return final_df.dropna()


We are splitting train - test as 80/20. Since sequence might be important; we are using the last 20% of the data as the test.

In [8]:
bookies_to_keep = find_bookies_to_keep('2018-01-01', '2019-12-01', 0.975)

bets_df = generate_bet_features(bookies_to_keep)
matches_df = generate_match_features()

final_df = data_prepare(bets_df, matches_df)

matches = pd.read_csv("data/matches.zip")
matches['timestamp'] = matches['epoch'].apply(
    lambda x: dt.datetime.fromtimestamp(x))

final_df = final_df.merge(matches[["match_id", "timestamp"]], on = "match_id")

final_df = final_df.sort_values("timestamp")
all_length = len(final_df)
test_size = int(np.round(len(final_df)/5))

final_df["test"] = 0
final_df.loc[final_df.tail(test_size).index, 'test'] = 1

In [9]:
X = final_df.drop(["match_id", "over_under", "total_score"], axis=1)
y = final_df[["total_score", "test"]]

In [10]:
X_train = X[X["test"] == 0].drop(["test", "timestamp"], axis = 1)
X_test = X[X["test"] == 1].drop(["test", "timestamp"], axis = 1)
y_train_reg = y[y["test"] == 0]["total_score"]
y_test_reg = y[y["test"] == 1]["total_score"]
y_train_class = y_train_reg > 2.5
y_test_class = y_test_reg > 2.5

In [60]:
def rf_classification(X, y):
    le = LabelEncoder()
    y = le.fit_transform(y)
    rfc=RandomForestClassifier(random_state=42, oob_score = True)
    param_grid = { 
        'bootstrap': [True],
        'n_estimators': [500],
        'max_features': ['sqrt', 'log2', 15, 25, 40],
        'min_samples_leaf' : [5]
    }
    kfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=7)

    CV_rfc = GridSearchCV(estimator=rfc, 
                        param_grid=param_grid, 
                        cv= kfold, 
                        verbose=True, n_jobs=-1, return_train_score=True)
    CV_rfc.fit(X, y)
    return CV_rfc

def rf_regression(X, y):
    rfc=RandomForestRegressor(random_state=42)
    param_grid = {
        'bootstrap': [True],
        'n_estimators': [500],
        'max_features': ['sqrt', 'log2', 15, 25, 40],
        'min_samples_leaf' : [5],
        'max_depth': [4, 10]
    }

    CV_rfc = GridSearchCV(estimator=rfc, 
                        param_grid=param_grid, 
                        cv= 8, 
                        verbose=True, n_jobs=-1, scoring = "neg_mean_squared_error", return_train_score=True)
    CV_rfc.fit(X, y)
    return CV_rfc

def xgb_classifier(X,y):
    le = LabelEncoder()
    y = le.fit_transform(y)
    xgb = XGBClassifier()
    kfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=7)

    param_grid = {'objective':['binary:logistic'],
                  'learning_rate': [0.05, 0.1, 0.2],
                  'max_depth': [2, 4, 6],
                  'min_child_weight': [11],
                  'silent': [1],
                  'subsample': [0.7],
                  'colsample_bytree': [0.7],
                  'n_estimators': [100, 250, 500],
                  'seed': [1337]}
    
    CV_xgb = GridSearchCV(estimator=xgb, 
                        param_grid=param_grid, 
                        cv= kfold, 
                        verbose=True, n_jobs=-1, return_train_score=True)
    CV_xgb.fit(X, y)
    
    return CV_xgb

def xgb_regression(X,y):
    xgb = XGBRegressor()

    param_grid = {'objective':['reg:linear'],
                  'learning_rate': [0.05, 0.1, 0.2],
                  'max_depth': [2, 4, 6],
                  'min_child_weight': [11],
                  'silent': [1],
                  'subsample': [0.7],
                  'colsample_bytree': [0.7],
                  'n_estimators': [100, 250, 500],
                  'seed': [1337]}
    
    CV_xgb = GridSearchCV(estimator=xgb, 
                        param_grid=param_grid, 
                        cv= 8, 
                        verbose=True, n_jobs=-1, scoring = "neg_mean_squared_error", return_train_score=True)
    CV_xgb.fit(X, y)
    
    return CV_xgb

def dt_regression(X,y):
    
    dt = tree.DecisionTreeRegressor()

    parameters = {
              'min_samples_leaf':[5, 10, 15],
              'ccp_alpha': [0.0, 0.2, 0.6]}

    CV_dt = GridSearchCV(dt, parameters, n_jobs=-1, cv = 8,
                       verbose=2, scoring = "neg_mean_squared_error", refit=True)
    
    CV_dt.fit(X, y)

    return CV_dt

def dt_classifier(X,y):
    
    dt = tree.DecisionTreeClassifier()

    parameters = {'criterion': ["gini"],
              'min_samples_leaf':[5, 10, 15],
              'ccp_alpha': [0.0, 0.2, 0.6]}

    CV_dt = GridSearchCV(dt, parameters, n_jobs=-1,
                       verbose=2, refit=True)
    
    CV_dt.fit(X, y)

    return CV_dt

def l1_regression(X, y):
    model = LassoCV(cv=8, random_state=42, n_alphas=6, max_iter = 100000)
    model.fit(X,y)
    return model
    
def l1_classification(X, y):
    kfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=7)
    alphas = l1_min_c(X, y, loss='log') * np.logspace(0, 7, 3)
    model = LogisticRegressionCV(Cs=alphas, cv=kfold, random_state=42, 
                                penalty='l1', solver='saga')
    model.fit(X,y)
    return model


For random forest; regression errors looks quite high

In [36]:
print(-cv_rf_r.best_score_)
print(-cv_rf_r.score(X_test, y_test_reg))
print(-cv_rf_r.score(X_train, y_train_reg))

2.729645030373415
2.54051259002056
2.4703788382753524


For decision tree; regression errors looks quite high. It is high even for the training set. This means, there was a problem in converging. Complexity parameter is not high and minimum samples per leaf is at minimum. Despite that training error is quite high. With this setting I would have expected the model to overfit. 

The problem gets more interesting when we check the predictions. All predictions are same; for the train and test data. All leaves have the same prediction, which shouldn't have happened in a normal setting. It looks like decision tree somehow underfit.

In [62]:
cv_dt_r = dt_regression(X_train, y_train_reg)

print(-cv_dt_r.best_score_)
print(cv_dt_r.best_params_)
print(-cv_dt_r.score(X_test, y_test_reg))
print(-cv_dt_r.score(X_train, y_train_reg))

Fitting 8 folds for each of 9 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   14.9s finished


2.8923748625675127
{'ccp_alpha': 0.2, 'min_samples_leaf': 5}
2.669620912840165
2.8894195684955943


In [59]:
cv_dt_r.predict(X_train)

array([2.8137101, 2.8137101, 2.8137101, ..., 2.8137101, 2.8137101,
       2.8137101])

Numbers for random forest is better than the decision tree; all errors are lower. It looks like it started to train a bit but I would say it still looks underfit. All the predictions are higher than 2 (not a good distribution when we think of the data). Cross validation error is higher than test data; I wouldn't expect this to happen in a fit/overfit model.

In [37]:
cv_rf_r = rf_regression(X_train, y_train_reg)

print(-cv_rf_r.best_score_)
print(cv_rf_r.best_params_)
print(-cv_rf_r.score(X_test, y_test_reg))
print(-cv_rf_r.score(X_train, y_train_reg))

2.729645030373415
{'bootstrap': True, 'max_depth': 4, 'max_features': 15, 'min_samples_leaf': 5, 'n_estimators': 500}
2.54051259002056
2.4703788382753524


In [51]:
cv_rf_r.predict(X_test)

array([2.6236167 , 3.04900216, 2.67673295, 3.33700594, 2.4871905 ,
       2.627531  , 3.30118011, 3.46043992, 2.83870432, 3.43406951,
       2.46830138, 2.92345242, 3.33012133, 2.33124417, 2.57195576,
       2.81047456, 2.67285084, 2.71185489, 2.29038249, 2.39967807,
       3.73719706, 2.69967846, 3.71110467, 3.18315196, 2.45733046,
       3.43398062, 3.40782632, 2.90153663, 3.2853965 , 3.23498133,
       2.64657973, 2.3518162 , 3.44869131, 2.55391889, 2.86056233,
       3.24490946, 2.56759826, 2.66106908, 2.42327898, 2.79787657,
       2.71007236, 2.69105358, 2.71520926, 2.16767988, 3.13653252,
       2.72400601, 2.1162308 , 3.55102504, 2.73865866, 2.38814098,
       3.03203635, 3.84053111, 3.50753503, 3.10583663, 3.30191883,
       2.40058203, 2.39200124, 2.54515355, 3.15657082, 3.19410253,
       3.36409671, 3.46812834, 3.50573284, 3.10554231, 2.60468967,
       2.69965448, 2.56460014, 2.65831119, 2.89996896, 3.1385708 ,
       2.72998316, 2.44525282, 3.35093077, 3.0177742 , 3.87377

In [52]:
cv_xgb_r = xgb_regression(X_train, y_train_reg)

print(-cv_xgb_r.best_score_)
print(cv_xgb_r.best_params_)
print(-cv_xgb_r.score(X_test, y_test_reg))
print(-cv_xgb_r.score(X_train, y_train_reg))

Fitting 8 folds for each of 27 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 18.2min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed: 21.9min finished
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


2.746662934231389
{'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 2, 'min_child_weight': 11, 'n_estimators': 100, 'objective': 'reg:linear', 'seed': 1337, 'silent': 1, 'subsample': 0.7}
2.5553155590548404
2.388268514839198


In [53]:
cv_xgb_r.predict(X_test)

array([2.5863595, 2.671107 , 2.5874593, 3.6336474, 2.4392443, 2.7245402,
       3.315366 , 3.587971 , 2.7535985, 3.375648 , 2.5849364, 2.906398 ,
       3.233523 , 2.4271345, 2.6097345, 2.637575 , 2.4807553, 2.4867806,
       2.2416892, 2.3799667, 3.7591326, 2.6010094, 3.906064 , 3.1835735,
       2.5005357, 3.3862474, 3.6289053, 2.3334913, 3.1674533, 3.1394184,
       2.6507397, 2.335025 , 3.3594854, 2.5839431, 2.9311967, 3.304246 ,
       2.436667 , 2.692973 , 2.4495168, 2.751389 , 2.803674 , 2.698357 ,
       2.5300484, 2.1825762, 3.3365173, 2.7302969, 2.030344 , 3.7680144,
       2.8403285, 2.5413132, 2.6826813, 3.6968384, 3.5168536, 2.5392113,
       3.182745 , 2.3487134, 2.493506 , 2.4050527, 2.945298 , 3.218545 ,
       3.2859282, 3.383977 , 3.597432 , 3.06544  , 2.5727732, 2.703249 ,
       2.563501 , 2.5305543, 3.0632694, 3.129874 , 2.8473687, 2.4729965,
       3.1969526, 2.8859558, 4.063395 , 2.7507865, 3.310447 , 2.3463278,
       2.7070758, 2.589302 , 2.8326712, 2.3953757, 

In [44]:
#cv_ls = l1_regression(X_train, y_train_reg)

cv_ls.score(X_test, y_test_reg)
cv_ls.score(X_train, y_train_reg)


0.022025570613825485

In [56]:
pred = cv_ls.predict(X_test)