## Настройка среды

In [1]:
!pip install optuna
!pip install catboost



In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import optuna
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostRegressor
from tqdm.notebook import tqdm
from IPython.display import clear_output
import xgboost as xgb
import pickle

plt.rcParams.update({'font.size': 16})
sns.set_style('whitegrid')
np.random.seed(0xFA1AFE1)

In [106]:
dataset = pd.read_excel('/kaggle/input/coursework/final_dataset.xlsx')
upgraded_data = pd.read_csv('/kaggle/input/coursework/upgraded_data.csv')

## Обучение модели

In [107]:
numeric_features = ['age', 'height', 'player_agent', 'outfitter', 'appearances',
                    'goals', 'assists', 'national_matches', 'avg_injury_duration',
                    'foot_left', 'foot_right', 'trophies_average_score',
                    'club_price', 'followers', 'joined', 'substitutions_on',
                    'substitutions_off', 'yellow_cards', 'red_cards', 'penalty_goals',
                    'minutes_played', 'individual', 'international', 'national',
                    'other', 'national_status', 'national_goals', 'national_assists',
                    'national_yellow_cards', 'national_red_cards',
                    'avg_injuries_per_season',
                    'is_injured', 'last_injury_date', 'club_statistics_matches',
                    'club_statistics_goals', 'trophies_total_score',
                    'club_statistics_pts', 'club_league_top_rank',
                    'club_league_lowest_rank', 'club_league_mean_rank',
                    'trophies_max_score', 'trophies_min_score', 'trophies_amount',
                    'club_trophies_total_score', 'club_trophies_average_score',
                    'club_trophies_max_score', 'club_trophies_min_score',
                    'club_trophies_amount', 'Defender_Group', 'Midfielder_Group',
                    'Striker_Group', 'Winger_Group']

categorical_features = []

In [108]:
X = dataset[numeric_features]
y = dataset['current_price'] / 1e6
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ('scaling', StandardScaler(), numeric_features)
])

pipeline = Pipeline([('ohe_scaling', column_transformer), ('regression', Lasso())])

model = pipeline.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
print("Linear regression train MAE = %.4f" % mean_absolute_error(y_train, y_train_pred))
print("Linear regression validation MAE = %.4f" % mean_absolute_error(y_val, y_val_pred))

new_X = dataset[dataset['current_price'] > 1e6][numeric_features]
new_y = dataset[dataset['current_price'] > 1e6]['current_price'] / 1e6

new_y_pred = model.predict(new_X)
print("Linear regression best players MAE = %.4f" % mean_absolute_error(new_y, new_y_pred))

Linear regression train MAE = 1.7560
Linear regression validation MAE = 1.8629
Linear regression best players MAE = 5.2360


In [110]:
def objective(trial):
    params = {
        'depth' :  trial.suggest_int('depth', 1, 16),
        'iterations' : trial.suggest_int('iterations', 15, 100),
        'learning_rate' : trial.suggest_float("learning_rate", 1e-4, 1e-1),
        'l2_leaf_reg' : trial.suggest_float("l2_leaf_reg", 1e-1, 1e1),
        'grow_policy' : trial.suggest_categorical('grow_policy', ["SymmetricTree", "Depthwise", "Lossguide"]),
        'min_data_in_leaf' : trial.suggest_int("min_data_in_leaf ", 3, 50),
        'max_leaves' : trial.suggest_int("max_leaves", 100, 5000),
        'nan_mode' : trial.suggest_categorical('nan_mode', ["Forbidden", "Min", "Max"]),
        'score_function' : trial.suggest_categorical('score_function', ["Cosine", "L2"]),

    }


    cat = CatBoostRegressor(depth=params['depth'],
                            iterations=params['iterations'],
                            learning_rate=params['learning_rate'],
                            l2_leaf_reg=params['l2_leaf_reg'],
                            grow_policy=params['grow_policy'],
                            min_data_in_leaf=params['min_data_in_leaf'] if params['grow_policy'] != 'Lossguide' else None,
                            max_leaves=params['max_leaves'] if params['grow_policy'] == 'Lossguide' else None,
                            nan_mode=params['nan_mode'],
                            score_function=params['score_function'])



    cat.fit(X_train,y_train,verbose=False)
    preds = cat.predict(X_val)
    return mean_squared_error(y_val, preds)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
study.best_params

[I 2024-05-11 12:26:15,445] A new study created in memory with name: no-name-bb29b546-b2ee-4f30-a975-292984fc41ec
[W 2024-05-11 12:26:21,161] Trial 0 failed with parameters: {'depth': 16, 'iterations': 65, 'learning_rate': 0.011866273252047017, 'l2_leaf_reg': 2.9802015062019183, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf ': 15, 'max_leaves': 4344, 'nan_mode': 'Min', 'score_function': 'L2'} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_33/2432005338.py", line 28, in objective
    cat.fit(X_train,y_train,verbose=False)
  File "/opt/conda/lib/python3.10/site-packages/catboost/core.py", line 5807, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "/opt/conda/lib/python3.10/site-packages/catbo

KeyboardInterrupt: 

In [111]:
first_cat = CatBoostRegressor(depth=7,
                        iterations=68,
                        learning_rate=0.09138471633404736,
                        l2_leaf_reg=8.527576075050542,
                        grow_policy='SymmetricTree',
                        min_data_in_leaf=10,
                        nan_mode='Forbidden',
                        score_function='L2')



first_cat.fit(X_train,y_train,verbose=False)
preds = first_cat.predict(X)
dataset['predicted_price'] = preds
print("Gradient boosting validation MAE = %.4f" % mean_absolute_error(first_cat.predict(X_val), y_val))

Gradient boosting validation MAE = 1.1328


## Обучение модели с продвинутыми статистиками

In [112]:
upgraded_data = upgraded_data.rename(columns={'Unnamed: 0': 'link'})
upgraded_data = upgraded_data.merge(dataset, on='link')
best_players = upgraded_data[upgraded_data.current_price > 1e6]

In [113]:
standard_stats = ['Goals',
                  'Assists',
                  'Goals + Assists',
                  'Non-Penalty Goals',
                  'Penalty Kicks Made',
                  'Penalty Kicks Attempted',
                  'Yellow Cards',
                  'Red Cards',
                  'xG: Expected Goals',
                  'npxG: Non-Penalty xG',
                  'xAG: Exp. Assisted Goals',
                  'npxG + xAG',
                  'Progressive Carries',
                  'Progressive Passes',
                  'Progressive Passes Rec']

shooting = ['Goals.1',
            'Shots Total',
            'Shots on Target',
            'Goals/Shot',
            'Goals/Shot on Target',
            'Average Shot Distance',
            'Shots from Free Kicks',
            'Penalty Kicks Made.1',
            'Penalty Kicks Attempted.1',
            'xG: Expected Goals.1',
            'npxG: Non-Penalty xG.1',
            'npxG/Shot',
            'Goals - xG',
            'Non-Penalty Goals - npxG']

passing =  ['Passes Completed',
            'Passes Attempted',
            'Total Passing Distance',
            'Progressive Passing Distance',
            'Passes Completed (Short)',
            'Passes Attempted (Short)',
            'Passes Completed (Medium)',
            'Passes Attempted (Medium)',
            'Passes Completed (Long)',
            'Passes Attempted (Long)',
            'Assists.1',
            'xAG: Exp. Assisted Goals.1',
            'xA: Expected Assists',
            'Key Passes',
            'Passes into Final Third',
            'Passes into Penalty Area',
            'Crosses into Penalty Area',
            'Progressive Passes.1']

pass_types = ['Passes Attempted.1',
              'Live-ball Passes',
              'Dead-ball Passes',
              'Passes from Free Kicks',
              'Through Balls',
              'Switches',
              'Crosses',
              'Throw-ins Taken',
              'Corner Kicks',
              'Inswinging Corner Kicks',
              'Outswinging Corner Kicks',
              'Straight Corner Kicks',
              'Passes Completed.1',
              'Passes Offside',
              'Passes Blocked']

creation = ['Shot-Creating Actions',
            'SCA (Live-ball Pass)',
            'SCA (Dead-ball Pass)',
            'SCA (Take-On)',
            'SCA (Shot)',
            'SCA (Fouls Drawn)',
            'SCA (Defensive Action)',
            'Goal-Creating Actions',
            'GCA (Live-ball Pass)',
            'GCA (Dead-ball Pass)',
            'GCA (Take-On)',
            'GCA (Shot)',
            'GCA (Fouls Drawn)',
            'GCA (Defensive Action)']

defense =  ['Tackles',
            'Tackles Won',
            'Tackles (Def 3rd)',
            'Tackles (Mid 3rd)',
            'Tackles (Att 3rd)',
            'Dribblers Tackled',
            'Dribbles Challenged',
            'Challenges Lost',
            'Blocks',
            'Shots Blocked',
            'Passes Blocked.1',
            'Interceptions',
            'Tkl+Int',
            'Clearances',
            'Errors']

possession = ['Touches',
              'Touches (Def Pen)',
              'Touches (Def 3rd)',
              'Touches (Mid 3rd)',
              'Touches (Att 3rd)',
              'Touches (Att Pen)',
              'Touches (Live-Ball)',
              'Take-Ons Attempted',
              'Successful Take-Ons',
              'Times Tackled During Take-On',
              'Carries',
              'Total Carrying Distance',
              'Progressive Carrying Distance',
              'Progressive Carries.1',
              'Carries into Final Third',
              'Carries into Penalty Area',
              'Miscontrols',
              'Dispossessed',
              'Passes Received',
              'Progressive Passes Rec.1']

other =  ['Yellow Cards.1',
          'Red Cards.1',
          'Second Yellow Card',
          'Fouls Committed',
          'Fouls Drawn',
          'Offsides',
          'Crosses.1',
          'Interceptions.1',
          'Tackles Won.1',
          'Penalty Kicks Won',
          'Penalty Kicks Conceded',
          'Own Goals',
          'Ball Recoveries',
          'Aerials Won',
          'Aerials Lost']

In [114]:
new_features= []

for col in defense:
    col = col.strip('.1')
    best_players[f'Defender_{col}'] = (upgraded_data['Defender_Group'] != 0).astype(int) * upgraded_data[col]
    new_features.append(f'Defender_{col}')

for col in shooting:
    col = col.strip('.1')
    best_players[f'Striker_{col}'] = ((upgraded_data['Striker_Group'] != 0) | (upgraded_data['Winger_Group'] != 0)).astype(int) * upgraded_data[col]
    new_features.append(f'Striker_{col}')

for col in possession + passing:
    col = col.strip('.1')
    best_players[f'Midfielder_{col}'] = (upgraded_data['Midfielder_Group'] != 0).astype(int) * upgraded_data[col]
    new_features.append(f'Midfielder_{col}')

for col in creation:
    col = col.strip('.1')
    best_players[f'Creator_{col}'] = ((upgraded_data['Midfielder_Group'] != 0) | (upgraded_data['Winger_Group'] != 0)).astype(int) * upgraded_data[col]
    new_features.append(f'Creator_{col}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_players[f'Defender_{col}'] = (upgraded_data['Defender_Group'] != 0).astype(int) * upgraded_data[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_players[f'Defender_{col}'] = (upgraded_data['Defender_Group'] != 0).astype(int) * upgraded_data[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

In [115]:
numeric= ['Goals', 'Assists', 'Non-Penalty Goals', 'Penalty Kicks Made',
          'Penalty Kicks Attempted','Yellow Cards','Red Cards', 'xG: Expected Goals',
          'npxG: Non-Penalty xG', 'xAG: Exp. Assisted Goals','Progressive Carries',
          'Shots from Free Kicks', 'npxG/Shot', 'Goals - xG', 'Non-Penalty Goals - npxG',
          'Passes Completed', 'Passes Attempted','Total Passing Distance',
          'Progressive Passing Distance', 'Passes Completed (Short)', 'Passes Attempted (Short)',
          'Passes Completed (Medium)','Passes Attempted (Medium)',
          'Passes Completed (Long)','Passes Attempted (Long)',
          'xA: Expected Assists','Key Passes', 'Passes into Final Third',
          'Passes into Penalty Area','Crosses into Penalty Area','Live-ball Passes',
          'Dead-ball Passes','Passes from Free Kicks','Through Balls','Switches', 'Crosses',
          'Throw-ins Taken','Corner Kicks','Inswinging Corner Kicks','Outswinging Corner Kicks',
          'Straight Corner Kicks','Passes Offside','Passes Blocked','Shot-Creating Actions',
          'SCA (Live-ball Pass)','SCA (Dead-ball Pass)', 'SCA (Take-On)', 'SCA (Shot)',
          'SCA (Fouls Drawn)', 'SCA (Defensive Action)','Goal-Creating Actions','GCA (Live-ball Pass)',
          'GCA (Dead-ball Pass)','GCA (Take-On)', 'GCA (Shot)', 'GCA (Fouls Drawn)','GCA (Defensive Action)',
          'Tackles', 'Tackles Won','Tackles (Def 3rd)', 'Tackles (Mid 3rd)','Tackles (Att 3rd)',
          'Dribblers Tackled', 'Dribbles Challenged','Challenges Lost',
          'Blocks', 'Shots Blocked', 'Interceptions', 'Clearances','Errors','Touches',
          'Touches (Def Pen)', 'Touches (Def 3rd)','Touches (Mid 3rd)','Touches (Att 3rd)',
          'Touches (Att Pen)','Touches (Live-Ball)','Take-Ons Attempted','Successful Take-Ons',
          'Times Tackled During Take-On','Carries','Total Carrying Distance',
          'Progressive Carrying Distance', 'Carries into Final Third',
          'Carries into Penalty Area','Miscontrols','Dispossessed','Passes Received','Second Yellow Card',
          'Fouls Committed','Fouls Drawn','Offsides','Penalty Kicks Won', 'Penalty Kicks Conceded',
          'Own Goals','Ball Recoveries','Aerials Won','Aerials Lost',
          'age', 'height','player_agent', 'joined','outfitter', 'appearances', 'goals', 'assists',
          'substitutions_on','substitutions_off', 'yellow_cards', 'red_cards', 'penalty_goals',
          'minutes_played', 'individual','international', 'national', 'other', 'national_status',
          'national_matches', 'national_goals','national_assists', 'national_yellow_cards',
          'national_red_cards','avg_injuries_per_season','avg_injury_duration','is_injured',
          'last_injury_date', 'club_price', 'club_statistics_matches','club_statistics_goals',
          'club_statistics_pts','club_league_top_rank','club_league_lowest_rank','club_league_mean_rank',
          'foot_left', 'foot_right','trophies_total_score', 'trophies_average_score','trophies_max_score',
          'trophies_min_score','trophies_amount','club_trophies_total_score','club_trophies_average_score',
          'club_trophies_max_score','club_trophies_min_score','club_trophies_amount', 'Defender_Group',
          'Midfielder_Group','Striker_Group','Winger_Group','followers']

old = ['age', 'height','player_agent', 'joined','outfitter', 'appearances', 'goals', 'assists',
          'substitutions_on','substitutions_off', 'yellow_cards', 'red_cards', 'penalty_goals',
          'minutes_played', 'individual','international', 'national', 'other', 'national_status',
          'national_matches', 'national_goals','national_assists', 'national_yellow_cards',
          'national_red_cards','avg_injuries_per_season','avg_injury_duration','is_injured',
          'last_injury_date', 'club_price', 'club_statistics_matches','club_statistics_goals',
          'club_statistics_pts','club_league_top_rank','club_league_lowest_rank','club_league_mean_rank',
          'foot_left', 'foot_right','trophies_total_score', 'trophies_average_score','trophies_max_score',
          'trophies_min_score','trophies_amount','club_trophies_total_score','club_trophies_average_score',
          'club_trophies_max_score','club_trophies_min_score','club_trophies_amount', 'Defender_Group',
          'Midfielder_Group','Striker_Group','Winger_Group','followers']

cur_features = numeric + new_features + ['predicted_price']

X = best_players[cur_features].fillna(0)
y = best_players['current_price'] / 1e6

In [116]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

Качество получилось не самым лучшим, поэтому попробуем обратиться к другому подходу машинного обучения -- градиентному бустингу

In [80]:
def objective(trial):
    params = {
        'depth' :  trial.suggest_int('depth', 1, 16),
        'iterations' : trial.suggest_int('iterations', 15, 100),
        'learning_rate' : trial.suggest_float("learning_rate", 1e-4, 1e-1),
        'l2_leaf_reg' : trial.suggest_float("l2_leaf_reg", 1e-1, 1e1),
        'grow_policy' : trial.suggest_categorical('grow_policy', ["SymmetricTree", "Depthwise", "Lossguide"]),
        'min_data_in_leaf' : trial.suggest_int("min_data_in_leaf ", 3, 50),
        'max_leaves' : trial.suggest_int("max_leaves", 100, 5000),
        'nan_mode' : trial.suggest_categorical('nan_mode', ["Forbidden", "Min", "Max"]),
        'score_function' : trial.suggest_categorical('score_function', ["Cosine", "L2"]),

    }


    cat = CatBoostRegressor(depth=params['depth'],
                            iterations=params['iterations'],
                            learning_rate=params['learning_rate'],
                            l2_leaf_reg=params['l2_leaf_reg'],
                            grow_policy=params['grow_policy'],
                            min_data_in_leaf=params['min_data_in_leaf'] if params['grow_policy'] != 'Lossguide' else None,
                            max_leaves=params['max_leaves'] if params['grow_policy'] == 'Lossguide' else None,
                            nan_mode=params['nan_mode'],
                            score_function=params['score_function'])



    cat.fit(X_train,y_train,verbose=False)
    preds = cat.predict(X_val)
    return mean_absolute_error(y_val, preds)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
study.best_params

[I 2024-05-11 11:56:39,381] A new study created in memory with name: no-name-dc472c09-dd48-42db-acf0-e92fa985e6eb
[I 2024-05-11 11:56:43,311] Trial 0 finished with value: 3.782249837788723 and parameters: {'depth': 8, 'iterations': 67, 'learning_rate': 0.070200078583673, 'l2_leaf_reg': 1.1348316177298294, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf ': 44, 'max_leaves': 101, 'nan_mode': 'Forbidden', 'score_function': 'Cosine'}. Best is trial 0 with value: 3.782249837788723.
[I 2024-05-11 11:56:51,586] Trial 1 finished with value: 5.449008492458759 and parameters: {'depth': 9, 'iterations': 76, 'learning_rate': 0.02087809095408776, 'l2_leaf_reg': 5.596477680948386, 'grow_policy': 'Depthwise', 'min_data_in_leaf ': 9, 'max_leaves': 3821, 'nan_mode': 'Max', 'score_function': 'Cosine'}. Best is trial 0 with value: 3.782249837788723.
[I 2024-05-11 11:56:55,658] Trial 2 finished with value: 6.57866500139265 and parameters: {'depth': 8, 'iterations': 69, 'learning_rate': 0.014936834605893

{'depth': 5,
 'iterations': 83,
 'learning_rate': 0.07823641322063134,
 'l2_leaf_reg': 2.2228287916472445,
 'grow_policy': 'Lossguide',
 'min_data_in_leaf ': 25,
 'max_leaves': 1022,
 'nan_mode': 'Forbidden',
 'score_function': 'L2'}

In [123]:
second_cat = CatBoostRegressor(depth=5,
                        iterations=83,
                        learning_rate=0.07823641322063134,
                        l2_leaf_reg=2.2228287916472445,
                        grow_policy='Lossguide',
                        max_leaves=1022,
                        min_data_in_leaf=25,
                        nan_mode='Forbidden',
                        score_function='L2')

second_cat.fit(X_train,y_train,verbose=False)
preds = second_cat.predict(X_test)
print("Gradient boosting best players with new features MAE = %.4f" % mean_absolute_error(y_test, preds))
preds = first_cat.predict(X_test[old])
print("Gradient boosting best players MAE = %.4f" % mean_absolute_error(y_test, preds))

Gradient boosting best players with new features MAE = 3.9277
Gradient boosting best players MAE = 4.0284


In [None]:
pkl_filename = "pickle_model.pkl" 
with open(pkl_filename, 'wb') as file: 
    pickle.dump(second_cat, file) 