In [1]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pickle

player_df = pd.read_csv('../data/model_dataset/player_df_dummies.csv', index_col='unique_id')



In [2]:
player_df.columns

Index(['Unnamed: 0', 'full_name', 'minutes', 'goals_scored', 'assists', 'bps',
       'selected_by_percent', 'previous_start_value', 'delta_value',
       'target_value', 'position_DEF', 'position_FWD', 'position_GK',
       'position_MID', 'teams_arsenal', 'teams_astonvilla',
       'teams_bournemouth', 'teams_brentford', 'teams_brighton&hovealbion',
       'teams_burnley', 'teams_cardiffcity', 'teams_chelsea',
       'teams_crystalpalace', 'teams_everton', 'teams_fulham',
       'teams_huddersfieldtown', 'teams_leedsunited', 'teams_leicestercity',
       'teams_liverpool', 'teams_manchestercity', 'teams_manchesterunited',
       'teams_newcastleunited', 'teams_norwichcity', 'teams_nottinghamforest',
       'teams_sheffieldunited', 'teams_southampton', 'teams_stokecity',
       'teams_swanseacity', 'teams_tottenhamhotspur', 'teams_watford',
       'teams_westbromwichalbion', 'teams_westhamunited',
       'teams_wolverhamptonwanderers'],
      dtype='object')

In [3]:
player_df.drop(columns=['target_value', 'Unnamed: 0']).head()

Unnamed: 0_level_0,full_name,minutes,goals_scored,assists,bps,selected_by_percent,previous_start_value,delta_value,position_DEF,position_FWD,...,teams_nottinghamforest,teams_sheffieldunited,teams_southampton,teams_stokecity,teams_swanseacity,teams_tottenhamhotspur,teams_watford,teams_westbromwichalbion,teams_westhamunited,teams_wolverhamptonwanderers
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7-2017-18,héctorbellerín,3050.0,2.0,4.0,656.0,10.3,60.0,-1.0,1,0,...,0,0,0,0,0,0,0,0,0,0
11-2017-18,robholding,819.0,0.0,1.0,196.0,0.7,50.0,-6.0,1,0,...,0,0,0,0,0,0,0,0,0,0
16-2017-18,mesutözil,2161.0,4.0,9.0,506.0,2.1,95.0,-2.0,0,0,...,0,0,0,0,0,0,0,0,0,0
18-2017-18,aaronramsey,1844.0,7.0,10.0,503.0,4.5,70.0,-2.0,0,0,...,0,0,0,0,0,0,0,0,0,0
21-2017-18,alexiwobi,1827.0,3.0,5.0,426.0,1.0,55.0,-4.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Assuming player_df has the features and target column you want to use for training
# Replace 'target_column' with the name of your target column
features = player_df.drop(columns=['target_value', 'Unnamed: 0', 'full_name', 'bps', 'selected_by_percent'])
target = player_df['target_value']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize the CatBoost regression model
model = CatBoostRegressor(iterations=1000,  # Adjust the number of iterations as needed
                          learning_rate=0.1,  # Adjust the learning rate as needed
                          depth=5,  # Adjust the depth of the trees as needed
                          loss_function='RMSE',  # Using Root Mean Squared Error as the loss function
                          random_seed=42)

# Fit the model to the training data
model.fit(X_train, y_train, verbose=100)  # You can adjust the verbose level to see more/less output during training

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the mean squared error to evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Now, you can use the trained model (model) to make predictions on new data
# For example:
# new_data = pd.DataFrame(...)  # Create a new DataFrame with features for prediction
# predictions = model.predict(new_data)

0:	learn: 12.5169311	total: 60.4ms	remaining: 1m
100:	learn: 4.3634524	total: 105ms	remaining: 932ms
200:	learn: 3.7911322	total: 148ms	remaining: 590ms
300:	learn: 3.3680814	total: 195ms	remaining: 453ms
400:	learn: 3.0690114	total: 241ms	remaining: 360ms
500:	learn: 2.8045715	total: 287ms	remaining: 286ms
600:	learn: 2.5850001	total: 332ms	remaining: 220ms
700:	learn: 2.4053261	total: 382ms	remaining: 163ms
800:	learn: 2.2685345	total: 428ms	remaining: 106ms
900:	learn: 2.1348697	total: 474ms	remaining: 52ms
999:	learn: 2.0227415	total: 536ms	remaining: 0us
Mean Squared Error: 42.56764066996096


In [5]:
y_pred

array([ 55.64577867,  57.39677849,  48.33982935,  55.16875812,
        52.33337228,  58.3151863 ,  54.47979084,  73.0187356 ,
        54.4114348 ,  63.26396865,  56.53965138,  61.15992956,
        44.91222306,  52.45079298,  44.17247418,  52.98918888,
        49.9038465 ,  75.88745038,  63.73104345,  55.06639721,
        48.75799661,  47.18375618,  43.98971421,  44.32864612,
        69.05888122,  48.23717185,  50.56302849,  42.54940847,
        45.3401394 ,  45.97158094,  42.5695127 ,  52.42995695,
        47.67868026,  56.55240597,  44.36496158,  75.08316932,
        60.40385826,  67.41317871,  47.48416097,  46.2181553 ,
        59.22722094,  44.70385712,  51.4759154 ,  51.41192365,
        58.929162  ,  64.63514626,  49.70190772,  62.71261295,
        53.5749694 ,  52.90625169,  48.67037577,  49.17562441,
        49.31227317,  50.62468634,  49.66731995,  47.66030303,
        45.3672401 ,  47.65390989,  51.12115085,  57.87364001,
        57.41097958,  68.76554698,  56.01261159, 119.35

In [6]:
y_test

unique_id
346-2020-21    50.0
60-2021-22     50.0
323-2018-19    45.0
40-2018-19     55.0
58-2018-19     50.0
               ... 
416-2022-23    45.0
473-2022-23    50.0
31-2018-19     50.0
284-2018-19    55.0
141-2021-22    85.0
Name: target_value, Length: 283, dtype: float64

In [7]:
y_pred_train = model.predict(X_train)

# Calculate the mean squared error to evaluate the model's performance
mse = mean_squared_error(y_train, y_pred_train)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 4.091483048246382


In [8]:
print(y_test)

unique_id
346-2020-21    50.0
60-2021-22     50.0
323-2018-19    45.0
40-2018-19     55.0
58-2018-19     50.0
               ... 
416-2022-23    45.0
473-2022-23    50.0
31-2018-19     50.0
284-2018-19    55.0
141-2021-22    85.0
Name: target_value, Length: 283, dtype: float64


In [9]:
with open('../models/catboost_v0.0.1.pkl', 'wb') as file:
    pickle.dump(model, file)

In [10]:
# Load the saved model from the pickle file
with open('../models/catboost_v0.0.1.pkl', 'rb') as file:
    loaded_model = pickle.load(file)