In [11]:
import os
import pickle

import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the modelling dataset; rows are indexed by the `unique_id` column.
player_df = pd.read_csv(
    '../data/model_dataset/player_df_dummies.csv',
    index_col='unique_id',
)



In [7]:
# List the dataset's column names to see which raw features and one-hot
# dummies (position_*, teams_*) are available.
player_df.columns

Index(['Unnamed: 0', 'full_name', 'minutes', 'goals_scored', 'assists', 'bps',
       'selected_by_percent', 'previous_start_value', 'delta_value',
       'target_value', 'position_DEF', 'position_FWD', 'position_GK',
       'position_MID', 'teams_arsenal', 'teams_astonvilla',
       'teams_bournemouth', 'teams_brentford', 'teams_brighton&hovealbion',
       'teams_burnley', 'teams_cardiffcity', 'teams_chelsea',
       'teams_crystalpalace', 'teams_everton', 'teams_fulham',
       'teams_huddersfieldtown', 'teams_leedsunited', 'teams_leicestercity',
       'teams_liverpool', 'teams_manchestercity', 'teams_manchesterunited',
       'teams_newcastleunited', 'teams_norwichcity', 'teams_nottinghamforest',
       'teams_sheffieldunited', 'teams_southampton', 'teams_stokecity',
       'teams_swanseacity', 'teams_tottenhamhotspur', 'teams_watford',
       'teams_westbromwichalbion', 'teams_westhamunited',
       'teams_wolverhamptonwanderers'],
      dtype='object')

In [8]:
# Preview the first rows without the target column and without 'Unnamed: 0'
# (presumably a leftover index column from an earlier CSV export -- confirm).
(
    player_df
    .drop(columns=['target_value', 'Unnamed: 0'])
    .head()
)

Unnamed: 0_level_0,full_name,minutes,goals_scored,assists,bps,selected_by_percent,previous_start_value,delta_value,position_DEF,position_FWD,position_GK,position_MID,teams_arsenal,teams_astonvilla,teams_bournemouth,teams_brentford,teams_brighton&hovealbion,teams_burnley,teams_cardiffcity,teams_chelsea,teams_crystalpalace,teams_everton,teams_fulham,teams_huddersfieldtown,teams_leedsunited,teams_leicestercity,teams_liverpool,teams_manchestercity,teams_manchesterunited,teams_newcastleunited,teams_norwichcity,teams_nottinghamforest,teams_sheffieldunited,teams_southampton,teams_stokecity,teams_swanseacity,teams_tottenhamhotspur,teams_watford,teams_westbromwichalbion,teams_westhamunited,teams_wolverhamptonwanderers
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
7-2017-18,héctorbellerín,3050.0,2.0,4.0,656.0,10.3,60.0,-1.0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11-2017-18,robholding,819.0,0.0,1.0,196.0,0.7,50.0,-6.0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16-2017-18,mesutözil,2161.0,4.0,9.0,506.0,2.1,95.0,-2.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18-2017-18,aaronramsey,1844.0,7.0,10.0,503.0,4.5,70.0,-2.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21-2017-18,alexiwobi,1827.0,3.0,5.0,426.0,1.0,55.0,-4.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Train a CatBoost regressor that predicts `target_value` from the remaining
# columns of player_df.
# 'Unnamed: 0' and the free-text 'full_name' are dropped together with the
# target so that only model inputs remain in `features`.
# NOTE(review): 'previous_start_value' and 'delta_value' stay in the feature
# set -- confirm neither is derived from 'target_value' (possible leakage).
features = player_df.drop(columns=['target_value', 'Unnamed: 0', 'full_name'])
target = player_df['target_value']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# NOTE(review): no eval_set / early stopping is configured, so all 1000
# iterations are always trained; consider a validation split if the gap
# between train and test error is large.
model = CatBoostRegressor(iterations=1000,
                          learning_rate=0.1,
                          depth=5,
                          loss_function='RMSE',  # training log reports RMSE, not MSE
                          random_seed=42)

# verbose=100 logs the training metric every 100 iterations.
model.fit(X_train, y_train, verbose=100)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Report MSE (as before) and also its square root, so the test error can be
# compared directly with the RMSE values printed in the training log.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

0:	learn: 12.5432798	total: 62.8ms	remaining: 1m 2s
100:	learn: 4.3007302	total: 120ms	remaining: 1.07s
200:	learn: 3.6826772	total: 181ms	remaining: 721ms
300:	learn: 3.2359065	total: 243ms	remaining: 563ms
400:	learn: 2.8554475	total: 307ms	remaining: 459ms
500:	learn: 2.5718966	total: 368ms	remaining: 366ms
600:	learn: 2.2985027	total: 430ms	remaining: 285ms
700:	learn: 2.0906864	total: 490ms	remaining: 209ms
800:	learn: 1.9476992	total: 553ms	remaining: 137ms
900:	learn: 1.7819149	total: 612ms	remaining: 67.3ms
999:	learn: 1.6715958	total: 673ms	remaining: 0us
Mean Squared Error: 39.23076014747225


In [None]:
# Display the model's predictions for the held-out test rows.
y_pred

In [None]:
# Display the true target values of the held-out test rows for comparison.
y_test

In [None]:
# Score the model on the rows it was fitted on; comparing this with the
# held-out error shows how much the model overfits.
y_pred_train = model.predict(X_train)

# Keep the result in its own variable so the test-set `mse` computed earlier
# is not overwritten, and label the output so the two prints are
# distinguishable.
mse_train = mean_squared_error(y_train, y_pred_train)
print(f"Train Mean Squared Error: {mse_train}")

In [None]:
# Print the held-out target values as plain text (same Series as the bare
# `y_test` cell earlier in the notebook).
print(y_test)

In [13]:
# Persist the fitted model with pickle so it can be reloaded without retraining.
model_path = '../models/catboostv1.pkl'
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when '../models' does not exist yet (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

In [15]:
# Read the pickled model back from disk into `loaded_model`.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by
# this project.
with open('../models/catboostv1.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)