In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [8]:
df = pd.read_excel("../input/dataset/All_metrics.xlsx")

In [9]:
df.rename(columns={"hist_player_score": "hist_player_performance"}, inplace=True)
df.index = df.game_id

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Drop 'game_id'
df = df.drop('game_id', axis=1)

# Set binary column based on 'first' values
df['first_bot'] = df['first'].isin(["HastyBot", "BetterBot", "STEEBot"]).astype(int)
df = df.drop('first', axis=1)

# One-hot encode categorical columns
categorical_columns = ['time_control_name', 'game_end_reason', 'lexicon', 'rating_mode', 'Bot_nickname']
df = pd.get_dummies(df, columns=categorical_columns)

# Drop 'Player_nickname' and 'created_at'
df = df.drop(['Player_nickname', 'created_at', "Bot_game", 'Player_game'], axis=1)

# Move 'Player_rating' to the last column
cols = [col for col in df.columns if col != 'Player_rating'] + ['Player_rating']
df = df[cols]

df

Unnamed: 0_level_0,initial_time_seconds,increment_seconds,max_overtime_minutes,game_duration_seconds,Bot_turns_count,Player_turns_count,Bot_max_point,Player_max_point,Bot_min_point,Player_min_point,...,lexicon_CSW21,lexicon_ECWL,lexicon_NSWL20,lexicon_NWL20,rating_mode_CASUAL,rating_mode_RATED,Bot_nickname_BetterBot,Bot_nickname_HastyBot,Bot_nickname_STEEBot,Player_rating
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
43828,15,0,1,61.405180,16,16,72,14,0,-10,...,True,False,False,False,False,True,False,True,False,1979.0
5765,1200,0,1,1268.252752,11,12,92,35,16,-10,...,True,False,False,False,False,True,True,False,False,1447.0
40264,1200,0,1,1262.786704,12,12,74,36,8,-10,...,True,False,False,False,False,True,True,False,False,1408.0
44376,1200,0,1,1209.746654,15,16,102,75,8,-10,...,True,False,False,False,False,True,True,False,False,1372.0
32633,1200,0,1,1231.166683,15,16,85,85,6,-10,...,True,False,False,False,False,True,True,False,False,1367.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7264,1200,0,1,683.258274,13,14,77,78,10,0,...,True,False,False,False,False,True,False,True,False,1855.0
66022,1380,0,1,626.274697,13,13,78,66,0,0,...,False,False,False,True,False,True,False,False,True,1641.0
2581,1380,0,1,775.765387,12,11,70,76,13,17,...,False,False,False,True,False,True,False,False,True,1637.0
54204,1200,0,1,624.940244,14,14,70,73,0,0,...,False,False,False,True,False,True,False,False,True,1626.0


In [11]:
train = df[df["Player_rating"].isnull()==False]
test = df[df["Player_rating"].isnull()==True]

In [12]:
train.shape

(43424, 63)

In [13]:
test.shape

(22363, 63)

In [14]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Assuming df_selected is your DataFrame
train_X = train.drop('Player_rating', axis=1)
train_y = train['Player_rating']

# Create a pipeline with an XGBRegressor
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBRegressor())
])

# Define parameter grid for grid search
param_grid = {
    'xgb__n_estimators': [50, 100, 200],
    'xgb__max_depth': [3, 6, 9, 15, 20],
    'xgb__learning_rate': [0.05, 0.1, 0.2]
}

# Define RMSE as the scoring metric for GridSearchCV
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(np.mean((y_true - y_pred)**2)))

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring=rmse_scorer, n_jobs=-1)

# Fit the model
grid_search.fit(train_X, train_y)

# Get the best parameters
best_params_xgb = grid_search.best_params_
print("Best Parameters:", best_params_xgb)

# Get the best model
best_model_xgb = grid_search.best_estimator_
print("Best Model:", best_model_xgb)

# Get cross-validation scores
print("Running cross-validation. This may take some time...")
kf = KFold(n_splits=10, shuffle=True, random_state=42)  # Specify the same number of splits as the GridSearchCV
fold = 1
for train_index, test_index in kf.split(train_X):
    X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
    y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]

    best_model_xgb.fit(X_train, y_train)
    y_pred = best_model_xgb.predict(X_test)

    fold_rmse = np.sqrt(np.mean((y_test - y_pred)**2))
    print(f"Fold {fold}: RMSE = {fold_rmse}")
    fold += 1



Best Parameters: {'xgb__learning_rate': 0.05, 'xgb__max_depth': 20, 'xgb__n_estimators': 50}
Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, device=None,
                              early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=0.05,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=20, max_leaves=None,
                              min_child_weight=None, miss

In [15]:
test_X = test.drop('Player_rating', axis=1)
test_y = test['Player_rating']

In [16]:
test_X.shape

(22363, 62)

In [17]:
test_y.shape

(22363,)

In [18]:
test_X["Predictions"] = best_model_xgb.predict(test_X)

In [19]:
test_X["game_id"] = test_X.index
test_X

Unnamed: 0_level_0,initial_time_seconds,increment_seconds,max_overtime_minutes,game_duration_seconds,Bot_turns_count,Player_turns_count,Bot_max_point,Player_max_point,Bot_min_point,Player_min_point,...,lexicon_ECWL,lexicon_NSWL20,lexicon_NWL20,rating_mode_CASUAL,rating_mode_RATED,Bot_nickname_BetterBot,Bot_nickname_HastyBot,Bot_nickname_STEEBot,Predictions,game_id
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51857,1200,0,1,632.083319,15,14,74,93,4,6,...,False,False,False,False,True,True,False,False,1611.551147,51857
31730,1200,0,1,256.453327,15,14,93,12,10,2,...,False,False,False,False,True,False,True,False,1557.952271,31730
50150,1200,0,1,245.164982,18,17,76,24,0,2,...,False,False,False,False,True,False,True,False,1612.906982,50150
18710,780,0,1,798.728617,13,14,88,31,11,-10,...,False,False,False,True,False,False,False,True,1668.247070,18710
46342,780,0,1,841.983634,14,15,82,72,2,-29,...,False,False,False,True,False,False,False,True,1655.192505,46342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5506,900,0,1,471.077622,13,12,74,77,6,4,...,False,False,False,False,True,False,True,False,2048.639648,5506
26454,900,0,1,421.667981,12,12,81,71,6,4,...,False,False,False,False,True,False,True,False,2049.273682,26454
14102,900,0,1,295.437074,13,13,108,72,14,4,...,False,False,False,False,True,False,True,False,2038.825684,14102
69347,900,0,1,248.825438,12,12,77,87,14,0,...,False,False,False,False,True,False,True,False,2052.944336,69347


In [21]:
final_test_results = test_X[["game_id", "Predictions"]]
final_test_results.rename(columns={"Predictions": "rating"}, inplace=True)
final_test_results_reset = final_test_results.reset_index(drop=True)
final_test_results_sorted = final_test_results_reset.sort_values(by='game_id', ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_test_results.rename(columns={"Predictions": "rating"}, inplace=True)


In [22]:
csv_path = '/kaggle/working/final_result_xgb_all.csv'
final_test_results_sorted.to_csv(csv_path, index=False)