In [6]:
# !pip install catboost
# !pip install optuna

In [7]:
# import pandas as pd


# kaggle_dota_path = '/content/sample_data/dotaset_kaggle.csv'
# df_kaggle = pd.read_csv(kaggle_dota_path)
# df_kaggle['match_id'] = df_kaggle.index
# output_file_path = '/content/modified_dotaset_kaggle.csv'
# df_kaggle.to_csv(output_file_path, index=False)

# print(f"Modified file saved to {output_file_path}")

In [8]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import glob
import optuna

file_paths = glob.glob('/content/*.csv')
# file_paths = ['/content/dota9_81.csv']

dfs = [pd.read_csv(file) for file in file_paths]
df = pd.concat(dfs, ignore_index=True)
df_unique = df.drop_duplicates(subset='match_id')

output_file_path = '/content/sample_data/combined_dota.csv'
df_unique.to_csv(output_file_path, index=False)

In [9]:
df_unique.shape

(2600, 12)

In [None]:
X = df_unique.drop(columns=['match_id', 'radiant_win'])
y = df_unique['radiant_win']

first_five_cols = ['0', '1', '2', '3', '4']
last_five_cols = ['5', '6', '7', '8', '9']


ohe = OneHotEncoder(categories='auto', sparse=False)
ohe.fit(X[first_five_cols + last_five_cols].values.reshape(-1, 1))

first_five_encoded = ohe.transform(X[first_five_cols].values.reshape(-1, 1)).reshape(X.shape[0], -1, len(ohe.categories_[0])).sum(axis=1)
last_five_encoded = ohe.transform(X[last_five_cols].values.reshape(-1, 1)).reshape(X.shape[0], -1, len(ohe.categories_[0])).sum(axis=1)
combined_encoded = first_five_encoded - last_five_encoded

encoded_columns = [f'hero_{cat}' for cat in ohe.categories_[0]]
encoded_df = pd.DataFrame(combined_encoded, columns=encoded_columns)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(encoded_df, y, test_size=0.2, random_state=42)

def objective(trial):
    param = {
          "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
          "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
          "depth": trial.suggest_int("depth", 1, 12),
          "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
          "bootstrap_type": trial.suggest_categorical(
              "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
          ),
          'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-4, 1e-1, log=True),
          "used_ram_limit": "3gb",
          'random_strength': 42,
          'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
          'border_count': trial.suggest_int('border_count', 32, 255),
          'eval_metric': 'Logloss',
      }
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    model = CatBoostClassifier(**param, verbose=0)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=0)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=600)

print("Best parameters:", study.best_params)

best_params = study.best_params
model = CatBoostClassifier(**best_params, eval_metric='Logloss', verbose=0)
model.fit(X_train, y_train)

model_file_path = 'catboost_model_pipeline.joblib'
joblib.dump(model, model_file_path)
encoder_file_path = 'onehot_encoder.joblib'
joblib.dump(ohe, encoder_file_path)



[I 2024-06-07 17:02:41,001] A new study created in memory with name: no-name-4109914d-a1a6-4618-9b2c-fc0cc4c89782
[I 2024-06-07 17:02:41,550] Trial 0 finished with value: 0.5115384615384615 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.0331508754950669, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'l2_leaf_reg': 0.0020515268049833848, 'learning_rate': 0.003982902648654658, 'border_count': 182}. Best is trial 0 with value: 0.5115384615384615.
[I 2024-06-07 17:02:43,644] Trial 1 finished with value: 0.5096153846153846 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.051445657637614, 'depth': 6, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'l2_leaf_reg': 0.011850547704187638, 'learning_rate': 0.002023501832461677, 'border_count': 134, 'bagging_temperature': 4.72810153455811}. Best is trial 0 with value: 0.5115384615384615.
[I 2024-06-07 17:02:50,453] Trial 2 finished with value: 0.5076923076923077 and parameters: {'objective'

Best parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.08636970893056112, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'l2_leaf_reg': 0.0005739849411010852, 'learning_rate': 0.008214995349459315, 'border_count': 221}


['onehot_encoder.joblib']

In [14]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 50
Best trial:
  Value: 0.5403846153846154
  Params: 
    objective: Logloss
    colsample_bylevel: 0.08636970893056112
    depth: 5
    boosting_type: Ordered
    bootstrap_type: MVS
    l2_leaf_reg: 0.0005739849411010852
    learning_rate: 0.008214995349459315
    border_count: 221


In [12]:
loaded_model_pipeline = joblib.load(model_file_path)
loaded_encoder = joblib.load(encoder_file_path)

y_pred = loaded_model_pipeline.predict(X_test)
y_pred_proba = loaded_model_pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.55
Precision: 0.5380710659898477
Recall: 0.803030303030303
F1 Score: 0.6443768996960486


In [15]:
new_data = {'0': 'axe', '1': 'wraith_king', '2': 'shadow_shaman', '3': 'centaur_warrunner', '4': 'sand_king',
            '5': 'naga_siren', '6': 'lone_druid', '7': 'batrider', '8': 'pangolier', '9': 'morphling'}
new_data_df = pd.DataFrame([new_data])

new_first_five_encoded = loaded_encoder.transform(new_data_df[first_five_cols].values.reshape(-1, 1)).reshape(new_data_df.shape[0], -1, len(loaded_encoder.categories_[0])).sum(axis=1)
new_last_five_encoded = loaded_encoder.transform(new_data_df[last_five_cols].values.reshape(-1, 1)).reshape(new_data_df.shape[0], -1, len(loaded_encoder.categories_[0])).sum(axis=1)
new_combined_encoded = new_first_five_encoded - new_last_five_encoded

new_encoded_df = pd.DataFrame(new_combined_encoded, columns=encoded_columns)


prediction = loaded_model_pipeline.predict_proba(new_encoded_df)[:, 1]
print("Prediction for new data:", prediction)

Prediction for new data: [0.63084416]
