In [52]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the download stream per read() call.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries; each entry is
# downloaded and unpacked into KAGGLE_INPUT_PATH/<directory> by the loop further down.
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires parameters, so it presumably
# stops working after that window — confirm before relying on this cell.
DATA_SOURCE_MAPPING = 'bi-ml-competition-2023:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F49431%2F5228520%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240311%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240311T140336Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D0439a6647118ef93fac3b8d2f72135cfd433fa2ff3d0992a9bb24360bf7e359ce6346e0acb775939df8562e3d5ce4efb67bf65176232bd7293a8398aa85ebc4751da621a3cacef50ff5146703ffdff98d954899ee2baa8cf917f55500c0917d2f532b56b4417aa1ec82004552736d80fdc551ea663815efdb030039529fb4bbe7f4cfc7a9d859ef8ec998a3d5e8792b96eedf04e359fa2cc87bc5a691514009a2b428db00d65db322390551a6651afabef44689b1d40bfd1670fa5ff224f377ce4821e5f3b4f1a39b14007b4a62c1855d6e2f7348d17370c1dca2a4a17aa19cfeb7286b7427a95c9ca95d1190483075c33f893db9fc8526f59e53fee57f34673'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<URL-encoded URL>" entry of DATA_SOURCE_MAPPING and unpack
# the archive into KAGGLE_INPUT_PATH/<directory>. A failed source is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only: the directory name comes first, the rest is the URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length may be absent; the progress bar then stays empty and only
            # the byte counter advances.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure every byte is written and rewind before reading the archive back.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name: binding the archive to `tarfile` would shadow the
                # module and break the next tar source.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        print(f'Reason: {e}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        print(f'Reason: {e}')
        continue

print('Data source import complete.')


Failed to load https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/49431/5228520/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240311%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240311T140336Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=0439a6647118ef93fac3b8d2f72135cfd433fa2ff3d0992a9bb24360bf7e359ce6346e0acb775939df8562e3d5ce4efb67bf65176232bd7293a8398aa85ebc4751da621a3cacef50ff5146703ffdff98d954899ee2baa8cf917f55500c0917d2f532b56b4417aa1ec82004552736d80fdc551ea663815efdb030039529fb4bbe7f4cfc7a9d859ef8ec998a3d5e8792b96eedf04e359fa2cc87bc5a691514009a2b428db00d65db322390551a6651afabef44689b1d40bfd1670fa5ff224f377ce4821e5f3b4f1a39b14007b4a62c1855d6e2f7348d17370c1dca2a4a17aa19cfeb7286b7427a95c9ca95d1190483075c33f893db9fc8526f59e53fee57f34673 to path /kaggle/input/bi-ml-competition-2023
Data source import complete.


# <center> Предсказание победителя в Dota 2


In [54]:
import os
import json
import pandas as pd
import numpy as np
import datetime
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score
from sklearn.ensemble import (RandomForestClassifier,
                              VotingClassifier)
from sklearn.metrics import roc_auc_score, accuracy_score
import lightgbm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import catboost
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [55]:
# Notebook-wide settings: random seed passed to the splitters and models,
# seaborn grid style, default figure size, and silenced warnings.
SEED = 10801
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)
warnings.filterwarnings(action="ignore")

## <left>Описание данных

Файлы:

- `sample_submission.csv`: пример файла-посылки
- `train_raw_data.jsonl`, `test_raw_data.jsonl`: "сырые" данные
- `train_data.csv`, `test_data.csv`: признаки, созданные авторами
- `train_targets.csv`: результаты тренировочных игр

## <left>Описание признаков
    
Набор простых признаков, описывающих игроков и команды в целом

In [56]:
# Directory the competition files are read from.
# NOTE(review): the download cell at the top unpacks into
# /kaggle/input/bi-ml-competition-2023 — confirm this dataset directory exists.
PATH_TO_DATA = "/kaggle/input/copy-of-23-24-ml/"

train_features_path = os.path.join(PATH_TO_DATA, "train_data.csv")
train_targets_path = os.path.join(PATH_TO_DATA, "train_targets.csv")

# Both tables are indexed by the match hash.
df_train_features = pd.read_csv(train_features_path, index_col="match_id_hash")
df_train_targets = pd.read_csv(train_targets_path, index_col="match_id_hash")

## <left>Наша первая модель

In [57]:
# Feature matrix and 0/1 target as plain NumPy arrays.
# NOTE(review): assumes both tables list matches in the same order — confirm.
X = df_train_features.to_numpy()
radiant_win = df_train_targets["radiant_win"]
y = radiant_win.to_numpy().astype("int8")

In [58]:
# Hold out 30% of the matches for validation; the seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=SEED,
    test_size=0.3,
)

#### Обучим случайный лес

In [7]:
# Baseline: 300 trees of depth at most 7, trained on all CPU cores.
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=7,
    random_state=SEED,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)

#### Сделаем предсказания и оценим качество на отложенной части данных

In [8]:
# Keep only the probability of the positive class (second column).
valid_proba = rf_model.predict_proba(X_valid)
y_pred = valid_proba[:, 1]

In [15]:
# ROC-AUC of the predicted probabilities on the hold-out part.
valid_score = roc_auc_score(y_true=y_valid, y_score=y_pred)
print("ROC-AUC score на отложенной части:", valid_score)

ROC-AUC score на отложенной части: 0.7754387258058622


Посмотрим на accuracy:

In [16]:
# Turn probabilities into labels with a 0.5 threshold before scoring accuracy.
y_pred_labels = y_pred > 0.5
valid_accuracy = accuracy_score(y_valid, y_pred_labels)
print("Accuracy score (p > 0.5) на отложенной части:", valid_accuracy)

Accuracy score (p > 0.5) на отложенной части: 0.6885383806519453


## <left>Кросс-валидация

Во многих случаях кросс-валидация оказывается надёжнее простого разбиения на train и test. Воспользуемся `ShuffleSplit`, чтобы создать 5 случайных разбиений данных в пропорции 70%/30%.

In [59]:
# Five random 70/30 splits, reproducible through the shared seed.
cv = ShuffleSplit(
    n_splits=5,
    random_state=SEED,
    test_size=0.3,
)

In [20]:
# Same forest configuration as before, now scored with ROC-AUC on every CV split.
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=7,
    random_state=SEED,
    n_jobs=-1,
)
cv_scores_rf = cross_val_score(rf_model, X, y, scoring="roc_auc", cv=cv)

In [21]:
# Display the per-split ROC-AUC scores of the random forest.
cv_scores_rf

array([0.77543873, 0.77343884, 0.76649967, 0.7722681 , 0.77246001])

In [104]:
# Average the per-split scores into a single number.
mean_rf_auc = cv_scores_rf.mean()
print(f"Среднее значение ROC-AUC на кросс-валидации: {mean_rf_auc}")

Среднее значение ROC-AUC на кросс-валидации: 0.7720210676055513


#### Сделаем чтение файла с сырыми данными и добавление новых признаков удобным

In [60]:
# Optional accelerators. Each import falls back to something that keeps the notebook
# runnable when the package is missing.
try:
    import ujson as json
except ImportError:
    import json
    print ("Подумайте об установке ujson, чтобы работать с JSON объектами быстрее")

try:
    # tqdm.auto picks the notebook widget when it is usable and the console bar otherwise.
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """No-op stand-in for tqdm: accepts the same call (e.g. total=...) and returns the iterable."""
        return iterable
    print ("Подумайте об установке tqdm, чтобы следить за прогрессом")


def read_matches(matches_file, total_matches=31698, n_matches_to_read=None):
    """
    Lazily read match records from a JSON Lines file.

    Arguments
    ---------
    matches_file: path to the file with raw data, one JSON object per line
    total_matches: expected number of matches; used only as the progress-bar total
    n_matches_to_read: maximum number of matches to yield; None reads the whole file

    Result
    ------
    Yields the decoded record of each match, in file order
    """

    n_read = 0
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(matches_file, encoding="utf-8") as fin:
        for line in tqdm(fin, total=total_matches):
            # Apply the limit only when one was requested. The progress-bar total must
            # not cap the read, or files longer than total_matches get silently truncated.
            if n_matches_to_read is not None and n_read >= n_matches_to_read:
                break
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            n_read += 1
            yield json.loads(line)

## <left>Feature engineering

Добавил новые фичи:
1) `team_diff_KDA` 
    Посчитал KDA для двух команд (сумма KDA игроков команды) и нашёл разницу. Этот признак отражает картину игры на текущий момент времени, так как показывает число убийств и ассистов на одну смерть сразу для всей команды.
    
2) `runes_picked_by_radiant` и `roshans_killed_by_radiant`
    Два признака, которые отражают преимущество в определённый момент времени. Считал долю всех подобранных рун и убитых Рошанов, приходящуюся на игроков сил Света.
    
3) `mean_max_mana_diff`
    Эта фича отражает разницу среднего максимального запаса маны между командами, что также имеет смысл: команда, у которой маны больше, получает преимущество.


In [61]:
def _team_kda(players):
    """Sum per-player (kills + assists) / deaths; a player with no deaths contributes kills + assists."""
    total = 0
    for player in players:
        takedowns = player['kills'] + player['assists']
        total += (takedowns / player['deaths']) if player['deaths'] != 0 else takedowns
    return total


def _radiant_share(per_player_values):
    """Fraction of the overall sum that comes from the first five entries; 0 when the sum is zero."""
    total = sum(per_player_values)
    return (sum(per_player_values[0:5]) / total) if total != 0 else 0


def add_new_features(df_features, matches_file):
    """
    Add features computed from the raw match records to the feature table, in place.

    Arguments
    ---------
    df_features: feature table indexed by match hash; modified in place
    matches_file: JSON Lines file with raw data

    Result
    ------
    Returns None. Adds the columns radiant_tower_kills, dire_tower_kills,
    diff_tower_kills, team_diff_KDA, runes_picked_by_radiant,
    roshans_killed_by_radiant and mean_max_mana_diff. Rows whose match is absent
    from the raw file get NaN in a new column and keep their value in an existing one.
    Raw matches that are not in the table index are ignored.
    """
    match_ids = []
    rows = []
    for match in read_matches(matches_file):
        players = match["players"]
        # The first five players are treated as Radiant, the next five as Dire.
        radiant_players, dire_players = players[0:5], players[5:10]

        # Tower kills per team: team 2 is counted as Radiant, team 3 as Dire.
        tower_kill_teams = [objective["team"] for objective in match["objectives"]
                            if objective["type"] == "CHAT_MESSAGE_TOWER_KILL"]
        radiant_tower_kills = tower_kill_teams.count(2)
        dire_tower_kills = tower_kill_teams.count(3)

        runes = [player["rune_pickups"] for player in players]
        roshans = [player["roshans_killed"] for player in players]
        radiant_team_max_mana = [player["max_mana"] for player in radiant_players]
        dire_team_max_mana = [player["max_mana"] for player in dire_players]

        match_ids.append(match['match_id_hash'])
        rows.append({
            "radiant_tower_kills": radiant_tower_kills,
            "dire_tower_kills": dire_tower_kills,
            "diff_tower_kills": radiant_tower_kills - dire_tower_kills,
            "team_diff_KDA": _team_kda(radiant_players) - _team_kda(dire_players),
            "runes_picked_by_radiant": _radiant_share(runes),
            "roshans_killed_by_radiant": _radiant_share(roshans),
            "mean_max_mana_diff": np.mean(radiant_team_max_mana) - np.mean(dire_team_max_mana),
        })

    if not rows:
        return

    # Build the new columns once and assign them aligned on the index, instead of
    # writing seven scalars per match through .loc.
    new_features = pd.DataFrame(rows, index=match_ids, dtype="float64")
    # If a match hash repeats in the raw file, the last record wins.
    new_features = new_features[~new_features.index.duplicated(keep="last")]
    for column in new_features.columns:
        values = new_features[column].reindex(df_features.index)
        if column in df_features.columns:
            # Re-running on the same table: keep old values for matches not seen now.
            values = values.fillna(df_features[column])
        df_features[column] = values

### Подсчет скора ROC-AUC

In [64]:
# Скопируем таблицу с признаками
df_train_features_extended = df_train_features.copy()

# Добавим новые
add_new_features(df_train_features_extended,
                 os.path.join(PATH_TO_DATA,
                              "train_raw_data.jsonl"))

  0%|          | 0/31698 [00:00<?, ?it/s]

Здесь я выбирал наилучшую модель: сначала рассмотрел несколько вариантов ансамблей, и LightGBM оказался лучшим из них.

In [117]:
# LightGBM with default hyper-parameters, fitted on the extended feature table.
lgbm = lightgbm.LGBMClassifier(random_state=SEED)
lgbm.fit(X=df_train_features_extended, y=y)

[LightGBM] [Info] Number of positive: 16670, number of negative: 15028
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27492
[LightGBM] [Info] Number of data points in the train set: 31698, number of used features: 252
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.525901 -> initscore=0.103696
[LightGBM] [Info] Start training from score 0.103696


После этого я решил собрать свой ансамбль и, перебрав несколько вариантов, остановился на CatBoost + логистической регрессии.

In [65]:
# Members of the ensemble: a silent, seeded CatBoost and a liblinear logistic regression.
cat = catboost.CatBoostClassifier(random_seed=SEED, verbose=0)
lr = LogisticRegression(max_iter=10000, solver='liblinear')
base_models = [
    ("CAT", cat),
    ("LR", lr),
]


In [67]:
# Soft voting over the base models; the ensemble is fitted later.
voting_soft = VotingClassifier(
    estimators=base_models,
    voting='soft',
)

In [None]:
# Score the soft-voting ensemble on the extended features with the same CV splits,
# running the folds in parallel.
cv_scores_extended = cross_val_score(
    voting_soft,
    df_train_features_extended.values,
    y,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
)

In [69]:
# Mean ROC-AUC of the ensemble over the CV splits.
mean_extended_auc = cv_scores_extended.mean()
print(f"ROC-AUC на кросс-валидации для новых признаков: {mean_extended_auc}")

ROC-AUC на кросс-валидации для новых признаков: 0.8107825681333803


### Файл для посылки

In [31]:
# Refit the ensemble on every training match before predicting the test set.
voting_soft.fit(df_train_features_extended, y)

df_test_features = pd.read_csv(os.path.join(PATH_TO_DATA, "test_data.csv"),
                               index_col="match_id_hash")

# Build the same engineered columns for the test matches, on a copy.
df_test_features_extended = df_test_features.copy()
add_new_features(df_test_features_extended,
                 os.path.join(PATH_TO_DATA, "test_raw_data.jsonl"))

# The ensemble was fitted on a DataFrame, so predict on a DataFrame whose columns follow
# the training order. A missing feature raises KeyError here instead of silently feeding
# shifted columns to the model.
X_test = df_test_features_extended[df_train_features_extended.columns]
y_test_pred = voting_soft.predict_proba(X_test)[:, 1]

df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred},
                             index=df_test_features_extended.index)

# Timestamp the file name so repeated runs do not overwrite earlier submissions.
submission_filename = "submission_{}.csv".format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))

  0%|          | 0/31698 [00:00<?, ?it/s]

Файл посылки сохранен, как: submission_2024-04-04_11-32-07.csv


Это немного, но это честная работа)