In [1]:
import numpy as np
import pandas as pd

In [None]:
import orjson as json
import tqdm
import multiprocessing as mp
import gc

In [None]:
from catboost import Pool, CatBoostClassifier

In [2]:
from features import game_features
from team_features import team_features
from permutation import permute_players, permute_teams

In [5]:
def job(line):
    """Turn one raw JSON line into its feature record.

    The line is decoded with ``json.loads`` and the decoded object is handed
    straight to ``game_features``; whatever that returns is returned here.
    Kept as a module-level function so it can be used as a pool worker.
    """
    return game_features(json.loads(line))

def mp_game_features(path_frm, path_to):
    """Compute features for every line of a file in parallel and save them as CSV.

    Each line of ``path_frm`` is passed to ``job`` in a process pool with one
    worker per CPU. The collected results are turned into a DataFrame, written
    to ``path_to`` without the index, and returned.

    Args:
        path_frm: path of the input file, read line by line.
        path_to: path of the CSV file to write.

    Returns:
        The DataFrame built from the per-line results.
    """
    # Context managers release the file handle and the worker processes even
    # when a worker raises; the workers are also gone before the DataFrame is
    # built, so they do not hold memory during the CSV export.
    with open(path_frm) as lines, mp.Pool(mp.cpu_count()) as pool:
        features = pool.map(job, lines)
    df = pd.DataFrame(features)
    df.to_csv(path_to, index=False)
    return df

In [None]:
# Extract features for the training matches and write them to CSV.
train_df = mp_game_features(
    path_frm='../data/train_matches.jsonl',
    path_to='../data/train_features_total.csv',
)

Exception in thread Thread-7:
Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 412, in _handle_workers
    pool._maintain_pool()
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 248, in _maintain_pool
    self._repopulate_pool()
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 241, in _repopulate_pool
    w.start()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 112, in start
    self._popen = self._Popen(self)
  File "/usr/lib/python3.7/multiprocessing/context.py", line 277, in _Popen
    return Popen(process_obj)
  File "/usr/lib/python3.7/multiprocessing/popen_fork.py", line 20, in __init__
    self._launch(process_obj)
  File "/usr/lib/python3.7/multiprocessing/popen_fork.py", line 70, in _launch
    self.pid = os.fork(

In [None]:
# Extract features for the test matches and write them to CSV.
test_df = mp_game_features(
    path_frm='../data/test_matches.jsonl',
    path_to='../data/test_features_total.csv',
)

In [None]:
# Derive team-level features from the training feature table and save them to CSV.
train_team_df = team_features(
    train_df,
    agg=[np.min, np.median, np.max],
    remove_hero_features=True,
    # (sic) spelling presumably matches the parameter name in team_features — verify before renaming
    calcuate_differences=True,
    remove_team_features=True,
)
train_team_df.to_csv('../data/train_team_features.csv', index=False)

In [None]:
# Derive team-level features from the test feature table and save them to CSV.
test_team_df = team_features(
    test_df,
    agg=[np.min, np.median, np.max],
    remove_hero_features=True,
    # (sic) spelling presumably matches the parameter name in team_features — verify before renaming
    calcuate_differences=True,
    remove_team_features=True,
)
test_team_df.to_csv('../data/test_team_features.csv', index=False)

In [None]:
# Build a column -> dtype map used when re-reading the feature CSVs:
# int64 columns are read as int16 and float64 columns as float16.
# NOTE(review): int16/float16 have a narrow range and precision — confirm the
# feature values fit before relying on the downcast.
#
# The builtin `object` replaces `np.object`, a deprecated alias that was
# removed in NumPy 1.24; it selects the same columns.
object_cols = train_df.select_dtypes(include=[object]).columns
int_cols = train_df.select_dtypes(include=[np.int64]).columns
float_cols = train_df.select_dtypes(include=[np.float64]).columns
dtype_map = {col: np.int16 for col in int_cols}
dtype_map.update({col: np.float16 for col in float_cols})

In [None]:
# Reload the train/test feature tables from CSV using the dtypes in dtype_map,
# then load the training targets and the sample submission.
X, X_test = (
    pd.read_csv(csv_path, dtype=dtype_map)
    for csv_path in (
        '../data/train_features_total.csv',
        '../data/test_features_total.csv',
    )
)
Y = pd.read_csv('../data/train_targets.csv')
submission_df = pd.read_csv('../data/sample_submission.csv')

In [None]:
# Rebuild the column -> dtype map, this time from the team feature table
# (this overwrites the previous object_cols / int_cols / float_cols / dtype_map):
# int64 columns are read as int16 and float64 columns as float16.
# NOTE(review): int16/float16 have a narrow range and precision — confirm the
# feature values fit before relying on the downcast.
#
# The builtin `object` replaces `np.object`, a deprecated alias that was
# removed in NumPy 1.24; it selects the same columns.
object_cols = train_team_df.select_dtypes(include=[object]).columns
int_cols = train_team_df.select_dtypes(include=[np.int64]).columns
float_cols = train_team_df.select_dtypes(include=[np.float64]).columns
dtype_map = {col: np.int16 for col in int_cols}
dtype_map.update({col: np.float16 for col in float_cols})

In [None]:
# Replace the in-memory team feature tables with copies re-read from CSV
# using the dtypes in dtype_map.
train_team_df, test_team_df = (
    pd.read_csv(csv_path, dtype=dtype_map)
    for csv_path in (
        '../data/train_team_features.csv',
        '../data/test_team_features.csv',
    )
)

In [None]:
# Keep only the team-table columns that are not already present in X.
# Iterating a DataFrame yields its column labels, and `label in X` tests
# membership in X's columns. Both selections are filtered against X.
X_team_unique = train_team_df[[label for label in train_team_df if label not in X]]
X_team_test = test_team_df[[label for label in test_team_df if label not in X]]

In [None]:
# Place the team-only columns next to the per-match features, column-wise.
X_combined = pd.concat((X, X_team_unique), axis='columns')
X_test_combined = pd.concat((X_test, X_team_test), axis='columns')

In [None]:
# Drop the intermediate frames now that the combined tables exist, then force
# a garbage collection pass.
# `X_team` was removed from this list: no cell in this notebook defines it, so
# `del` raised NameError part-way through and the names after it were never freed.
del X, X_team_unique, X_team_test, X_test, train_df, test_df
gc.collect()

In [None]:
# Stage 1: apply permute_players to the combined features and targets.
X_enh, Y_enh = permute_players(X_combined, Y)
# The stage-1 inputs are dropped before stage 2 runs.
del X_combined
del Y

# Stage 2: apply permute_teams to the stage-1 output, then drop that output too.
X_enh2, Y_enh2 = permute_teams(X_enh, Y_enh)
del X_enh
del Y_enh

In [None]:
# Show the shapes of the augmented features and targets on one line.
print(*(table.shape for table in (X_enh2, Y_enh2)))

In [None]:
def train_and_predict(n_iterations, path_format='total_team_combined_{}.csv'):
    """Train a CatBoost classifier and write test-set predictions to CSV.

    Uses the notebook globals ``X_enh2``/``Y_enh2`` (training data),
    ``X_test_combined`` (test features), ``categorical_columns`` and
    ``submission_df``. The second column of ``predict_proba`` is stored in
    ``submission_df['radiant_win_prob']`` and the frame is written to
    ``path_format.format(n_iterations)`` without the index.

    NOTE(review): ``categorical_columns`` is not defined in the visible part of
    this notebook — confirm it exists before calling.

    Args:
        n_iterations: passed to CatBoost as ``n_estimators`` and substituted
            into ``path_format``.
        path_format: ``str.format`` template for the output CSV path.

    Returns:
        A copy of ``submission_df`` with the predictions filled in, so results
        of several runs can be kept without aliasing the shared global frame.
    """
    params = {
          'random_seed': 42,
          'n_estimators': n_iterations,
          'task_type': 'GPU',
          'verbose': 500,
          'one_hot_max_size': 130,
          }
    model = CatBoostClassifier(**params)
    model.fit(X_enh2, Y_enh2, verbose=500, cat_features=categorical_columns)
    test_pool = Pool(X_test_combined, cat_features=categorical_columns)
    # Item assignment always writes a column; attribute assignment would only
    # set a plain Python attribute if the column did not already exist.
    submission_df['radiant_win_prob'] = model.predict_proba(test_pool)[:, 1]
    submission_df.to_csv(path_format.format(n_iterations), index=False)
    return submission_df.copy()

In [None]:
# Store the radiant_win target as 32-bit floats.
Y_enh2.radiant_win = Y_enh2.radiant_win.astype('float32')

In [None]:
# `params` only exists as a local variable inside train_and_predict, so it is
# defined here to keep this cell runnable on a fresh kernel. The values mirror
# the dict in train_and_predict.
# NOTE(review): 5000 iterations is inferred from the '5k' in the
# 'total_team_combined_5k.csv' file this notebook writes — confirm.
params = {
    'random_seed': 42,
    'n_estimators': 5000,
    'task_type': 'GPU',
    'verbose': 500,
    'one_hot_max_size': 130,
}
model = CatBoostClassifier(**params)

In [None]:
# Fit on the augmented training data, logging every 500 iterations.
# NOTE(review): categorical_columns is not defined in the visible part of this
# notebook — confirm it exists before running this cell.
model.fit(
    X_enh2,
    Y_enh2,
    cat_features=categorical_columns,
    verbose=500,
)

In [None]:
# Wrap the combined test features in a CatBoost Pool, using the same
# categorical columns as were passed to fit.
test_pool = Pool(
    X_test_combined,
    cat_features=categorical_columns,
)

In [None]:
# Store the second column of predict_proba as the submission probability.
# Item assignment always writes a column; attribute assignment would only set
# a plain Python attribute (and lose the predictions on export) if the
# 'radiant_win_prob' column did not already exist in submission_df.
submission_df['radiant_win_prob'] = model.predict_proba(test_pool)[:, 1]

In [None]:
# Write the filled-in submission to disk without the index column.
submission_df.to_csv(
    path_or_buf='total_team_combined_5k.csv',
    index=False,
)