In [1]:
import os
from dateutil.parser import parse
import json

def train_test_split(
    test_size: int = 100,
    ml_input_dir: str = "data/ml_input_output",
    valid_dir: str = "data/games_valid",
):
    """
    Load all ML input files, sort them by begin_at, and split into train/test.

    Every ``<game_id>.json`` in ``ml_input_dir`` supplies the "input" and
    "output" of one game; the file with the same name in ``valid_dir`` supplies
    that game's "begin_at" timestamp. Games are ordered by that timestamp and
    the last ``test_size`` of them are held out for testing.

    Args:
        test_size: number of most recent games to hold out. If fewer games
            than this exist, all of them end up in the test split.
        ml_input_dir: directory with the ML input/output JSON files.
        valid_dir: directory with the raw game JSON files.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``. The X lists hold one
        tuple per game (the game's "input"); the y lists hold the matching
        "output" values.

    Raises:
        ValueError: if ``test_size`` is negative.
        FileNotFoundError: if an ML input file has no matching file in ``valid_dir``.

    Side effects:
        Creates both directories if they do not exist yet.
    """
    if test_size < 0:
        raise ValueError(f"test_size must be non-negative, got {test_size}")

    os.makedirs(ml_input_dir, exist_ok=True)
    os.makedirs(valid_dir, exist_ok=True)

    games = []

    # os.listdir() returns names in arbitrary order. Sorting them first keeps
    # games that share a begin_at in the same relative order on every run
    # (the sort below is stable), so the split is reproducible.
    for filename in sorted(os.listdir(ml_input_dir)):
        if not filename.endswith(".json"):
            continue

        game_id = filename.split(".")[0]

        # Load ML input
        with open(os.path.join(ml_input_dir, filename), "r", encoding="utf-8") as f:
            ml_data = json.load(f)

        # Load raw valid game to get begin_at
        with open(os.path.join(valid_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
            raw_game = json.load(f)

        games.append({
            "input": ml_data["input"],
            "output": ml_data["output"],
            "begin_at": parse(raw_game["begin_at"]),
        })

    # Oldest game first, so the tail of the list is the most recent data.
    games.sort(key=lambda game: game["begin_at"])

    X = [tuple(game["input"]) for game in games]
    y = [game["output"] for game in games]

    # An explicit split index (instead of X[:-test_size]) also behaves
    # correctly for test_size == 0, where a negative-zero slice would put
    # every game in the test split.
    split_at = max(len(X) - test_size, 0)
    return X[:split_at], X[split_at:], y[:split_at], y[split_at:]

# Run the split once for the rest of the notebook. At this point X_train / X_test are
# plain lists of tuples and y_train / y_test plain lists (not NumPy arrays).
X_train, X_test, y_train, y_test = train_test_split()

In [None]:
import numpy as np
import logging
from sklearn.linear_model import LogisticRegression

# Module-level logger used by MLPipeline for its progress messages.
# NOTE(review): these are INFO-level records; unless logging is configured elsewhere
# they fall below Python's default WARNING threshold and will not be shown.
log = logging.getLogger(__name__)

class MLPipeline:
    """
    Logistic regression over signed bag-of-teams / bag-of-players features.

    Every sample is a flat row ``(team1_id, team2_id, *player_ids)``. ``fit``
    assigns one feature column to each team id and each player id found in the
    training rows. ``transform`` writes +1 for the first team id and for the
    first ``players_per_team`` player ids, and -1 for the second team id and
    for the remaining player ids. Ids that were not present during ``fit`` have
    no column and are silently ignored by ``transform``, so the feature width
    is always the one established by ``fit``.
    """

    # Class-level default, so instances created without __init__ still work.
    players_per_team: int = 5

    def __init__(self, players_per_team: int = 5):
        """
        Args:
            players_per_team: how many player ids at the start of each row get
                the +1 sign; every later player id in the row gets -1.
        """
        self.team_map: dict[int, int] = {}     # team_id -> column index
        self.player_map: dict[int, int] = {}   # player_id -> column index
        self.players_per_team = players_per_team
        self.model: LogisticRegression = LogisticRegression(solver="liblinear")
        log.info("⚙️ MLPipeline initialized")

    def fit(self, X: list[tuple], y: list[int]) -> "MLPipeline":
        """
        Build mapping from unique teams and players to column indices.
        Transform X to bag-of-teams and bag-of-players and train logistic regression.

        Args:
            X: rows of ``(team1_id, team2_id, *player_ids)``.
            y: one target per row of X.

        Returns:
            self, so calls can be chained.
        """
        unique_teams = set()
        unique_players = set()

        for t1, t2, *players in X:
            unique_teams.update((t1, t2))
            unique_players.update(players)

        # Sorted ids give a column layout that does not depend on set iteration order.
        self.team_map = {team_id: idx for idx, team_id in enumerate(sorted(unique_teams))}
        self.player_map = {player_id: idx for idx, player_id in enumerate(sorted(unique_players))}

        log.info(f"📝 Mappings created: {len(self.team_map)} teams, {len(self.player_map)} players")

        # Transform X into feature array
        X_transformed = self.transform(X)

        # Fit logistic regression
        self.model.fit(X_transformed, np.asarray(y))
        log.info("✅ Logistic regression trained on transformed X_train")
        return self

    def transform(self, X: list[tuple]) -> np.ndarray:
        """
        Encode rows as a signed indicator matrix using the mappings from ``fit``.

        Args:
            X: rows of ``(team1_id, team2_id, *player_ids)``.

        Returns:
            int8 array of shape ``(len(X), n_teams + n_players)``: the team
            columns first, then the player columns.
        """
        n_samples = len(X)

        bag_of_teams = np.zeros((n_samples, len(self.team_map)), dtype=np.int8)
        bag_of_players = np.zeros((n_samples, len(self.player_map)), dtype=np.int8)

        first_team_size = self.players_per_team

        for i, (t1, t2, *players) in enumerate(X):
            t1_idx = self.team_map.get(t1)
            t2_idx = self.team_map.get(t2)
            if t1_idx is not None:
                bag_of_teams[i, t1_idx] = 1
            # Written second, so -1 wins if both slots carry the same team id.
            if t2_idx is not None:
                bag_of_teams[i, t2_idx] = -1

            # One pass in row order: a later (-1) occurrence of an id overwrites
            # an earlier (+1) one, the same result as two separate passes.
            for position, p_id in enumerate(players):
                p_idx = self.player_map.get(p_id)
                if p_idx is not None:
                    bag_of_players[i, p_idx] = 1 if position < first_team_size else -1

        return np.hstack([bag_of_teams, bag_of_players])

    def predict_proba(self, X: list[tuple]) -> np.ndarray:
        """
        Return, for each row, column 1 of the model's ``predict_proba`` output
        (the probability of the class listed second in ``model.classes_``).
        """
        X_transformed = self.transform(X)
        return self.model.predict_proba(X_transformed)[:, 1]


ValueError: X has 2043 features, but LogisticRegression is expecting 5804 features as input.

ValueError: X has 2058 features, but LogisticRegression is expecting 5804 features as input.

In [22]:
# Test accuracy at a 0.5 threshold: share of rows where the thresholded probability equals y_test.
# NOTE(review): y_pred_proba is not assigned in any cell visible here — presumably the output
# of ml_pipeline.predict_proba(X_test); confirm.
((y_pred_proba > .5).astype(int) == y_test).mean()

np.float64(0.66)

In [24]:
# Inspect the fitted logistic-regression weights (one per feature column).
# NOTE(review): ml_pipeline is not created in any cell visible here.
ml_pipeline.model.coef_

array([[ 0.5799529 ,  0.35983474,  0.17797022, ..., -0.32962461,
        -0.32962461, -0.32962461]], shape=(1, 5804))

In [22]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and random feature subsets, so without a fixed
# seed the reported accuracy changes on every re-run of the notebook.
# NOTE(review): X_train_encoded is not assigned in any cell visible here.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)
rfc.fit(X_train_encoded, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
# Test accuracy of the random forest: share of rows where the predicted label equals y_test.
(rfc.predict(X_test_encoded) == y_test).mean()

np.float64(0.6)

In [None]:
X.append([t1_id, t2_id] + ps1_id.tolist() + ps2_id.tolist())
    y.append(team1_win)
# Freeze the accumulated rows and labels into arrays so they can be sliced by column below.
X = np.array(X)
y = np.array(y)

# Hold out the last 100 rows as the test set; everything before them is training data.
# NOTE(review): this is a chronological split only if the rows were appended in time order — confirm.
X_train = X[:-100]
X_test = X[-100:]
y_train = y[:-100]
y_test = y[-100:]

# X and y are dropped here, so the split above cannot be re-run without rebuilding them first.
del X, y

# ----------------------------
# Train/Test split already done
# X_train, X_test, y_train, y_test
# ----------------------------

# ----------------------------
# Encode teams
# Columns 0-1 are mapped to dense indices built from the ids seen in training only.
# ----------------------------
team_ids_train = X_train[:, :2].flatten()
unique_teams = np.unique(team_ids_train)
team_to_idx = {team_id: i for i, team_id in enumerate(unique_teams)}

# Encode train teams (every id is in the mapping by construction, so plain indexing is safe)
X_team_train = np.array([[team_to_idx[t1], team_to_idx[t2]] for t1, t2 in X_train[:, :2]], dtype=int)
# Encode test teams with the same mapping; ids never seen in training become -1.
# NOTE(review): -1 is not a valid row index for an embedding table and would silently address
# the last column if used as a NumPy index — confirm downstream code handles it.
X_team_test = np.array([[team_to_idx.get(t1, -1), team_to_idx.get(t2, -1)] for t1, t2 in X_test[:, :2]], dtype=int)

# ----------------------------
# Encode players
# Columns 2+ get their own dense index space, again built from training rows only.
# ----------------------------
player_ids_train = X_train[:, 2:].flatten()
unique_players = np.unique(player_ids_train)
player_to_idx = {player_id: i for i, player_id in enumerate(unique_players)}

# Encode train players
X_player_train = np.array([[player_to_idx[p] for p in row] for row in X_train[:, 2:]], dtype=int)
# Encode test players with the same mapping; ids never seen in training become -1.
X_player_test = np.array([[player_to_idx.get(p, -1) for p in row] for row in X_test[:, 2:]], dtype=int)

# ----------------------------
# Combine team + player encoding
# X_train / X_test are overwritten in place: from here on they hold index codes, not raw ids.
# ----------------------------
X_train = np.hstack([X_team_train, X_player_train])
X_test = np.hstack([X_team_test, X_player_test])

# The labels below say "X_train_encoded" / "X_test_encoded", but the values printed are
# the overwritten X_train / X_test.
print("X_train_encoded shape:", X_train.shape)
print("X_test_encoded shape:", X_test.shape)
print("Example X_train_encoded first row:", X_train[0])
print("Example y_train first target:", y_train[0])

import numpy as np

class BagEncoder:
    """
    Encode rows of ids as a fixed-width "bag" matrix.

    ``fit`` assigns one output column to every distinct id in its input.
    ``transform`` then marks, for each row, the columns of the ids that row
    contains. Ids that were not seen by ``fit`` have no column and are ignored.
    Inputs may be any 2-D array-like (NumPy array, list of lists, ...).
    """

    def __init__(self):
        self.id_to_idx = {}   # id -> output column index, filled by fit()
        self.num_ids = 0      # number of output columns, set by fit()

    def fit(self, X):
        """
        Learn the id -> column mapping from every value in X.

        Columns follow the sorted order of the distinct ids (np.unique).
        Returns self.
        """
        # np.asarray lets plain nested lists through as well as ndarrays.
        unique_items = np.unique(np.asarray(X).ravel())
        self.id_to_idx = {item: i for i, item in enumerate(unique_items)}
        self.num_ids = len(unique_items)
        return self

    def transform(self, X, pos_neg=False):
        """
        Build the bag matrix for X.

        Args:
            X: 2-D array-like of ids, one row per sample.
            pos_neg: if False, a present id is marked with 1. If True, ids in
                the first half of the row's columns are marked +1 and ids in
                the second half -1.

        Returns:
            int32 array of shape ``(n_samples, self.num_ids)``.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        # Use integer dtype
        bag = np.zeros((n_samples, self.num_ids), dtype=np.int32)

        # Column index at which the "second half" starts; loop-invariant.
        half = X.shape[1] // 2
        for i, row in enumerate(X):
            for j, item in enumerate(row):
                idx = self.id_to_idx.get(item)
                if idx is None:
                    continue  # id not seen during fit: no column for it
                bag[i, idx] = -1 if (pos_neg and j >= half) else 1
        return bag

    def fit_transform(self, X, pos_neg=False):
        """Fit on X, then return ``transform(X, pos_neg=pos_neg)``."""
        self.fit(X)
        return self.transform(X, pos_neg=pos_neg)


# ------------------------------------------------------------
# Team features: signed bag over columns 0-1
# (+1 for the id in column 0, -1 for the id in column 1)
# ------------------------------------------------------------
X_teams_train, X_teams_test = X_train[:, :2], X_test[:, :2]
team_enc = BagEncoder()
X_team_bag_train = team_enc.fit(X_teams_train).transform(X_teams_train, pos_neg=True)
X_team_bag_test = team_enc.transform(X_teams_test, pos_neg=True)

print("X_team_bag_train.shape:", X_team_bag_train.shape)
print("X_team_bag_test[:5]:\n", X_team_bag_test[:5])

# ------------------------------------------------------------
# Player features: presence-only bag over columns 2 onwards
# (1 wherever the id occurs in the row, no sign)
# ------------------------------------------------------------
X_players_train, X_players_test = X_train[:, 2:], X_test[:, 2:]
player_enc = BagEncoder()
X_player_bag_train = player_enc.fit(X_players_train).transform(X_players_train, pos_neg=False)
X_player_bag_test = player_enc.transform(X_players_test, pos_neg=False)

print("X_player_bag_train.shape:", X_player_bag_train.shape)
print("X_player_bag_test[:5]:\n", X_player_bag_test[:5])

# ------------------------------------------------------------
# Final design matrices: team columns first, player columns after
# ------------------------------------------------------------
X_bag_train = np.concatenate((X_team_bag_train, X_player_bag_train), axis=1)
X_bag_test = np.concatenate((X_team_bag_test, X_player_bag_test), axis=1)

print("Combined bag shape:", X_bag_train.shape)



In [6]:
import numpy as np

class BagEncoder:
    """
    Encode rows of ids as a fixed-width "bag" matrix.

    ``fit`` assigns one output column to every distinct id in its input.
    ``transform`` then marks, for each row, the columns of the ids that row
    contains. Ids that were not seen by ``fit`` have no column and are ignored.
    Inputs may be any 2-D array-like (NumPy array, list of lists, ...).
    """

    def __init__(self):
        self.id_to_idx = {}   # id -> output column index, filled by fit()
        self.num_ids = 0      # number of output columns, set by fit()

    def fit(self, X):
        """
        Learn the id -> column mapping from every value in X.

        Columns follow the sorted order of the distinct ids (np.unique).
        Returns self.
        """
        # np.asarray lets plain nested lists through as well as ndarrays.
        unique_items = np.unique(np.asarray(X).ravel())
        self.id_to_idx = {item: i for i, item in enumerate(unique_items)}
        self.num_ids = len(unique_items)
        return self

    def transform(self, X, pos_neg=False):
        """
        Build the bag matrix for X.

        Args:
            X: 2-D array-like of ids, one row per sample.
            pos_neg: if False, a present id is marked with 1. If True, ids in
                the first half of the row's columns are marked +1 and ids in
                the second half -1.

        Returns:
            int32 array of shape ``(n_samples, self.num_ids)``.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        # Use integer dtype
        bag = np.zeros((n_samples, self.num_ids), dtype=np.int32)

        # Column index at which the "second half" starts; loop-invariant.
        half = X.shape[1] // 2
        for i, row in enumerate(X):
            for j, item in enumerate(row):
                idx = self.id_to_idx.get(item)
                if idx is None:
                    continue  # id not seen during fit: no column for it
                bag[i, idx] = -1 if (pos_neg and j >= half) else 1
        return bag

    def fit_transform(self, X, pos_neg=False):
        """Fit on X, then return ``transform(X, pos_neg=pos_neg)``."""
        self.fit(X)
        return self.transform(X, pos_neg=pos_neg)


# ------------------------------------------------------------
# Team features: signed bag over columns 0-1
# (+1 for the id in column 0, -1 for the id in column 1)
# ------------------------------------------------------------
X_teams_train, X_teams_test = X_train[:, :2], X_test[:, :2]
team_enc = BagEncoder()
X_team_bag_train = team_enc.fit(X_teams_train).transform(X_teams_train, pos_neg=True)
X_team_bag_test = team_enc.transform(X_teams_test, pos_neg=True)

print("X_team_bag_train.shape:", X_team_bag_train.shape)
print("X_team_bag_test[:5]:\n", X_team_bag_test[:5])

# ------------------------------------------------------------
# Player features: presence-only bag over columns 2 onwards
# (1 wherever the id occurs in the row, no sign)
# ------------------------------------------------------------
X_players_train, X_players_test = X_train[:, 2:], X_test[:, 2:]
player_enc = BagEncoder()
X_player_bag_train = player_enc.fit(X_players_train).transform(X_players_train, pos_neg=False)
X_player_bag_test = player_enc.transform(X_players_test, pos_neg=False)

print("X_player_bag_train.shape:", X_player_bag_train.shape)
print("X_player_bag_test[:5]:\n", X_player_bag_test[:5])

# ------------------------------------------------------------
# Final design matrices: team columns first, player columns after
# ------------------------------------------------------------
X_bag_train = np.concatenate((X_team_bag_train, X_player_bag_train), axis=1)
X_bag_test = np.concatenate((X_team_bag_test, X_player_bag_test), axis=1)

print("Combined bag shape:", X_bag_train.shape)


X_team_bag_train.shape: (100, 22)
X_team_bag_test[:5]:
 [[ 0  0  0  0  1  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1]]
X_player_bag_train.shape: (100, 102)
X_player_bag_test[:5]:
 [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0

In [10]:
from sklearn.linear_model import LogisticRegression

# Baseline 1: team bag only.
# A notebook cell displays only its last expression, so this score is kept in a variable
# and printed; as a bare expression it would be computed and then thrown away.
logit = LogisticRegression()
logit.fit(X_team_bag_train, y_train)
team_bag_accuracy = np.mean(logit.predict(X_team_bag_test) == y_test)
print("team-bag accuracy:", team_bag_accuracy)

# Baseline 2: player bag only. `logit` is rebound, so after this cell it refers to
# the player model.
logit = LogisticRegression()
logit.fit(X_player_bag_train, y_train)
player_bag_accuracy = np.mean(logit.predict(X_player_bag_test) == y_test)
player_bag_accuracy

np.float64(0.58)

In [12]:
# Check the shape (rows, player columns) of the test-split player bag.
X_player_bag_test.shape

(100, 225)

In [3]:
# Display the encoded matrix.
# NOTE(review): X_encoded is not assigned in any cell visible here.
X_encoded

array([[  5,  20,  22, ...,  94, 100, 101],
       [  5,  20,  22, ...,  94, 100, 101],
       [  1,  12,  18, ...,   2,   3,  34],
       ...,
       [  2,   6,  18, ...,  45,  46,  47],
       [  2,   6,  18, ...,  45,  46,  47],
       [  4,  10,  27, ...,  57,  58,  61]], shape=(100, 12))

X_team_bag_train.shape: (100, 22)
X_team_bag_test[:5]:
 [[ 0  0  0  0  1  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1]]
X_player_bag_train.shape: (100, 102)
X_player_bag_test[:5]:
 [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0

np.float64(0.55)

np.float64(0.43)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed: the forest's bootstrap samples and feature subsets are random, so without
# it the fitted model and every score derived from it change between runs.
rfc = RandomForestClassifier(random_state=42)

# NOTE(review): X_train_enc is not assigned in any cell visible here. If it holds integer
# id codes, the trees will split on them as ordered numbers — confirm that is intended.
rfc.fit(X_train_enc, y_train)



array([[0.44304762, 0.55695238],
       [0.50591667, 0.49408333],
       [0.44304762, 0.55695238],
       [0.97472222, 0.02527778],
       [0.50591667, 0.49408333],
       [0.14117857, 0.85882143],
       [0.97      , 0.03      ],
       [0.50591667, 0.49408333],
       [0.14117857, 0.85882143],
       [0.97      , 0.03      ],
       [0.33675   , 0.66325   ],
       [0.33675   , 0.66325   ],
       [0.33675   , 0.66325   ],
       [0.67030952, 0.32969048],
       [0.54455628, 0.45544372],
       [0.30527489, 0.69472511],
       [0.67030952, 0.32969048],
       [0.17878571, 0.82121429],
       [0.54455628, 0.45544372],
       [0.30527489, 0.69472511],
       [0.67030952, 0.32969048],
       [0.17878571, 0.82121429],
       [0.26645635, 0.73354365],
       [0.17878571, 0.82121429],
       [0.26645635, 0.73354365],
       [0.89432143, 0.10567857],
       [0.24919048, 0.75080952],
       [0.89432143, 0.10567857],
       [0.24919048, 0.75080952],
       [0.95014286, 0.04985714],
       [0.

In [10]:
# Test accuracy of the random forest: share of rows where the predicted label equals y_test.
(rfc.predict(X_test_enc) == y_test).mean()

np.float64(0.59)

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# ----------------------------
# Reproducibility
# ----------------------------
# Seeds Python's `random`, NumPy and TensorFlow in one call, so weight initialisation
# and batch shuffling repeat across runs. Note this also resets NumPy's global RNG.
tf.keras.utils.set_random_seed(42)

# ----------------------------
# Convert the data to numpy
# ----------------------------
# X_train_enc, y_train, X_test_enc, y_test are already prepared
X_train_np = X_train_enc.astype(np.int32)
y_train_np = y_train.astype(np.float32)
X_test_np = X_test_enc.astype(np.int32)
y_test_np = y_test.astype(np.float32)

# ----------------------------
# Parameters
# ----------------------------
# NOTE(review): UNK_TEAM / UNK_PLAYER are not assigned in any cell visible here;
# presumably they are the largest index in each id space (the slot for unseen ids),
# which makes `+ 1` the embedding vocabulary size — confirm.
num_teams = UNK_TEAM + 1
num_players = UNK_PLAYER + 1
team_emb_dim = 16
player_emb_dim = 32
num_players_per_game = 10

# ----------------------------
# Keras Model
# ----------------------------
# Input
input_teams = Input(shape=(2,), name='teams')       # team1, team2
input_players = Input(shape=(num_players_per_game,), name='players')  # p1..p10

# Embeddings
# `input_length` is not passed: it is deprecated on Embedding, and the sequence
# length is already fixed by the Input shapes above.
team_emb_layer = Embedding(num_teams, team_emb_dim, name='team_emb')(input_teams)
team_emb_flat = Flatten()(team_emb_layer)

player_emb_layer = Embedding(num_players, player_emb_dim, name='player_emb')(input_players)
player_emb_flat = Flatten()(player_emb_layer)

# Concatenate
x = Concatenate()([team_emb_flat, player_emb_flat])

# Fully connected layers
x = Dense(128, activation='relu')(x)
x = Dense(64, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[input_teams, input_players], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# ----------------------------
# Early stopping
# ----------------------------
# Stop after 10 epochs without a val_loss improvement and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# ----------------------------
# Model inputs
# ----------------------------
# Columns 0-1 feed the 'teams' input, the remaining columns the 'players' input.
train_inputs = {'teams': X_train_np[:, :2], 'players': X_train_np[:, 2:]}
test_inputs = {'teams': X_test_np[:, :2], 'players': X_test_np[:, 2:]}

# ----------------------------
# Train
# ----------------------------
# validation_split holds out the last 10% of the training rows (taken before shuffling).
history = model.fit(
    train_inputs,
    y_train_np,
    validation_split=0.1,
    batch_size=128,
    epochs=500,
    callbacks=[early_stop],
    verbose=2
)

# ----------------------------
# Predictions
# ----------------------------
# Each result has shape (n_samples, 1): one sigmoid output per row.
preds_train = model.predict(train_inputs)
preds_test = model.predict(test_inputs)

print("Train predictions:", preds_train[:10].flatten())
print("Test predictions:", preds_test[:10].flatten())

# ----------------------------
# Extract embeddings
# ----------------------------
# get_weights()[0] is the embedding matrix: one row per id, one column per dimension.
team_embeddings = model.get_layer('team_emb').get_weights()[0]
player_embeddings = model.get_layer('player_emb').get_weights()[0]

print("team_embeddings.shape:", team_embeddings.shape)
print("player_embeddings.shape:", player_embeddings.shape)


2025-10-12 10:21:01.491669: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-12 10:21:02.440125: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-12 10:21:05.609385: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-12 10:21:06.774063: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/500
270/270 - 3s - 10ms/step - accuracy: 0.5956 - loss: 0.6619 - val_accuracy: 0.5878 - val_loss: 0.6686
Epoch 2/500
270/270 - 1s - 4ms/step - accuracy: 0.6584 - loss: 0.6127 - val_accuracy: 0.5904 - val_loss: 0.6833
Epoch 3/500
270/270 - 1s - 4ms/step - accuracy: 0.6783 - loss: 0.5881 - val_accuracy: 0.5943 - val_loss: 0.6865
Epoch 4/500
270/270 - 1s - 4ms/step - accuracy: 0.6912 - loss: 0.5704 - val_accuracy: 0.5859 - val_loss: 0.7004
Epoch 5/500
270/270 - 1s - 4ms/step - accuracy: 0.7028 - loss: 0.5555 - val_accuracy: 0.5912 - val_loss: 0.7032
Epoch 6/500
270/270 - 1s - 4ms/step - accuracy: 0.7102 - loss: 0.5411 - val_accuracy: 0.5825 - val_loss: 0.7255
Epoch 7/500
270/270 - 1s - 4ms/step - accuracy: 0.7234 - loss: 0.5276 - val_accuracy: 0.5901 - val_loss: 0.7312
Epoch 8/500
270/270 - 1s - 4ms/step - accuracy: 0.7331 - loss: 0.5137 - val_accuracy: 0.5901 - val_loss: 0.7529
Epoch 9/500
270/270 - 1s - 4ms/step - accuracy: 0.7413 - loss: 0.5006 - val_accuracy: 0.5870 - val_loss

In [6]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the network's test-set scores against the true labels.
roc_auc_score(y_true=y_test, y_score=preds_test)

0.6268472906403941

In [9]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl (184.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.0/184.0 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.23.0%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting

In [5]:
# Display X_train (NumPy abbreviates the repr of large arrays).
X_train

array([[  3216,   3284,  17525, ...,  17811,  17833,  17834],
       [  3216,   3284,  17525, ...,  17811,  17833,  17834],
       [  3210,   3228,  17520, ...,  17499,  17500,  17543],
       ...,
       [125802, 126377,  20678, ...,  17501,  17543,  19666],
       [126709, 129501,  20569, ...,  21433,  21439,  25439],
       [  5793, 125751,  18715, ...,  20370,  21440,  23684]],
      shape=(38275, 12))