In [1]:
from pyts.classification import BOSSVS
from pyts.multivariate.classification import MultivariateClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
match_column = "source"
timestamp_column = "timestamp"
class_column = "winner"
non_data_columns = [match_column, timestamp_column, class_column]

In [2]:
df = pd.read_csv("dataset/SmokeSquadron/ss_winprediction/lpmp_dataset_5s.csv").drop(columns=["Unnamed: 0"])
data_columns = [x for x in df.columns if x not in non_data_columns]

In [3]:
data_index = []
# Get the unique sources in the DataFrame
unique_sources = df[match_column].unique()

# Iterate over each source
for source in unique_sources:
    source_subset = df[df[match_column] == source]
    data_index = data_index + list(source_subset.index[4:])

index_train, index_test, _, _ = train_test_split(data_index, [0]*len(data_index), test_size=0.33)

In [4]:
from sklearn.metrics import ( accuracy_score, precision_score, recall_score, f1_score)
import json

def create_output_dict():
    output_dict = {
        "parameters": [],
        "fold" : [],
        "timestamp" : [],
        "accuracy_train" : [],
        "precision_train" : [],
        "recall_train" : [],
        "f1_train": [],
        "accuracy_test" : [],
        "precision_test" : [],
        "recall_test" : [],
        "f1_test": []    
    }
    return output_dict

def add_metrics_to_output_dict(output_dict, param, fold, timestamp, y_train, y_train_hat, y_test, y_test_hat):
    acc_train = accuracy_score(y_train, y_train_hat)
    prec_train = precision_score(y_train, y_train_hat, average="macro")
    rec_train = recall_score(y_train, y_train_hat, average="macro")
    f1_train = f1_score(y_train, y_train_hat, average="macro")
    output_dict["parameters"].append(json.dumps(param))
    output_dict["fold"].append(fold)
    output_dict["timestamp"].append(timestamp)
    output_dict["accuracy_train"].append(acc_train)
    output_dict["precision_train"].append(prec_train)
    output_dict["recall_train"].append(rec_train)
    output_dict["f1_train"].append(f1_train)
    acc_test = accuracy_score(y_test, y_test_hat)
    prec_test = precision_score(y_test, y_test_hat, average="macro")
    rec_test = recall_score(y_test, y_test_hat, average="macro")
    f1_test = f1_score(y_test, y_test_hat, average="macro")
    output_dict["accuracy_test"].append(acc_test)
    output_dict["precision_test"].append(prec_test)
    output_dict["recall_test"].append(rec_test)
    output_dict["f1_test"].append(f1_test)
    return output_dict

### Random Forest Stratified Group 3-Fold

In [5]:
from sklearn.model_selection import StratifiedGroupKFold, ParameterGrid
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
import json

In [6]:
data_index = []
y = []
# Get the unique sources in the DataFrame
unique_sources = df[match_column].unique()

# Iterate over each source
for source in unique_sources:
    source_subset = df[df[match_column] == source]
    data_index.append(list(source_subset.index[4:]))
    y.append(source_subset.loc[source_subset.index[0],class_column])
assert len(data_index) == len(y)

In [7]:
parameters = {'n_estimators': [10, 50]}

In [44]:
sgkf = StratifiedGroupKFold(n_splits=3)
sgkf.get_n_splits(data_index, y)
output_dict = create_output_dict()
for fold, (train_index, test_index) in enumerate(sgkf.split(data_index, y, unique_sources)):
    print(f"fold {fold}")
    index_train = [x for idx in train_index for x in data_index[idx]]
    index_test = [x for idx in test_index for x in data_index[idx]]
    unique_timestamps = df['timestamp'].unique()[4:]
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    for timestamp in unique_timestamps:
        #print(f"timestamp {unique_timestamp}")
        timestamp_subset = df[df['timestamp']==timestamp]
        ts_index_train = [x for x in timestamp_subset.index if x in list(index_train)]
        ts_index_test = [x for x in timestamp_subset.index if x in list(index_test)]
        for i in range(len(ts_index_train)):
            X_train.append(df.loc[ts_index_train[i]-4:ts_index_train[i],data_columns])
            y_train.append(df.loc[ts_index_train[i], class_column])
        for i in range(len(ts_index_test)):
            X_test.append(df.loc[ts_index_test[i]-4:ts_index_test[i], data_columns])
            y_test.append(df.loc[ts_index_test[i], class_column])
        #print(X_train, y_train)
        for param in list(ParameterGrid(parameters)):
            #print(param)
            lr_ridge = RandomForestClassifier(**param)
            clf = MultivariateClassifier(lr_ridge)
            clf.fit(X_train, y_train)
            y_train_hat = clf.predict(X_train)
            y_test_hat = clf.predict(X_test)
            output_dict = add_metrics_to_output_dict(output_dict, param, fold, timestamp, y_train, y_train_hat, y_test, y_test_hat)
            


fold 0


  _warn_prf(average, modifier, msg_start, len(result))


fold 1
fold 2


In [45]:
output_df = pd.DataFrame.from_dict(output_dict)
output_df.to_csv("rf2_sg3f.csv")
print(output_df.describe())

              fold    timestamp  accuracy_train  precision_train  \
count  1071.000000  1071.000000     1071.000000      1071.000000   
mean      1.000000   320.000000        0.725604         0.726602   
std       0.816878   171.835881        0.024023         0.030319   
min       0.000000    25.000000        0.662461         0.658915   
25%       0.000000   170.000000        0.706886         0.702136   
50%       1.000000   320.000000        0.727273         0.725371   
75%       2.000000   470.000000        0.732928         0.744760   
max       2.000000   615.000000        0.869565         0.911765   

       recall_train     f1_train  accuracy_test  precision_test  recall_test  \
count   1071.000000  1071.000000    1071.000000     1071.000000  1071.000000   
mean       0.704589     0.707253       0.501858        0.482203     0.480746   
std        0.028267     0.028604       0.067015        0.073160     0.060958   
min        0.629005     0.619226       0.405728        0.304348    

### Lr Stratified Group 3-fold + CSF

In [46]:
from skfeature.function.statistical_based import CFS

In [47]:
features = CFS.cfs(df.loc[:,data_columns].to_numpy(), df.loc[:,class_column])
print(features)

[ 5  8 12 15  6 13]


In [48]:
csf_data_columns = [data_columns[feature] for feature in features]
print(csf_data_columns)

['angular_movementdiff', 'angular_movementplayer02', 'distancediff', 'distanceplayer02', 'angular_movementplayer01', 'distanceplayer01']


In [49]:
sgkf = StratifiedGroupKFold(n_splits=3)
sgkf.get_n_splits(data_index, y)
output_dict = create_output_dict()
for fold, (train_index, test_index) in enumerate(sgkf.split(data_index, y, unique_sources)):
    print(f"fold {fold}")
    index_train = [x for idx in train_index for x in data_index[idx]]
    index_test = [x for idx in test_index for x in data_index[idx]]
    unique_timestamps = df['timestamp'].unique()[4:]
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    for unique_timestamp in unique_timestamps:
        #print(f"timestamp {unique_timestamp}")
        timestamp_subset = df[df['timestamp']==unique_timestamp]
        ts_index_train = [x for x in timestamp_subset.index if x in list(index_train)]
        ts_index_test = [x for x in timestamp_subset.index if x in list(index_test)]
        for i in range(len(ts_index_train)):
            X_train.append(df.loc[ts_index_train[i]-4:ts_index_train[i],csf_data_columns])
            y_train.append(df.loc[ts_index_train[i], class_column])
        for i in range(len(ts_index_test)):
            X_test.append(df.loc[ts_index_test[i]-4:ts_index_test[i], csf_data_columns])
            y_test.append(df.loc[ts_index_test[i], class_column])
        for param in list(ParameterGrid(parameters)):
            lr_ridge = RidgeClassifier(**param)
            clf = MultivariateClassifier(lr_ridge)
            clf.fit(X_train, y_train)
            y_train_hat = clf.predict(X_train)
            y_test_hat = clf.predict(X_test)
            output_dict = add_metrics_to_output_dict(output_dict, param, fold, timestamp, y_train, y_train_hat, y_test, y_test_hat)


fold 0


fold 1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


fold 2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [50]:
output_df = pd.DataFrame.from_dict(output_dict)
output_df.to_csv("rf2_sg3f_csf.csv")
print(output_df.describe())

              fold  timestamp  accuracy_train  precision_train  recall_train  \
count  1071.000000     1071.0     1071.000000      1071.000000   1071.000000   
mean      1.000000      615.0        0.616842         0.620623      0.563788   
std       0.816878        0.0        0.024154         0.055787      0.032944   
min       0.000000      615.0        0.576342         0.289557      0.497283   
25%       0.000000      615.0        0.600564         0.594007      0.542000   
50%       1.000000      615.0        0.612260         0.621344      0.562456   
75%       2.000000      615.0        0.627343         0.646159      0.580894   
max       2.000000      615.0        0.700312         0.825581      0.672307   

          f1_train  accuracy_test  precision_test  recall_test      f1_test  
count  1071.000000    1071.000000     1071.000000  1071.000000  1071.000000  
mean      0.525677       0.577518        0.605266     0.528165     0.475389  
std       0.055829       0.044970        0.12

## RF 10-fold

In [7]:
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
import json
from sklearn.metrics import ( accuracy_score, precision_score, recall_score, f1_score)

In [4]:
def create_output_dict():
    output_dict = {
        "parameters": [],
        "fold" : [],
        "timestamp" : [] 
    }
    return output_dict

def calculate_metrics(y, y_hat, average="macro"):
    acc = accuracy_score(y, y_hat)
    prec = precision_score(y, y_hat, average=average)
    rec = recall_score(y, y_hat, average=average)
    f1 = f1_score(y, y_hat, average=average)
    return acc, prec, rec, f1

def add_metrics_to_output_dict(output_dict, param, fold, timestamp, sample_sets):
    output_dict["parameters"].append(json.dumps(param))
    output_dict["fold"].append(fold)
    output_dict["timestamp"].append(timestamp)
    for sample_set in sample_sets:
        y, y_hat, set_name = sample_set[0], sample_set[1], sample_set[2]
        acc, prec, rec, f1 = calculate_metrics(y, y_hat)
        if f"accuracy_{set_name}" not in output_dict:
            output_dict[f"accuracy_{set_name}"] = []
        if f"precision_{set_name}" not in output_dict:
            output_dict[f"precision_{set_name}"] = []
        if f"recall_{set_name}" not in output_dict:
            output_dict[f"recall_{set_name}"] = []
        if f"f1_{set_name}" not in output_dict:
            output_dict[f"f1_{set_name}"] = []
        output_dict[f"accuracy_{set_name}"].append(acc)
        output_dict[f"precision_{set_name}"].append(prec)
        output_dict[f"recall_{set_name}"].append(rec)
        output_dict[f"f1_{set_name}"].append(f1)
    return output_dict

In [5]:
parameters = {'n_estimators': [10, 50]}

In [8]:
experiment_control_dict = create_output_dict()
output_dict = create_output_dict()
unique_timestamps = df['timestamp'].unique()[4:]
for timestamp in unique_timestamps:
    timestamp_subset = df[df['timestamp'] == timestamp]
    ts_index = [x for x in timestamp_subset.index]
    X = []
    y = []
    for i in range(len(timestamp_subset)):
        X.append(df.loc[ts_index[i]-4:ts_index[i], data_columns])
        y.append(df.loc[ts_index[i], class_column])
    #print(len(X), len(y))
    if len(X) <= 10:
        continue
    kf_train_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    if y.count('player01') <= 10 or y.count('player02') <= 10:
            continue
    for test_fold, (train_val_index, test_index) in enumerate(kf_train_val.split(X, y)):
        x_trainval, y_trainval = [X[i] for i in train_val_index], [y[i] for i in train_val_index]
        x_test, y_test = [X[i] for i in test_index], [y[i] for i in test_index]
        
        skf = StratifiedKFold(n_splits=9, shuffle=True, random_state=42)
        best_f1_val = 0
        best_clf = None
        if y_trainval.count('player01') <= 9 or y_trainval.count('player02') <= 9:
            continue
        for fold, (train_index, val_index) in enumerate(skf.split(x_trainval, y_trainval)):
            X_train, y_train = [X[i] for i in train_index], [y[i] for i in train_index]
            X_val, y_val = [X[i] for i in val_index], [y[i] for i in val_index]
            for param in list(ParameterGrid(parameters)):
                rf = RandomForestClassifier(**param)
                clf = MultivariateClassifier(rf)
                clf.fit(X_train, y_train)
                y_train_hat = clf.predict(X_train)
                y_val_hat = clf.predict(X_val)
                acc_train, prec_train, rec_train, f1_train = calculate_metrics(y_train, y_train_hat)
                acc_val, prec_val, rec_val, f1_val = calculate_metrics(y_val, y_val_hat)
                experiment_control_dict = add_metrics_to_output_dict(
                    experiment_control_dict, param, fold, timestamp, 
                    [(y_train, y_train_hat, "train"), (y_val, y_val_hat, "val")])
                if f1_val > best_f1_val:
                    best_f1_val = f1_val
                    best_clf = clf
                    best_params = param
        y_test_hat = best_clf.predict(x_test)
        acc_test, prec_test, rec_test, f1_test = calculate_metrics(y_test, y_test_hat)
        output_dict = add_metrics_to_output_dict(
            output_dict, best_params, test_fold, timestamp,
            [(y_test, y_test_hat, "test")]
        )
    
experiment_df = pd.DataFrame.from_dict(experiment_control_dict)
experiment_df.to_csv("rf_sg3f_10fold_trainval.csv")
print(experiment_df.describe())

output_df = pd.DataFrame.from_dict(output_dict)
output_df.to_csv("rf_sg3f_10fold_test.csv")
print(output_df.describe())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              fold    timestamp  accuracy_train  precision_train  \
count  6282.000000  6282.000000     6282.000000      6282.000000   
mean      4.000000   109.756447        0.999920         0.999920   
std       2.582194    50.368136        0.001546         0.001614   
min       0.000000    25.000000        0.960000         0.950000   
25%       2.000000    65.000000        1.000000         1.000000   
50%       4.000000   110.000000        1.000000         1.000000   
75%       6.000000   155.000000        1.000000         1.000000   
max       8.000000   195.000000        1.000000         1.000000   

       recall_train     f1_train  accuracy_val  precision_val   recall_val  \
count   6282.000000  6282.000000   6282.000000    6282.000000  6282.000000   
mean       0.999916     0.999915      0.626945       0.569138     0.588471   
std        0.001659     0.001649      0.228957       0.276991     0.247696   
min        0.937500     0.952381      0.000000       0.000000     0.000000 

## RF +  CSF 10 fold

In [9]:
from skfeature.function.statistical_based import CFS

In [10]:
features = CFS.cfs(df.loc[:,data_columns].to_numpy(), df.loc[:,class_column])
print(features)

csf_data_columns = [data_columns[feature] for feature in features]
print(csf_data_columns)

[ 5  8 12 15  6 13]
['angular_movementdiff', 'angular_movementplayer02', 'distancediff', 'distanceplayer02', 'angular_movementplayer01', 'distanceplayer01']


In [11]:
experiment_control_dict = create_output_dict()
output_dict = create_output_dict()
unique_timestamps = df['timestamp'].unique()[4:]
for timestamp in unique_timestamps:
    timestamp_subset = df[df['timestamp'] == timestamp]
    ts_index = [x for x in timestamp_subset.index]
    X = []
    y = []
    for i in range(len(timestamp_subset)):
        X.append(df.loc[ts_index[i]-4:ts_index[i], csf_data_columns])
        y.append(df.loc[ts_index[i], class_column])
    #print(len(X), len(y))
    if len(X) <= 10:
        continue
    kf_train_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    if y.count('player01') <= 10 or y.count('player02') <= 10:
        continue
    for test_fold, (train_val_index, test_index) in enumerate(kf_train_val.split(X, y)):
        x_trainval, y_trainval = [X[i] for i in train_val_index], [y[i] for i in train_val_index]
        x_test, y_test = [X[i] for i in test_index], [y[i] for i in test_index]
        
        skf = StratifiedKFold(n_splits=9, shuffle=True, random_state=42)
        best_f1_val = 0
        best_clf = None
        if y_trainval.count('player01') <= 9 or y_trainval.count('player02') <= 9:
            continue
        for fold, (train_index, val_index) in enumerate(skf.split(x_trainval, y_trainval)):
            X_train, y_train = [X[i] for i in train_index], [y[i] for i in train_index]
            X_val, y_val = [X[i] for i in val_index], [y[i] for i in val_index]
            for param in list(ParameterGrid(parameters)):
                rf = RandomForestClassifier(**param)
                clf = MultivariateClassifier(rf)
                clf.fit(X_train, y_train)
                y_train_hat = clf.predict(X_train)
                y_val_hat = clf.predict(X_val)
                acc_train, prec_train, rec_train, f1_train = calculate_metrics(y_train, y_train_hat)
                acc_val, prec_val, rec_val, f1_val = calculate_metrics(y_val, y_val_hat)
                experiment_control_dict = add_metrics_to_output_dict(
                    experiment_control_dict, param, fold, timestamp, 
                    [(y_train, y_train_hat, "train"), (y_val, y_val_hat, "val")])
                if f1_val > best_f1_val:
                    best_f1_val = f1_val
                    best_clf = clf
                    best_params = param
        y_test_hat = best_clf.predict(x_test)
        acc_test, prec_test, rec_test, f1_test = calculate_metrics(y_test, y_test_hat)
        output_dict = add_metrics_to_output_dict(
            output_dict, best_params, test_fold, timestamp,
            [(y_test, y_test_hat, "test")]
        )
    
experiment_df = pd.DataFrame.from_dict(experiment_control_dict)
experiment_df.to_csv("rf_sg3f_csf_10fold_trainval.csv")
print(experiment_df.describe())

output_df = pd.DataFrame.from_dict(output_dict)
output_df.to_csv("rf_sg3f_csf_10fold_test.csv")
print(output_df.describe())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              fold    timestamp  accuracy_train  precision_train  \
count  6282.000000  6282.000000     6282.000000      6282.000000   
mean      4.000000   109.756447        0.999671         0.999622   
std       2.582194    50.368136        0.002982         0.003466   
min       0.000000    25.000000        0.955556         0.944444   
25%       2.000000    65.000000        1.000000         1.000000   
50%       4.000000   110.000000        1.000000         1.000000   
75%       6.000000   155.000000        1.000000         1.000000   
max       8.000000   195.000000        1.000000         1.000000   

       recall_train     f1_train  accuracy_val  precision_val   recall_val  \
count   6282.000000  6282.000000   6282.000000    6282.000000  6282.000000   
mean       0.999706     0.999654      0.655776       0.613246     0.620491   
std        0.002708     0.003139      0.213153       0.257751     0.243814   
min        0.944444     0.953347      0.000000       0.000000     0.000000 