In [27]:
from sklearn.model_selection import train_test_split
import pandas as pd
# Names of the bookkeeping columns in the dataset.
# `match_column` identifies which match (source) a row belongs to; rows are
# grouped by it further down in the notebook.
match_column = "source"
# `timestamp_column` holds the per-row timestamp.
timestamp_column = "timestamp"
# `class_column` is the prediction target.
class_column = "winner"
# Everything that is NOT a feature; used to derive the list of feature columns.
non_data_columns = [match_column, timestamp_column, class_column]

In [28]:
# Load the dataset and discard the unnamed index column that was written
# into the CSV.
df = (
    pd.read_csv("dataset/SmokeSquadron/ss_winprediction/lpmp_dataset_5s.csv")
    .drop(columns=["Unnamed: 0"])
)
# Feature columns = every column that is not match / timestamp / class,
# kept in their original order.
data_columns = df.columns[~df.columns.isin(non_data_columns)].tolist()
print(df)

      airbrakesdiff  airbrakesplayer01  airbrakesplayer01gradient  \
0               0.0                  0                        0.0   
1               0.0                  0                        0.0   
2               0.0                  0                        0.0   
3               0.0                  0                        0.0   
4               0.0                  0                        0.0   
...             ...                ...                        ...   
2588            0.0                  0                        0.0   
2589            0.0                  0                        0.0   
2590            0.0                  0                        0.0   
2591            0.0                  0                        0.0   
2592            0.0                  0                        0.0   

      airbrakesplayer02  airbrakesplayer02gradient  angular_movementdiff  \
0                   0.0                        0.0              0.000005   
1                  

In [29]:
# Discretise every feature column into 24 equal-width bins labelled 0..23.
# The bin edges are derived independently for each column by pd.cut.
labels = list(range(24))
df = df.assign(
    **{
        column_name: pd.cut(df[column_name], bins=24, labels=labels)
        for column_name in data_columns
    }
)

In [30]:
# Show the frame again now that the feature columns hold bin labels
# instead of the raw values.
print(df)

     airbrakesdiff airbrakesplayer01 airbrakesplayer01gradient  \
0               11                11                        11   
1               11                11                        11   
2               11                11                        11   
3               11                11                        11   
4               11                11                        11   
...            ...               ...                       ...   
2588            11                11                        11   
2589            11                11                        11   
2590            11                11                        11   
2591            11                11                        11   
2592            11                11                        11   

     airbrakesplayer02 airbrakesplayer02gradient angular_movementdiff  \
0                   11                        11                   13   
1                   11                        11             

In [31]:
# Collect the row labels of every match, skipping the first four rows of each
# match (the windows built later in the notebook look back four rows).
unique_sources = df[match_column].unique()

data_index = []
for source in unique_sources:
    source_subset = df[df[match_column] == source]
    # extend() appends in place; the previous `data_index = data_index + ...`
    # re-copied the whole list for every match.
    data_index.extend(source_subset.index[4:])

# Row-level hold-out split. A fixed random_state makes the split reproducible
# under "Restart & Run All"; no dummy target array is needed because only the
# indices themselves are split.
index_train, index_test = train_test_split(data_index, test_size=0.33, random_state=42)

In [32]:
from sklearn.metrics import ( accuracy_score, precision_score, recall_score, f1_score)
import json

def create_output_dict():
    """Return a fresh results container: one independent empty list per result column."""
    column_names = (
        "parameters",
        "fold",
        "timestamp",
        "accuracy_train",
        "precision_train",
        "recall_train",
        "f1_train",
        "accuracy_test",
        "precision_test",
        "recall_test",
        "f1_test",
    )
    # A comprehension (not dict.fromkeys) so that every column owns its own list.
    return {name: [] for name in column_names}

def add_metrics_to_output_dict(output_dict, param, fold, timestamp, y_train, y_train_hat, y_test, y_test_hat):
    """Append one row of train/test metrics to ``output_dict`` and return it.

    Accuracy plus macro-averaged precision, recall and F1 are computed for the
    train pair (``y_train``, ``y_train_hat``) and the test pair (``y_test``,
    ``y_test_hat``). ``param`` is stored as its JSON serialisation; ``fold``
    and ``timestamp`` are stored as given. ``output_dict`` is mutated in place.
    """

    def _scores(y_true, y_pred):
        # (accuracy, macro precision, macro recall, macro F1) for one split.
        return (
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, average="macro"),
            recall_score(y_true, y_pred, average="macro"),
            f1_score(y_true, y_pred, average="macro"),
        )

    # Train metrics are computed before anything is appended.
    train_acc, train_prec, train_rec, train_f1 = _scores(y_train, y_train_hat)

    output_dict["parameters"].append(json.dumps(param))
    output_dict["fold"].append(fold)
    output_dict["timestamp"].append(timestamp)
    output_dict["accuracy_train"].append(train_acc)
    output_dict["precision_train"].append(train_prec)
    output_dict["recall_train"].append(train_rec)
    output_dict["f1_train"].append(train_f1)

    test_acc, test_prec, test_rec, test_f1 = _scores(y_test, y_test_hat)

    output_dict["accuracy_test"].append(test_acc)
    output_dict["precision_test"].append(test_prec)
    output_dict["recall_test"].append(test_rec)
    output_dict["f1_test"].append(test_f1)
    return output_dict

### ASM Stratified Group 3-Fold

In [33]:
from sklearn.model_selection import StratifiedGroupKFold, ParameterGrid
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
import numpy as np
import json

In [34]:
# One entry per match: the row labels of that match (minus its first four
# rows) and the class value found on the match's first row.
unique_sources = df[match_column].unique()

rows_per_source = [df.loc[df[match_column] == source] for source in unique_sources]
data_index = [list(match_rows.index[4:]) for match_rows in rows_per_source]
y = [match_rows.loc[match_rows.index[0], class_column] for match_rows in rows_per_source]

assert len(data_index) == len(y)

In [35]:
import sys
# Make numpy print arrays in full instead of summarising them with "...".
# NOTE(review): this looks like a debugging aid for the commented-out prints
# in the functions below; it affects every later array print in the kernel.
np.set_printoptions(threshold=sys.maxsize)
def calculate_transition_matrices(X, n_states, class_labels, match_col=None, class_col=None):
    """Estimate per-class state-transition probabilities for every feature column.

    For each feature column, consecutive rows belonging to the same match are
    treated as one state sequence. Transitions ``prev_state -> current_state``
    are counted separately for each class and every row of the resulting count
    matrix is normalised to sum to 1.

    Parameters
    ----------
    X : pandas.DataFrame
        Holds the feature columns (integer state labels in ``0..n_states-1``)
        plus the match and class columns.
    n_states : int
        Number of distinct states a feature can take.
    class_labels : sequence
        All class values; the position of a value in this sequence is its
        index along the first axis of the returned matrices.
    match_col, class_col : str, optional
        Names of the match and class columns. When omitted, the notebook-level
        ``match_column`` / ``class_column`` names are used.

    Returns
    -------
    dict
        Maps feature column name to an array of shape
        ``(len(class_labels), n_states, n_states)``. Rows with no observed
        transition are filled with the uniform value ``1 / n_states``.

    Raises
    ------
    ValueError
        If a match's class value is not present in ``class_labels``.
    """
    # Fall back to the notebook-level column names when not given explicitly.
    match_col = match_column if match_col is None else match_col
    class_col = class_column if class_col is None else class_col

    class_labels = list(class_labels)
    # Derived from the labels instead of being hard-coded to 2.
    n_classes = len(class_labels)

    # Select features by name rather than by position so that the match column
    # is identified the same way everywhere in this function.
    feature_columns = [col for col in X.columns if col not in (match_col, class_col)]

    # Split the frame into per-match segments once; this does not depend on the
    # feature column and was previously recomputed for every column.
    match_values = X[match_col]
    segments = []
    for match in match_values.unique():
        rows = X.loc[match_values == match]
        if rows.empty:
            # A missing (NaN) match id never compares equal to itself.
            continue
        # The class of a match is read from its first row.
        class_idx = class_labels.index(rows[class_col].iloc[0])
        segments.append((class_idx, rows))

    transition_matrices = {}
    for col in feature_columns:
        counts = np.zeros((n_classes, n_states, n_states))
        for class_idx, rows in segments:
            states = np.asarray(rows[col], dtype=np.intp)
            # Unbuffered add so that repeated (prev, current) pairs all count.
            np.add.at(counts[class_idx], (states[:-1], states[1:]), 1)

        # Rows without any observation divide 0 by 0; silence the warning and
        # replace the resulting NaNs with a uniform distribution.
        with np.errstate(divide="ignore", invalid="ignore"):
            probabilities = counts / counts.sum(axis=2, keepdims=True)
        transition_matrices[col] = np.nan_to_num(probabilities, nan=1.0 / n_states)

    return transition_matrices

In [61]:
def calculate_posterior_probabilities(X, transition_matrices, labels):
    """Predict a class label for every sample from its state transitions.

    Each sample is scored per class by the likelihood of its observed
    transitions under that class's transition matrices, taken over all columns
    in ``transition_matrices``. The label of the highest-scoring class is
    returned.

    Scores are accumulated as sums of log-probabilities. Multiplying the raw
    probabilities underflows to 0 for long sequences or many columns, and when
    every class reached 0 the old normalisation divided 0 by 0. The uniform
    class prior and the normalisation are omitted because they shift or scale
    all classes equally and therefore cannot change the arg-max.

    Parameters
    ----------
    X : iterable of pandas.DataFrame
        One frame per sample. Each must contain every column named in
        ``transition_matrices``, holding integer state labels in row order.
    transition_matrices : dict
        Maps column name to an array indexed as ``[class, prev_state, state]``.
    labels : sequence
        Class labels; position ``c`` corresponds to class index ``c``.

    Returns
    -------
    list
        The predicted label for each sample, in input order. If all classes
        are equally likely (including all having zero likelihood) the first
        label is returned.
    """
    n_classes = len(labels)

    # Take logs once up front; log(0) = -inf is intended and marks impossible
    # transitions, so the divide-by-zero warning is silenced.
    with np.errstate(divide="ignore"):
        log_matrices = {
            col: np.log(np.asarray(matrix, dtype=float))
            for col, matrix in transition_matrices.items()
        }

    y_hat = []
    for X_sample in X:
        log_scores = np.zeros(n_classes)
        for col, log_matrix in log_matrices.items():
            states = np.asarray(X_sample[col], dtype=np.intp)
            if states.size < 2:
                # No transition to score in this column.
                continue
            # Shape (n_classes, n_transitions): log-probability of every
            # observed transition under every class.
            step_log_probs = log_matrix[:n_classes, states[:-1], states[1:]]
            log_scores += step_log_probs.sum(axis=1)
        y_hat.append(labels[int(np.argmax(log_scores))])
    return y_hat

In [63]:
import warnings
# NOTE(review): this silences every warning for the rest of the kernel
# session, not only the ones raised while scoring.
warnings.filterwarnings('ignore')

sgkf = StratifiedGroupKFold(n_splits=3)
output_dict = create_output_dict()
dif_data_columns = [x for x in data_columns if 'dif' in x]
class_labels = list(df[class_column].unique())

# Fold-independent work, done once: the timestamps to evaluate (the first four
# distinct timestamps are skipped) and the row labels recorded at each of them.
unique_timestamps = df[timestamp_column].unique()[4:]
rows_at_timestamp = {
    timestamp: list(df.index[(df[timestamp_column] == timestamp).to_numpy()])
    for timestamp in unique_timestamps
}

# Every match is its own group, so a match never straddles train and test.
for fold, (train_index, test_index) in enumerate(sgkf.split(data_index, y, unique_sources)):
    print(f"fold {fold}")
    index_train = [x for idx in train_index for x in data_index[idx]]
    index_test = [x for idx in test_index for x in data_index[idx]]
    # Sets give O(1) membership tests; the lists were previously re-copied and
    # scanned for every single row.
    train_rows = set(index_train)
    test_rows = set(index_test)

    # Fit the per-class transition matrices on the training matches only.
    # 24 is the number of bins used when the features were discretised.
    train_frame = df.loc[index_train, dif_data_columns + [match_column, class_column]]
    transition_matrices = calculate_transition_matrices(train_frame, 24, class_labels)

    # NOTE(review): these lists are never reset between timestamps, so the
    # metrics recorded for a timestamp cover every window up to and including
    # it (cumulative), not only the windows ending at that timestamp.
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    y_train_hat = []
    y_test_hat = []
    for timestamp in unique_timestamps:
        ts_index_train = [x for x in rows_at_timestamp[timestamp] if x in train_rows]
        ts_index_test = [x for x in rows_at_timestamp[timestamp] if x in test_rows]

        # One window per row: the row itself plus the four rows before it.
        # This relies on label-based slicing over the frame's integer index.
        new_X_train = [df.loc[row - 4:row, dif_data_columns] for row in ts_index_train]
        new_X_test = [df.loc[row - 4:row, dif_data_columns] for row in ts_index_test]
        X_train.extend(new_X_train)
        X_test.extend(new_X_test)
        y_train.extend(df.loc[row, class_column] for row in ts_index_train)
        y_test.extend(df.loc[row, class_column] for row in ts_index_test)

        # Predictions are deterministic per window, so only the new windows
        # are scored and appended; earlier predictions are reused instead of
        # being recomputed at every timestamp.
        y_train_hat.extend(calculate_posterior_probabilities(new_X_train, transition_matrices, class_labels))
        assert len(y_train_hat) == len(y_train)
        y_test_hat.extend(calculate_posterior_probabilities(new_X_test, transition_matrices, class_labels))
        assert len(y_test_hat) == len(y_test)
        output_dict = add_metrics_to_output_dict(output_dict, None, fold, timestamp, y_train, y_train_hat, y_test, y_test_hat)
            


fold 0
fold 1
fold 2


In [64]:
# Turn the collected per-fold / per-timestamp metrics into a table, persist
# it next to the notebook and print summary statistics.
output_df = pd.DataFrame(output_dict)
output_df.to_csv("asm_sg3f.csv")
summary_stats = output_df.describe()
print(summary_stats)

             fold   timestamp  accuracy_train  precision_train  recall_train  \
count  357.000000  357.000000      357.000000       357.000000    357.000000   
mean     1.000000  320.000000        0.752510         0.751006      0.756912   
std      0.817643  171.996701        0.041576         0.041112      0.041801   
min      0.000000   25.000000        0.521739         0.514286      0.514620   
25%      0.000000  170.000000        0.743396         0.742670      0.748704   
50%      1.000000  320.000000        0.769121         0.767922      0.773417   
75%      2.000000  470.000000        0.777431         0.774142      0.782439   
max      2.000000  615.000000        0.785525         0.786092      0.790029   

         f1_train  accuracy_test  precision_test  recall_test     f1_test  
count  357.000000     357.000000      357.000000   357.000000  357.000000  
mean     0.750408       0.506690        0.507402     0.507243    0.501067  
std      0.042150       0.033288        0.026944   