# Libraries / Settings

#### Libraries

In [1]:
# Data Manipulation
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from scipy import stats
from sklearn.model_selection import train_test_split
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import MinMaxScaler

# Modeling
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.utils import to_categorical

# Results
from sklearn.metrics import roc_auc_score, confusion_matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import pylab as py
from tqdm import tqdm

# Web interface
from urllib.request import urlopen
import json

# Misc
import time
from IPython.display import clear_output
from pygame import mixer

pygame 2.1.3 (SDL 2.0.22, Python 3.10.4)
Hello from the pygame community. https://www.pygame.org/contribute.html


### Settings for notebook

In [2]:
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
pd.options.mode.chained_assignment = None

salty_data = 'C:/Users/nuke2/Desktop/Python/Projects/Data/SaltyBetter/salty_data_complete.csv' # Desktop
# salty_data = 'C:/Users/nuke2/Desktop/Python/Projects/Data/SaltyBetter/salty_data_complete.csv' # Laptop

# SaltyData API
match_url = "https://salty-boy.com/current-match"

test_train_split_percentage = 0.2
train_validation_split_percentage = 0.2

notebook_seed = 550

# Load notification sound
mixer.init()
mixer.music.load(r'C:\Users\nuke2\Desktop\Python\Projects\SaltyBetter\mixkit-gaming-lock-2848.wav')

# Data cleaning

#### Read data/drop unnecessary columns

In [3]:
# Read data into df_all
df = pd.read_csv(salty_data, low_memory=False)
df_all = df.copy()

# Drop unnecessary columns
df_all.drop(columns=[
    'name',
    'id',
    'prev_tier', # Dropped because duplicates
    'fighter_blue',
    'fighter_red',
    'bet_blue',
    'bet_red',
    'match_id',
    'streak_blue',
    'streak_red',
    'winner',
    'away',
    'name_other',
    'id_other',
    'prev_tier_other', # Dropped because duplicates
], inplace=True)

# Print details about df_all
print(f"Num rows where elo != tier elo") 
print(f" -- Home: {sum(df_all.elo != df_all.tier_elo)}")
print(f" -- Other: {sum(df_all.elo_other != df_all.tier_elo_other)}")

df_all.head()

Num rows where elo != tier elo
 -- Home: 95133
 -- Other: 94621


Unnamed: 0,best_streak,elo,tier,tier_elo,average_bet,total_matches,win_rate,match_format,match_tier,winner_binary,best_streak_other,elo_other,tier_other,tier_elo_other,average_bet_other,total_matches_other,win_rate_other
0,5,1560,S,1559,10870713.45,44,0.61,matchmaking,S,0.0,2.0,1338.0,S,1338.0,4979841.32,37.0,0.27
1,5,1560,S,1559,10870713.45,44,0.61,matchmaking,S,0.0,14.0,1544.0,S,1433.0,4989952.13,39.0,0.64
2,5,1560,S,1559,10870713.45,44,0.61,matchmaking,S,1.0,14.0,1702.0,S,1699.0,18188514.31,42.0,0.79
3,5,1560,S,1559,10870713.45,44,0.61,matchmaking,S,1.0,9.0,1623.0,S,1620.0,8723817.1,50.0,0.7
4,5,1560,S,1559,10870713.45,44,0.61,matchmaking,S,0.0,6.0,1587.0,S,1586.0,7656194.8,55.0,0.64


#### One-hot encode categorical columns

In [4]:
# define a list of columns to one-hot encode
cols_to_encode = ["tier", "match_format", "match_tier", "tier_other"]

# apply one-hot encoding to the selected columns
df_all = pd.get_dummies(df_all, columns=cols_to_encode)

# Drop extra columns
df_all.drop(columns=['tier_X', 
                     'tier_other_X', 
                     'match_format_matchmaking', 
                     'match_tier_X'
], inplace=True)

#### Test train validation split data

In [5]:
# split the dataframe into training, validation, and testing sets
df_train, df_test = train_test_split(df_all, test_size=test_train_split_percentage, 
                                     random_state=notebook_seed)
df_train, df_val = train_test_split(df_train, test_size=train_validation_split_percentage, 
                                    random_state=notebook_seed)

#### Standardize data

In [6]:
# fit the scaler on the training data
scaler = MinMaxScaler()
scaler.fit(df_train)

# transform the data
df_train = pd.DataFrame(scaler.transform(df_train), columns=df_all.columns)
df_val = pd.DataFrame(scaler.transform(df_val), columns=df_all.columns)
df_test = pd.DataFrame(scaler.transform(df_test), columns=df_all.columns)

# extract y test train validation sets
y_train = df_train.winner_binary
y_val = df_val.winner_binary
y_test = df_test.winner_binary

In [7]:
df_train

Unnamed: 0,best_streak,elo,tier_elo,average_bet,total_matches,win_rate,winner_binary,best_streak_other,elo_other,tier_elo_other,average_bet_other,total_matches_other,win_rate_other,tier_A,tier_B,tier_P,tier_S,match_format_tournament,match_tier_A,match_tier_B,match_tier_P,match_tier_S,tier_other_A,tier_other_B,tier_other_P,tier_other_S
0,0.289474,0.351906,0.358297,0.010915,0.476190,0.41,1.0,0.22,0.714074,0.730769,0.053416,0.500000,0.77,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.333333,0.486804,0.488987,0.010643,0.404762,0.54,1.0,0.23,0.576296,0.572485,0.034992,0.569767,0.64,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.377193,0.655425,0.663730,0.017464,0.226190,0.85,0.0,0.24,0.475556,0.476331,0.024738,0.290698,0.50,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.315789,0.265396,0.267254,0.008772,0.559524,0.38,0.0,0.18,0.134815,0.140533,0.016805,0.441860,0.15,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.289474,0.366569,0.374449,0.008960,0.607143,0.40,1.0,0.22,0.337778,0.340237,0.017605,0.604651,0.36,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77695,0.307018,0.388563,0.390602,0.012524,0.369048,0.47,1.0,0.22,0.555556,0.553254,0.031417,0.488372,0.60,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
77696,0.271930,0.054252,0.471366,0.009004,0.523810,0.16,1.0,0.29,0.746667,0.741124,0.050774,0.779070,0.69,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
77697,0.368421,0.652493,0.656388,0.018186,0.380952,0.73,0.0,0.22,0.434074,0.446746,0.019523,0.569767,0.56,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
77698,0.289474,0.225806,0.227606,0.004817,0.571429,0.33,1.0,0.21,0.437037,0.436391,0.027732,0.500000,0.50,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


# Model Fitting

#### XGBoost

In [8]:
xgb_columns = [
    # Home fighter
    'best_streak',
    'elo',
    'tier_A',
    'tier_B',
    'tier_P',
    'tier_S',
#     'tier_X', # Most common category
    'tier_elo',
    'average_bet',
    'total_matches',
    'win_rate',
    
    # Away fighter
    'best_streak_other',
    'elo_other',
    'tier_other_A',
    'tier_other_B',
    'tier_other_P',
    'tier_other_S',
#     'tier_other_X', # Most common category
    'tier_elo_other',
    'average_bet_other',
    'total_matches_other',
    'win_rate_other',
    
    # Match information
#     'match_format_matchmaking', # Most common category
    'match_format_tournament',
    'match_tier_A',
    'match_tier_B',
    'match_tier_P',
    'match_tier_S',
#     'match_tier_X', # Most common category
]

xgb_train = df_train[xgb_columns]
xgb_val = df_val[xgb_columns]
xgb_test = df_test[xgb_columns]

In [9]:
print("VIF of features:")
for idx, col in enumerate(list(xgb_train.columns)):
    vif = variance_inflation_factor(xgb_train, idx)
    print(f" - {col}: {vif}")

VIF of features:
 - best_streak: 106.74902876646216
 - elo: 619.7301100220066
 - tier_A: 8720.378174434938
 - tier_B: 3654.1543862747926
 - tier_P: 116.51177862117734
 - tier_S: 6945.170636129256
 - tier_elo: 522.2910300458234
 - average_bet: 2.7142294094582726
 - total_matches: 34.1988813442017
 - win_rate: 97.29222154503046
 - best_streak_other: 72.6928905574653
 - elo_other: 567.3759339077448
 - tier_other_A: 11613.950372342177
 - tier_other_B: 4829.572595300563
 - tier_other_P: 201.9983163204563
 - tier_other_S: 9138.602158870646
 - tier_elo_other: 487.9278639117631
 - average_bet_other: 3.788268443798143
 - total_matches_other: 33.635069055043935
 - win_rate_other: 94.80084354237006
 - match_format_tournament: 1.2014679633560381
 - match_tier_A: 20254.11986359058
 - match_tier_B: 8377.164984412948
 - match_tier_P: 245.51660903598398
 - match_tier_S: 16186.688944485668


In [10]:
# define the parameter grid
params = {'max_depth': [1, 2, 3],
          'learning_rate': [0.01, 0.1, 1],
          'n_estimators': [100, 200, 300]}

# create an XGBoost classifier
clf = xgb.XGBClassifier()

# perform grid search using GridSearchCV
grid_search = GridSearchCV(estimator=clf, 
                           param_grid=params, 
                           cv=10, 
                           scoring='roc_auc'
#                            scoring='accuracy'
                          )
grid_search.fit(xgb_train, y_train)
print(f"[INFO] -- Fit model")

[INFO] -- Fit model


In [11]:
# print the best parameters and score
print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

# make predictions on the test data using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(xgb_test)
y_pred_probs = best_model.predict_proba(xgb_test)
accuracy = np.mean(y_pred == y_test)
print('Test accuracy:', accuracy)

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
Best score: 0.7961455512596414
Test accuracy: 0.7178568486945062


# CNN

In [12]:
cnn_columns = [
    # Home fighter
    'best_streak',
    'elo',
    'tier_A',
    'tier_B',
    'tier_P',
    'tier_S',
#     'tier_X', # Most common category
    'tier_elo',
    'average_bet',
    'total_matches',
    'win_rate',
    
    # Away fighter
    'best_streak_other',
    'elo_other',
    'tier_other_A',
    'tier_other_B',
    'tier_other_P',
    'tier_other_S',
#     'tier_other_X', # Most common category
    'tier_elo_other',
    'average_bet_other',
    'total_matches_other',
    'win_rate_other',
    
    # Match information
#     'match_format_matchmaking', # Most common category
    'match_format_tournament',
    'match_tier_A',
    'match_tier_B',
    'match_tier_P',
    'match_tier_S',
#     'match_tier_X', # Most common category
]

cnn_train = df_train[cnn_columns]
cnn_val = df_val[cnn_columns]
cnn_test = df_test[cnn_columns]

In [13]:
# # Define the model architecture
# model = Sequential()
# input_shape = cnn_train.shape[1:]
# model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# model.add(Flatten())
# model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(2, activation='softmax'))

# # Compile the model
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# # Fit the model to the training data
# model.fit(cnn_train, y_train, batch_size=32, epochs=10, verbose=1, validation_data=(cnn_val, y_val))

# Make Predictions

#### Match data preparation functions

In [14]:
# Get data for the current match
def get_current_match_data():    
    # store the response of URL
    response_match = urlopen(match_url)

    # storing the JSON response from url in data
    match_data_json = json.loads(response_match.read())

    # Define columns
    columns_match_data = {
        'best_streak': 'best_streak', # 0
        'elo': 'elo', # 1
        'tier': 'tier', # 2
        'tier_elo': 'tier_elo', # 3
        'average_bet': 'average_bet', # 4
        'total_matches': 'total_matches', # 5
        'win_rate': 'win_rate', # 6
        'match_format': 'match_format', # 7
        'match_tier': 'tier', # 8
        'best_streak_other': 'best_streak', # 9
        'elo_other': 'elo', # 10
        'tier_other': 'tier', # 11
        'tier_elo_other': 'tier_elo', # 12
        'average_bet_other': 'average_bet', # 13
        'total_matches_other': 'total_matches', # 14
        'win_rate_other': 'win_rate', # 15
    }

    data_extraction_match = list(columns_match_data.values())

    # Init match_data
    match_data = pd.DataFrame(columns = columns_match_data)

    # Define lists for data extraction
    sides = ['fighter_blue_info', 'fighter_red_info']

    temp_data = [
        match_data_json['fighter_blue_info'][data_extraction_match[0]],
        match_data_json['fighter_blue_info'][data_extraction_match[1]],
        match_data_json['fighter_blue_info'][data_extraction_match[2]],
        match_data_json['fighter_blue_info'][data_extraction_match[3]],
        match_data_json['fighter_blue_info']['stats'][data_extraction_match[4]],
        match_data_json['fighter_blue_info']['stats'][data_extraction_match[5]],
        match_data_json['fighter_blue_info']['stats'][data_extraction_match[6]],
        match_data_json[data_extraction_match[7]],
        match_data_json[data_extraction_match[8]],
        match_data_json['fighter_red_info'][data_extraction_match[9]],
        match_data_json['fighter_red_info'][data_extraction_match[10]],
        match_data_json['fighter_red_info'][data_extraction_match[11]],
        match_data_json['fighter_red_info'][data_extraction_match[12]],
        match_data_json['fighter_red_info']['stats'][data_extraction_match[13]],
        match_data_json['fighter_red_info']['stats'][data_extraction_match[14]],
        match_data_json['fighter_red_info']['stats'][data_extraction_match[15]],
    ]

    return pd.concat([match_data, pd.DataFrame([temp_data], columns = list(columns_match_data.keys()))])

# process encode columns
def process_current_data(df):
    # Blue fighter data
    df['tier_A'] = 0
    df['tier_B'] = 0
    df['tier_P'] = 0
    df['tier_S'] = 0
    
    # Red fighter data
    df['tier_other_A'] = 0
    df['tier_other_B'] = 0
    df['tier_other_P'] = 0
    df['tier_other_S'] = 0
    
    # Match data
    df['match_format_tournament'] = 0
    df['match_tier_A'] = 0
    df['match_tier_B'] = 0
    df['match_tier_P'] = 0
    df['match_tier_S'] = 0
    
    # Add to make scaling easier
    df['winner_binary'] = 0
    
    # Set correct dummy variables
    if df.tier[0] != 'X':
        df[f"tier_{df.tier[0]}"] = 1
    if df.tier_other[0] != 'X':
        df[f"tier_other_{df.tier_other[0]}"] = 1
    if df.match_format[0] != 'matchmaking':
        df[f"match_format_{df.match_format[0]}"] = 1
    if df.match_tier[0] != 'X':
        df[f"match_tier_{df.match_tier[0]}"] = 1
        
    return df

#### Make current match prediction

In [None]:
while True:
    time.sleep(12)
    print('.')
    time.sleep(1)
    print('.')
    time.sleep(1)
    
    processed_current_match_data = process_current_data(get_current_match_data())[df_all.columns]
    std_current_match_data = pd.DataFrame(scaler.transform(processed_current_match_data), columns=df_all.columns)
    prediction = best_model.predict(std_current_match_data[xgb_columns])[0]
    probability = best_model.predict_proba(std_current_match_data[xgb_columns])[0]

    threashhold = max(probability)
    returns_calc = []
    for idx, probs in enumerate(y_pred_probs):
        home_prob = probs[0]
        away_prob = probs[1]
        ground_truth = y_test[idx]

        if (home_prob >= threashhold) and (ground_truth == 0):
            returns_calc.append(1)
        elif (home_prob >= threashhold) and (ground_truth == 1):
            returns_calc.append(0)
        elif (away_prob >= threashhold) and (ground_truth == 1):
            returns_calc.append(1)
        elif (away_prob >= threashhold) and (ground_truth == 0):
            returns_calc.append(0)

    chance_correct = float(np.mean(returns_calc))*100
    
    clear_output(wait=True)
    
    if prediction == 0:
        print(f"[INFO] -- Prediction: Blue")
        print(f"[INFO] -- Probability: {round(float(probability[0]*100), 3)}%")
        print(f"[INFO] -- Chance correct: {chance_correct}%")
    elif prediction == 1:
        print(f"[INFO] -- Prediction: Red")
        print(f"[INFO] -- Probability: {round(float(probability[1]*100), 3)}%")
        print(f"[INFO] -- Chance correct: {chance_correct}%")
        
    if threashhold > 0.9:
        mixer.music.play()
        time.sleep(15)

[INFO] -- Prediction: Blue
[INFO] -- Probability: 69.327%
[INFO] -- Chance correct: 81.48313602859058%


#### General Betting Rules of Thumb

"Probability" Threashholds:
- 50%-90%: 0
- 90%-95%: 5000
- 95%- on: Judgement
- 100%: All In YYYEEEEEHAWWWWWWW

# Calculating Returns

In [None]:
# split the dataframe into training, validation, and testing sets
df_train_calculations, df_test_calculations = train_test_split(df, test_size=test_train_split_percentage, 
                                                               random_state=notebook_seed)
df_train_calculations, df_val_calculations = train_test_split(df_train_calculations, test_size=train_validation_split_percentage, 
                                                              random_state=notebook_seed)

# Notes

- Interacting through a twitch channel requires registering your bot with twitch first, getting an Oauth token (personalized)...

- then connecting to the twitch IRC, sending keepalive messages, and then sending your message when you'd like..

- BUT you're looking to interact with saltybet's website itself...which my bot also does....and that requires beautifulsoup for python

- but yeah, it was just beautiful soup to parse the info from the SB website. To actually interact with it and place bets, all I needed was the "socket" module. Connect to the IRC thru the socket with your creds, then post messages through the socket. All in 1 module.

- socket.recv (to read twitch chat), and socket.send (to send messages(must be in proper format))