#### Elective 3 - ML Comparison

In [102]:
# load data
import json
import pandas as pd
import random
from hashlib import sha256
from typing import List

json_path = 'matches/simple_match_set.json'
# JSON text is UTF-8 by specification; pass the encoding explicitly so that
# decoding does not depend on the platform's locale default.
with open(json_path, 'r', encoding='utf-8') as file:
    data: List = json.load(file)

# build the working frame from the parsed JSON
# (assumes the file holds a list of per-match records — TODO confirm)
df = pd.DataFrame(data)

# function that 'flattens' an array (in this case the combination of heroes in a team)
# and returns a random float generated using a seed (an int of the hashed array)
def randfloat_from_arr(arr, min_val: float = 0, max_val: float = 1.0) -> float:
    """Map an array to a deterministic pseudo-random float in [min_val, max_val].

    The string form of ``arr`` is hashed with SHA-256 and the digest, read as
    an integer, seeds a random generator from which one uniform sample is drawn.
    The same ``str(arr)`` therefore always yields the same float.

    Because the input is ``str(arr)``, the same elements in a different order
    produce a different float. The result is only an identifier: numerically
    close floats do not imply similar arrays.

    Args:
        arr: Any object with a stable ``str()`` representation (e.g. a list).
        min_val: Lower bound of the sampled range.
        max_val: Upper bound of the sampled range.

    Returns:
        A float drawn from ``uniform(min_val, max_val)`` with the hash as seed.
    """
    arr_hash = sha256(str(arr).encode()).hexdigest()
    seed = int(arr_hash, 16)
    # Use a private generator instead of random.seed(): it yields exactly the
    # same value for a given seed, but does not overwrite the global `random`
    # state that other code in the kernel may rely on.
    return random.Random(seed).uniform(min_val, max_val)

# replace each team's lineup with its hash-derived float in the range 0..100
for lineup_col in ('radiant_lineup', 'dire_lineup'):
    df[lineup_col] = df[lineup_col].apply(
        lambda lineup: randfloat_from_arr(lineup, 0, 100.0)
    )

print(df.head())

# target is the match outcome; every remaining column is used as a feature
# NOTE(review): radiant_score / dire_score stay in X — if these are
# end-of-match scores they may leak the outcome; confirm this is intended.
y = df['radiant_win']
X = df.drop(columns=['radiant_win'])

   radiant_score  dire_score  radiant_lineup  dire_lineup  radiant_win
0             26          11       40.233268    33.977866         True
1              4          25       81.151517    59.099905        False
2             41          19       84.614570    97.955098         True
3              8          30       10.676171     4.121101        False
4             37           5        0.620122    85.992786         True


In [103]:
# random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# build a 100-tree forest and train it (fit() returns the estimator itself)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# mean accuracy on the held-out rows
accuracy = model.score(X_test, y_test)
print(f"Random Forest Accuracy: {accuracy:.2f}")

Random Forest Accuracy: 0.93


In [104]:
# svm
from sklearn import svm
from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# held-out rows influence the mean/variance used to transform the training
# data (test-set leakage into preprocessing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
# learn the scaling statistics from the training rows only ...
X_train = scaler.fit_transform(X_train)
# ... and reuse those same statistics for the held-out rows
X_test = scaler.transform(X_test)

# full feature matrix scaled with the train-only statistics; kept because the
# name `svm_X` was defined by this cell before and other cells may use it
svm_X = scaler.transform(X)

# linear-kernel support vector classifier trained on the scaled features
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# mean accuracy on the held-out rows
accuracy = svm_model.score(X_test, y_test)
print(f"SVM Accuracy: {accuracy:.2f}")

SVM Accuracy: 0.93


In [105]:
# gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

# re-split the unscaled features (the SVM cell above overwrote X_train / X_test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 100 boosting stages with a 0.1 learning rate; fit() returns the estimator itself
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# mean accuracy on the held-out rows
accuracy = model.score(X_test, y_test)
print(f"Gradient Boosting Accuracy: {accuracy:.2f}")

Gradient Boosting Accuracy: 0.93


In [106]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# same 80/20 split as the other models (same seed), for a like-for-like comparison
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# single decision tree with a fixed seed; fit() returns the estimator itself
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# score the predictions for the held-out rows against the true labels
y_pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy: {accuracy:.2f}")

# one importance value per feature, in the column order of X
print("Feature importances:", clf.feature_importances_)

Decision Tree Accuracy: 0.89
Feature importances: [0.42218932 0.47561357 0.04957533 0.05262178]
