In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
print("TensorFlow v" + tf.__version__)

In [2]:
data = pd.read_csv('starcraft_player_data.csv')
print("Full train dataset shape is {}".format(data.shape))

Full train dataset shape is (3395, 20)


#### The data is composed of 20 columns and 3395 entries. We can see all 20 dimensions of the dataset by printing the first 5 entries using the following code:

In [11]:
# display first 5 examples
data.head(5)

Unnamed: 0,GameID,LeagueIndex,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,...,TotalHours_936,TotalHours_94,TotalHours_95,TotalHours_950,TotalHours_96,TotalHours_960,TotalHours_980,TotalHours_990,TotalHours_999,TotalHours_?
0,-1.747724,0.537805,0.513521,-0.148288,-0.684207,1.116679,0.069594,0.012948,1.397121,-0.448594,...,0,0,0,0,0,0,0,0,0,0
1,-1.746621,0.537805,0.234613,-0.188288,-0.507399,-0.154515,1.180306,0.11938,0.850859,-0.433919,...,0,0,0,0,0,0,0,0,0,0
2,-1.746253,-0.121346,-0.906582,-0.605182,-0.168988,-0.154515,1.177699,0.196244,-0.541384,0.249894,...,0,0,0,0,0,0,0,0,0,0
3,-1.745885,-0.780497,-0.181859,-0.617967,-0.713535,-1.42571,-0.271524,0.41378,0.322194,-0.649597,...,0,0,0,0,0,0,0,0,0,0
4,-1.745518,-0.780497,0.112517,-0.598573,-0.205646,-1.001978,-0.592757,2.496707,-1.103252,-1.030437,...,0,0,0,0,0,0,0,0,0,0


In [13]:
data.info()
data.describe(include='all').T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3395 entries, 0 to 3394
Columns: 318 entries, GameID to TotalHours_?
dtypes: float64(17), int64(1), uint8(300)
memory usage: 1.4 MB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
GameID,3395.0,-7.418056e-16,1.000147,-1.747724,-0.860627,0.025367,0.847012,1.945174
LeagueIndex,3395.0,-8.656796e-16,1.000147,-2.098798,-0.780497,-0.121346,0.537805,2.515258
APM,3395.0,4.839853e-17,1.000147,-1.828873,-0.715218,-0.173992,0.495661,5.252153
SelectByHotkeys,3395.0,5.729339e-17,1.000147,-0.813578,-0.575519,-0.340513,0.157829,7.341461
AssignToHotkeys,3395.0,2.376106e-16,1.000147,-1.661072,-0.753001,-0.093410,0.556950,6.129886
...,...,...,...,...,...,...,...,...
TotalHours_960,3395.0,8.836524e-04,0.029718,0.000000,0.000000,0.000000,0.000000,1.000000
TotalHours_980,3395.0,2.945508e-04,0.017162,0.000000,0.000000,0.000000,0.000000,1.000000
TotalHours_990,3395.0,2.945508e-04,0.017162,0.000000,0.000000,0.000000,0.000000,1.000000
TotalHours_999,3395.0,2.945508e-04,0.017162,0.000000,0.000000,0.000000,0.000000,1.000000


In [3]:
# separate numerical and categorical variables for easy analysis
cat_cols = data.select_dtypes(include=['object']).columns
num_cols = data.select_dtypes(include=np.number).columns.tolist()

print("Categorical values: ")
print(cat_cols)
print("Numerical values: ")
print(num_cols)

Categorical values: 
Index(['Age', 'HoursPerWeek', 'TotalHours'], dtype='object')
Numerical values: 
['GameID', 'LeagueIndex', 'APM', 'SelectByHotkeys', 'AssignToHotkeys', 'UniqueHotkeys', 'MinimapAttacks', 'MinimapRightClicks', 'NumberOfPACs', 'GapBetweenPACs', 'ActionLatency', 'ActionsInPAC', 'TotalMapExplored', 'WorkersMade', 'UniqueUnitsMade', 'ComplexUnitsMade', 'ComplexAbilitiesUsed']


In [4]:
# default performance = 0
performance_values = 0

# add the 'performance' column to data
data['PlayerPerformance'] = performance_values
data.to_csv('your_dataset.csv', index=False)

data.head()

Unnamed: 0,GameID,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,...,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed,PlayerPerformance
0,52,5,27,10,3000,143.718,0.003515,0.00022,7,0.00011,...,0.004849,32.6677,40.8673,4.7508,28,0.001397,6,0.0,0.0,0
1,55,5,23,10,5000,129.2322,0.003304,0.000259,4,0.000294,...,0.004307,32.9194,42.3454,4.8434,22,0.001194,5,0.0,0.000208,0
2,56,4,30,10,200,69.9612,0.001101,0.000336,4,0.000294,...,0.002926,44.6475,75.3548,4.043,22,0.000745,6,0.0,0.000189,0
3,57,3,19,20,400,107.6016,0.001034,0.000213,1,5.3e-05,...,0.003783,29.2203,53.7352,4.9155,19,0.000426,7,0.0,0.000384,0
4,58,3,32,10,500,122.8908,0.001136,0.000327,2,0.0,...,0.002368,22.6885,62.0813,9.374,15,0.001174,4,0.0,1.9e-05,0


In [5]:
# separate features and target variable
categorical_cols = ['Age', 'HoursPerWeek', 'TotalHours']
numerical_cols = ['GameID', 'LeagueIndex', 'APM', 'SelectByHotkeys', 'AssignToHotkeys', 'UniqueHotkeys', 'MinimapAttacks', 'MinimapRightClicks', 'NumberOfPACs', 'GapBetweenPACs', 'ActionLatency', 'ActionsInPAC', 'TotalMapExplored', 'WorkersMade', 'UniqueUnitsMade', 'ComplexUnitsMade', 'ComplexAbilitiesUsed']
target_col = 'PlayerPerformance'

# convert categorical variables to encoding
data = pd.get_dummies(data, columns=categorical_cols)

# scale numerical variables
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# split the data into training and testing sets
x = data.drop(target_col, axis=1).values
y = data[target_col].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [6]:
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=(x_train.shape[1], 1)),
    tf.keras.layers.Dense(1)
])

model.compile(loss='mean_squared_error', optimizer='adam')

In [7]:
x_train_rnn = np.expand_dims(x_train, axis=2)  # reshape input

model.fit(x_train_rnn, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7c105a9e20>

In [8]:
x_test_rnn = np.expand_dims(x_test, axis=2)  # reshape input for RNN

predictions = model.predict(x_test_rnn)
mse = np.mean((predictions - y_test) ** 2)
print("Mean Squared Error:", mse)

Mean Squared Error: 2.6291605811461843e-08


In [9]:
# define RNN model
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=(x_train.shape[1], 1)),
    tf.keras.layers.Dense(1)
])

# plot the model
plot_model(model, to_file='model_rnn.png', show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


#### Hypothetical: the model would be better if we had the rank of the players from previous years / competitions. The first test would train the neural map using the previous performance of the player and then predict their 2023 ranking, looking only one game into the future at a time, meaning the map would benefit from all real data up to the game in question. The graph would then show the predictions compared to the real data. The two graph lines shouldn't match up exactly because that would mean the RNN memorized the training data rather than create a prediction. 