In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# record the TensorFlow release this notebook was executed with
print(f"TensorFlow v{tf.__version__}")

TensorFlow v2.12.0


In [3]:
# load the raw player table and report its (rows, columns) shape
data = pd.read_csv('starcraft_player_data.csv')
print(f"Full train dataset shape is {data.shape}")

Full train dataset shape is (3395, 20)


#### The dataset has 20 columns and 3,395 rows. Printing the first 5 rows with the following code shows all 20 columns:

In [4]:
# preview the first five rows of the table
data.head(n=5)

Unnamed: 0,GameID,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed
0,52,5,27,10,3000,143.718,0.003515,0.00022,7,0.00011,0.000392,0.004849,32.6677,40.8673,4.7508,28,0.001397,6,0.0,0.0
1,55,5,23,10,5000,129.2322,0.003304,0.000259,4,0.000294,0.000432,0.004307,32.9194,42.3454,4.8434,22,0.001194,5,0.0,0.000208
2,56,4,30,10,200,69.9612,0.001101,0.000336,4,0.000294,0.000461,0.002926,44.6475,75.3548,4.043,22,0.000745,6,0.0,0.000189
3,57,3,19,20,400,107.6016,0.001034,0.000213,1,5.3e-05,0.000543,0.003783,29.2203,53.7352,4.9155,19,0.000426,7,0.0,0.000384
4,58,3,32,10,500,122.8908,0.001136,0.000327,2,0.0,0.001329,0.002368,22.6885,62.0813,9.374,15,0.001174,4,0.0,1.9e-05


In [5]:
# column dtypes and non-null counts (printed), followed by per-column summary
# statistics with one row per column (displayed as the cell result)
data.info()
data.describe(include='all').transpose()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3395 entries, 0 to 3394
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   GameID                3395 non-null   int64  
 1   LeagueIndex           3395 non-null   int64  
 2   Age                   3395 non-null   object 
 3   HoursPerWeek          3395 non-null   object 
 4   TotalHours            3395 non-null   object 
 5   APM                   3395 non-null   float64
 6   SelectByHotkeys       3395 non-null   float64
 7   AssignToHotkeys       3395 non-null   float64
 8   UniqueHotkeys         3395 non-null   int64  
 9   MinimapAttacks        3395 non-null   float64
 10  MinimapRightClicks    3395 non-null   float64
 11  NumberOfPACs          3395 non-null   float64
 12  GapBetweenPACs        3395 non-null   float64
 13  ActionLatency         3395 non-null   float64
 14  ActionsInPAC          3395 non-null   float64
 15  TotalMapExplored     

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
GameID,3395,,,,4805.01,2719.94,52.0,2464.5,4874.0,7108.5,10095.0
LeagueIndex,3395,,,,4.18409,1.51733,1.0,3.0,4.0,5.0,8.0
Age,3395,29.0,20.0,357.0,,,,,,,
HoursPerWeek,3395,33.0,10.0,411.0,,,,,,,
TotalHours,3395,238.0,500.0,328.0,,,,,,,
APM,3395,,,,117.047,51.9453,22.0596,79.9002,108.01,142.79,389.831
SelectByHotkeys,3395,,,,0.00429866,0.00528443,0.0,0.00125782,0.00249951,0.00513257,0.0430884
AssignToHotkeys,3395,,,,0.000373576,0.000224934,0.0,0.000204226,0.000352568,0.000498834,0.00175219
UniqueHotkeys,3395,,,,4.36465,2.36033,0.0,3.0,4.0,6.0,10.0
MinimapAttacks,3395,,,,9.83067e-05,0.000165871,0.0,0.0,3.99297e-05,0.000118854,0.00301935


In [6]:
# split the column names by dtype: object-typed columns vs. numeric columns
cat_cols = data.select_dtypes(include=['object']).columns
num_cols = list(data.select_dtypes(include=np.number).columns)

# one print call; the newline separator reproduces one item per line
print("Categorical values: ", cat_cols, "Numerical values: ", num_cols, sep="\n")

Categorical values: 
Index(['Age', 'HoursPerWeek', 'TotalHours'], dtype='object')
Numerical values: 
['GameID', 'LeagueIndex', 'APM', 'SelectByHotkeys', 'AssignToHotkeys', 'UniqueHotkeys', 'MinimapAttacks', 'MinimapRightClicks', 'NumberOfPACs', 'GapBetweenPACs', 'ActionLatency', 'ActionsInPAC', 'TotalMapExplored', 'WorkersMade', 'UniqueUnitsMade', 'ComplexUnitsMade', 'ComplexAbilitiesUsed']


In [7]:
# Placeholder target: every row receives the same default performance value (0).
# NOTE(review): a constant target carries no signal; swap in real performance
# values before drawing conclusions from any model trained on it.
performance_values = 0

# append the 'PlayerPerformance' column and persist the augmented table
data = data.assign(PlayerPerformance=performance_values)
data.to_csv('your_dataset.csv', index=False)

data.head()

Unnamed: 0,GameID,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,...,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed,PlayerPerformance
0,52,5,27,10,3000,143.718,0.003515,0.00022,7,0.00011,...,0.004849,32.6677,40.8673,4.7508,28,0.001397,6,0.0,0.0,0
1,55,5,23,10,5000,129.2322,0.003304,0.000259,4,0.000294,...,0.004307,32.9194,42.3454,4.8434,22,0.001194,5,0.0,0.000208,0
2,56,4,30,10,200,69.9612,0.001101,0.000336,4,0.000294,...,0.002926,44.6475,75.3548,4.043,22,0.000745,6,0.0,0.000189,0
3,57,3,19,20,400,107.6016,0.001034,0.000213,1,5.3e-05,...,0.003783,29.2203,53.7352,4.9155,19,0.000426,7,0.0,0.000384,0
4,58,3,32,10,500,122.8908,0.001136,0.000327,2,0.0,...,0.002368,22.6885,62.0813,9.374,15,0.001174,4,0.0,1.9e-05,0


In [8]:
# separate features and target variable
# Age, HoursPerWeek and TotalHours hold numbers but load with `object` dtype, so
# they are coerced to numeric below instead of being one-hot encoded (one-hot
# encoding would create one column per distinct value and discard their ordering).
categorical_cols = ['Age', 'HoursPerWeek', 'TotalHours']
numerical_cols = ['GameID', 'LeagueIndex', 'APM', 'SelectByHotkeys', 'AssignToHotkeys', 'UniqueHotkeys', 'MinimapAttacks', 'MinimapRightClicks', 'NumberOfPACs', 'GapBetweenPACs', 'ActionLatency', 'ActionsInPAC', 'TotalMapExplored', 'WorkersMade', 'UniqueUnitsMade', 'ComplexUnitsMade', 'ComplexAbilitiesUsed']
target_col = 'PlayerPerformance'

# NOTE(review): GameID looks like a row identifier rather than a predictive
# feature - confirm whether it should stay in the feature set.
# NOTE(review): confirm `target_col` holds a real target; a constant column
# makes any reported error trivially close to zero.

# Build the feature table on a copy so `data` itself is left untouched and this
# cell can be re-run without errors.
features = data.drop(columns=[target_col]).copy()

# convert the object-typed columns to numbers; unparseable entries become NaN
features[categorical_cols] = features[categorical_cols].apply(pd.to_numeric, errors='coerce')

x = features.to_numpy(dtype='float64')
y = data[target_col].values

# split the data into training and testing sets BEFORE fitting any statistics,
# so nothing learned from the test rows leaks into preprocessing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# fill the gaps left by coercion with per-column medians of the training rows only
train_medians = np.nanmedian(x_train, axis=0)
x_train = np.where(np.isnan(x_train), train_medians, x_train)
x_test = np.where(np.isnan(x_test), train_medians, x_test)

# scale every feature using mean/std estimated on the training rows only
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train).astype(np.float32)
x_test = scaler.transform(x_test).astype(np.float32)

In [9]:
# LSTM regressor: the input has one step per feature column (x_train.shape[1])
# with a single value per step; a 64-unit LSTM feeds one linear output unit
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(64, input_shape=(x_train.shape[1], 1)))
model.add(tf.keras.layers.Dense(1))

# train with Adam against mean squared error
model.compile(optimizer='adam', loss='mean_squared_error')

In [10]:
# append a trailing length-1 axis so the 2-D training matrix matches the
# (steps, 1) input shape declared for the LSTM layer
x_train_rnn = x_train[:, :, np.newaxis]

model.fit(x=x_train_rnn, y=y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fed0bea9c40>

In [11]:
x_test_rnn = np.expand_dims(x_test, axis=2)  # reshape input for RNN

predictions = model.predict(x_test_rnn)

# The Dense(1) head yields predictions of shape (n, 1) while y_test is (n,).
# Subtracting them directly would broadcast to an (n, n) matrix and average
# over every prediction/target pair, so flatten the predictions first to
# compare each prediction only with its own target.
mse = np.mean((predictions.ravel() - y_test) ** 2)
print("Mean Squared Error:", mse)

Mean Squared Error: 5.463003204584331e-10


In [None]:
# plot the RNN model
# Reuse the `model` object defined in the model-building cell rather than
# constructing a second, untrained copy under the same name: rebinding `model`
# here would discard the fitted network for any cell run afterwards.
plot_model(model, to_file='model_rnn.png', show_shapes=True)

#### Hypothetical: the model would be better if we had the players' ranks from previous years / competitions. A first test would train the neural network on each player's previous performance and then predict their 2023 ranking, looking only one game into the future at a time, so that every prediction benefits from all real data up to the game in question. A graph would then show the predictions compared to the real data. The two lines shouldn't match up exactly, because that would mean the RNN had memorized the training data rather than learned to make predictions.