In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Data viz
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


# Machine Learning
from sklearn.model_selection import train_test_split

# Classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor


np.warnings.filterwarnings('ignore')

# Arcade Game Stats Data Analysis / Blockbreaker

## Intro 

Blockbreaker is a classic arcade game with the object of the game being for the player to break as many blocks as they can without causing the ball to fall.

## About the data set

The author is in the process of creating a blockbreaker-like game, in which the jumping-off point is the "Block Breaker" section of the Udemy course, Complete C# Unity Developer 2D: Learn to Code Making Games

After making lots of levels, the author needed to sort them by difficulty. How does one measure the difficulty of a level? A first-cut solution is
to make an auto-play bot that is not perfect, and see how well the bot does on each level, using thousands of trials.

## Purpose of the notebook

The purpose of this notebook is to analyse the data provided by the game and extract some insights about how the game works. What makes the game hard? Is there a factor that might help a player win? Can we predict the outcome of the game from the data? Such insights can help make the game challenging yet fun to play.

## Features of the dataset

- Date: date and time the game was auto-played
- Level: the name of the level (the 3-digit number is an estimate of the difficulty from a previous run, no longer valid after tweaking)
- NumBlocks: how many blocks have to be broken to win the level
- IsWin: True if autoplay broke all the blocks, False if the ball fell past the paddle.
- ElapsedTime: Seconds until either won or lost (game is played at 4x speed, so multiply by 4 to get an estimate of how long a human might play it)
- Score: total score when the game was won or lost
- Accuracy: the autoplay is tuned with a randomly-chosen accuracy. Higher numbers are more likely to win.

# 1. Data loading and data cleaning

In [None]:
# Load the raw auto-play statistics into the notebook-wide frame `gam`.
gam = pd.read_csv('../input/arcade-game-stats/GameStats.csv')

display(gam)
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() emitted an extra "None" line after the report.
gam.info()
print("\n", gam.describe())
print("\n", gam.shape)

# Visual null map: any missing cell would show up as a contrasting stripe.
sns.heatmap(gam.isnull())

from the above tables and graph we can start getting some insights on how we can start cleaning the data.

The most important thing to remark right now is that we don't have any null values.

Lets go feature by feature to clean it.

## 1.1 Date
Since a bot that doesn't learn is playing the game, I doubt that this field is meaningful. Let's plot date vs. score to see whether the bot gets better over time. If the field doesn't prove to be meaningful, it will be dropped.

In [None]:
# Order the runs by their 'Date' value and plot the score of each run, to see
# whether the score drifts over time.
date = gam.sort_values('Date')
sns.scatterplot(data=date, x='Date', y='Score')
plt.show()

As thought, the bot doesn't show any trend. Let's drop the 'Date' column

In [None]:
# Drop the 'Date' column. errors='ignore' keeps the cell re-runnable:
# `del gam['Date']` raised KeyError on a second execution.
gam = gam.drop(columns=['Date'], errors='ignore')

## 1.2 Level
The info function shows us that the "Level" feature is of type object when it should be an integer representing the level of difficulty.

In [None]:
# Peek at the raw 'Level' values before cleaning them.
gam.loc[:, 'Level']

let's remove that "Level_" part and change the data type to integer

In [None]:
# Remove the literal "Level_" prefix and store the remainder as an integer.
# str.lstrip('Level_') strips any leading run of the characters L, e, v, l, _
# (a character set, not a prefix), so an anchored pattern is used instead.
# Casting through str first keeps the cell re-runnable once the column is int.
gam['Level'] = (
    gam['Level']
    .astype(str)
    .str.replace(r'^Level_', '', regex=True)
    .astype(int)
)

## 1.3 NumBlocks & IsWin
Both features have their correct data type, int and bool respectively

## 1.4 ElapsedTime
Since the game was played at 4x speed, we will multiply this value by 4 to make it more human understandable

In [None]:
# Convert the recorded (4x speed) duration to real-time seconds.
# NOTE(review): not idempotent - each re-run of this cell multiplies again.
gam['ElapsedTime'] = gam['ElapsedTime'].mul(4)

## 1.5 Score & Accuracy
Both features have their correct data type, int and float respectively

# 2. Descriptive analysis
Having all the correct data types and cleaned data we can begin with our descriptive analysis.

In [None]:
gam.info()

# Separate numeric and boolean column names.
# 'number' matches every numeric width; the previous ['float64', 'int64'] list
# silently skipped 'Level' on platforms where astype(int) produces int32.
# Boolean columns are not matched by 'number', so the two groups stay disjoint.
numbers = gam.select_dtypes(include='number').columns
booleans = gam.select_dtypes(include='bool').columns

## 2.1 Numbers

In [None]:
# One 10-bin histogram per numeric column.
gam.loc[:, numbers].hist(
    bins=10,
    figsize=(20, 10),
    color='aliceblue',
    edgecolor='black',
)
plt.show()

def todf(column, data=None, bins=10):
    """Tabulate how many rows fall into each equal-width bin of a column.

    Parameters
    ----------
    column : str
        Name of the numeric column to bin.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-wide `gam`.
    bins : int, optional
        Number of equal-width bins passed to ``pd.cut`` (default 10).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``"<column> bins"`` (the intervals, in ascending order)
        and ``"<column> count"`` (rows per interval, empty bins included).
    """
    frame = gam if data is None else data
    counts = pd.cut(frame[column], bins=bins).value_counts().sort_index()
    # Name the axis and the values explicitly instead of renaming whatever
    # reset_index() produces: pandas 2.0 changed those default names, which
    # made the old rename misfire and the following sort raise KeyError.
    return counts.rename_axis(column + ' bins').reset_index(name=column + ' count')

# Build the per-bin count table for every numeric feature.
level, numblocks, ElapsedTime, score, accuracy = map(
    todf, ['Level', 'NumBlocks', 'ElapsedTime', 'Score', 'Accuracy']
)

# Show the five tables side by side, followed by the summary statistics.
display(pd.concat([level, numblocks, ElapsedTime, score, accuracy], axis='columns'))
display(gam[numbers].describe())

**Level**:
- The values are fairly evenly distributed across the sample.
- Mean and median are really close together, which means that the distribution is not skewed.
- Because the distribution is roughly uniform, the interquartile range is not very informative. In this case the distribution is best described by its range, from the minimum value (378) to the maximum value (715).

**NumBlock**:
- The first five bins resemble a normal distribution, then all the next bins are empty until we find an outlier in the last bin with 400 samples.
- The mean is a bit higher than the median, which is the result of the right-hand outlier pulling the distribution to the right. The effect, though, is not significant.
- Taking the outliers out, most of the values are found between 28 and 50 (IQR)

**ElapsedTime**
- Here we do have an outlier that significantly distorts the distribution: most of the values fall in the first bin (6809), the middle bins are empty, and the last bin holds only 5 samples. **These five samples could be an error and are probably going to be dropped.**
- We see how large the mean is relative to the median, which further reflects the effect of the outlier.
- Leaving the outliers aside, most of the values are located between 32.13 and 116.33 (IQR).

**Score**
- The distribution is skewed to the right, reflected by the mean being bigger than the median. Still, the values are not far apart and we can keep the outliers
- Taking extreme values out, most of the values are between 1150 and 3550

**Accuracy**
- Somewhat similar to ElapsedTime, but mirrored: we have 16 samples in the first bin and the remainder in the last 3. The mean is lower than the median, which reflects this. The small scale of the values makes it hard to recognize the outliers from the mean and median alone, but the effect is still significant.
- Leaving extreme values aside, most of the values are located between 0.325245 and 0.374960.

## 2.2 Boolean

In [None]:
# Count of won vs. lost games.
# x must be passed by keyword: seaborn >= 0.12 takes `data` as the first
# positional parameter, so the old positional 'IsWin' raised TypeError.
sns.countplot(x='IsWin', data=gam[booleans], palette='Set3')
plt.show()

# Absolute and relative frequency of each outcome.
# reset_index(name=...) names the count column explicitly; the previous
# rename(columns={0: 'Count'}) was a no-op on pandas >= 2.0 (where the column
# is already called 'count'), which made win['Count'] raise KeyError.
win = gam[booleans].value_counts().reset_index(name='Count')
win['%'] = np.around(win['Count'] / win['Count'].sum() * 100, 2)
win

- We have balanced data (wins and losses are fairly evenly distributed), which is excellent news for our analysis. This will make interpretation and model building easier.

# 3. So what does it take to win?

let's find out

In [None]:
# Mean and median of every numeric feature, split by game outcome.
gam.groupby(by='IsWin')[numbers].aggregate(['mean', 'median'])

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))

# One bar chart per numeric feature, filled row by row; the sixth panel of the
# 2x3 grid is left empty.
for panel, column in zip(ax.flat, ['Level', 'NumBlocks', 'ElapsedTime', 'Score', 'Accuracy']):
    sns.barplot(data=gam, x='IsWin', y=column, ax=panel, palette='Set3', ci=None)

# Numeric counterpart of the charts.
display(gam.groupby('IsWin')[numbers].agg(['mean', 'median']))

plt.show()

- Two plots show a noticeably larger value for 'True' than for 'False': ElapsedTime and Score. But, if we recall section 2.1, we know that 'ElapsedTime' is influenced by outliers. We therefore take 'Score' as the most determinant factor for winning, which makes sense in the context of gaming. Let's dig deeper here.


In [None]:
fig, ax = plt.subplots(figsize=(20, 8))

# Bucket the scores into 20 equal-width intervals.
gam['Score bins'] = pd.cut(gam['Score'], bins=20)

# Wins and losses per score bin.
sns.countplot(x='Score bins', data=gam, hue='IsWin', ax=ax, palette='Set3')
order = gam['Score bins'].value_counts().sort_index().keys()

# Games per (outcome, bin).
# observed=False is spelled out so that empty bins are kept as zero counts:
# the merge below is an inner join, and a bin missing from either side would
# silently disappear from the win-proportion line.
group = (
    gam.groupby(['IsWin', 'Score bins'], observed=False)
    .size()
    .reset_index(name='Count')
)
groupf = group[~group['IsWin']]
groupt = group[group['IsWin']]

# Losses and wins side by side, one row per bin.
df = pd.merge(groupf, groupt, on='Score bins', suffixes=[' False', ' True'])
df['Win Proportion %'] = np.around(
    df['Count True'] / (df['Count False'] + df['Count True']) * 100, 2
)
df = df.drop(['IsWin False', 'IsWin True'], axis=1)
df['Score bins'] = df['Score bins'].astype(str)

# Overlay the win percentage on the count plot. The tick labels are set once,
# after the last artist is drawn (the earlier duplicate call was redundant).
ax.plot(df['Score bins'], df['Win Proportion %'], label='Win proportion')
ax.set_xticklabels(order, rotation=90)
ax.legend()

plt.show()

display(gam.groupby(['IsWin', 'Score bins'], observed=False).size())
display(df)

- We see in the graph how the bot losses as the score increases
- The proportion of wins gets bigger as the score increases.
- Something happens from score 5355 till score 11305, where, even though the score keeps increasing, there are 0 wins. **We'll check that in section 3.1**

## 3.1 what happens from score 5355?

In [None]:
# Compare the games at or below the score where wins stop with those above it.
# Using <= / > makes the two groups a full partition; with < / > any game that
# scored exactly 5355 was left out of both tables.
SCORE_CUTOFF = 5355
display(gam[gam['Score'] <= SCORE_CUTOFF].describe())
display(gam[gam['Score'] > SCORE_CUTOFF].describe())

- From the tables we can conclude that this is a matter of distribution: there are only about 200 samples from a score of 5355 onwards, so this is the influence of outliers.

# 4. How do we get a high score?
So we know that we need a high score to win, but how can we get it?

In [None]:
# Pairwise correlation of the numeric features, on a fixed [-1, 1] colour scale
# with the coefficient written in each cell.
sns.heatmap(
    data=gam[numbers].corr(),
    annot=True,
    vmin=-1,
    vmax=1,
    cmap=sns.diverging_palette(20, 220, as_cmap=True),
)
plt.show()

If we focus our attention on "Score", we can extract two relevant relationships: Score-NumBlocks and Score-Accuracy.

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(17, 5))

# Score against each of its two strongest correlates, with a fitted line.
sns.regplot(data=gam, x='NumBlocks', y='Score', color='teal', marker=',', ax=ax[0])
sns.regplot(data=gam, x='Accuracy', y='Score', color='teal', marker=',', ax=ax[1])

Let's take the outliers out and use density hexagons on accuracy to better represent the relationship.

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))

# Same relationships with the extreme rows filtered out: levels with 100 or
# more blocks, and runs with an accuracy of 0.15 or less.
sns.regplot(data=gam.loc[gam['NumBlocks'] < 100], x='NumBlocks', y='Score', color='teal', ax=ax)
sns.jointplot(data=gam.loc[gam['Accuracy'] > 0.15], x='Accuracy', y='Score', kind='hex', gridsize=20)

- So we do see the relationship between the variables: the higher the accuracy and number of blocks, the higher the score.

# 5. Classification models: Can we predict wins?

Let's try different models and choose the best one

In [None]:
# The first five names are the predictors, the last one is the target.
features = ['Level', 'NumBlocks', 'ElapsedTime', 'Score', 'Accuracy','IsWin']

gam_model = gam[features]
gam_model_X = gam_model[features[:5]].to_numpy()
gam_model_Y = gam_model[features[5]].to_numpy()

# Default train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    gam_model_X,
    gam_model_Y,
    random_state=0,
)

In [None]:
# k-nearest neighbours classifier (k = 4)

clf = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)

# Mean accuracy on the training and on the held-out data.
print("training set score : {:.2f}".format(clf.score(X_train, y_train)))
print("test set score: {:.2f}".format(clf.score(X_test, y_test)))

In [None]:
# Logistic regression with scikit-learn's default settings

lr = LogisticRegression()
lr.fit(X_train, y_train)

# Mean accuracy on the training and on the held-out data.
print("training set score : {:.2f}".format(lr.score(X_train, y_train)))
print("test set score: {:.2f}".format(lr.score(X_test, y_test)))

In [None]:
# Linear support vector classifier with C=150

svc = LinearSVC(C=150)
svc.fit(X_train, y_train)

# Mean accuracy on the training and on the held-out data.
print("training set score : {:.2f}".format(svc.score(X_train, y_train)))
print("test set score: {:.2f}".format(svc.score(X_test, y_test)))

In [None]:
# Decision Tree Classifier

tree = DecisionTreeClassifier(max_depth=11, random_state=0).fit(X_train, y_train)
print("training set score : {:.2f}".format(tree.score(X_train, y_train)))
print("test set score: {:.2f}".format(tree.score(X_test, y_test)))

print("feature importances:")
# One row per predictor (the first five columns of gam_model), with names and
# importances as regular columns. Previously the names were passed as the data
# and the importances as the index, so the table had float index labels and an
# unnamed column.
feature_importance = pd.DataFrame({
    'feature': gam_model.columns[:5],
    'importance': tree.feature_importances_,
})
print(feature_importance.sort_values('importance', ascending=False))

- The model that did the best work was KNN with a test accuracy of 0.99
- Decision tree also did a great job  with a test accuracy of 0.98. It also set 'Score' and 'NumBlocks' as the most important features with 0.622123 and 0.211470 respectively. Score importance in the model reflects the conclusions of the previous analysis

# 6. Regression models: Can we predict score?


In [None]:
features = ['Level', 'NumBlocks', 'ElapsedTime', 'Score', 'Accuracy']

# Predict 'Score' from the remaining numeric columns.
# (A bare `gam_model[features]` statement used to sit here; in the middle of a
# cell it neither displays nor assigns anything, so it was removed.)
gam_model_X = gam_model.loc[:, ['Level', 'NumBlocks', 'ElapsedTime', 'Accuracy']].values
# 1-D target: selecting with ['Score'] produced an (n, 1) column vector, while
# scikit-learn regressors expect shape (n,) for a single target.
gam_model_Y = gam_model['Score'].values

X_train, X_test, y_train, y_test = train_test_split(gam_model_X, gam_model_Y, random_state=0)

In [None]:
# k-nearest neighbours regressor (k = 5)

reg = KNeighborsRegressor(n_neighbors=5)
reg.fit(X_train, y_train)

# R^2 on the training and on the held-out data.
print("training set score : {:.2f}".format(reg.score(X_train, y_train)))
print("test set score: {:.2f}".format(reg.score(X_test, y_test)))

In [None]:
# Ordinary least-squares linear regression
# (rebinds `lr`, which earlier held the logistic-regression classifier)

lr = LinearRegression()
lr.fit(X_train, y_train)

# R^2 on the training and on the held-out data.
print("training set score : {:.2f}".format(lr.score(X_train, y_train)))
print("test set score: {:.2f}".format(lr.score(X_test, y_test)))

In [None]:
# Ridge regression with scikit-learn's default settings

ridge = Ridge()
ridge.fit(X_train, y_train)

# R^2 on the training and on the held-out data.
print("training set score : {:.2f}".format(ridge.score(X_train, y_train)))
print("test set score: {:.2f}".format(ridge.score(X_test, y_test)))

In [None]:
# Lasso regression with scikit-learn's default settings

lasso = Lasso()
lasso.fit(X_train, y_train)

# R^2 on both splits, then how many coefficients are non-zero.
print("training set score: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))
print("Number of features used:", np.count_nonzero(lasso.coef_))

In [None]:
# Decision Tree Regressor
# (rebinds `tree`, which earlier held the decision-tree classifier)

tree = DecisionTreeRegressor(max_depth=6, random_state=0).fit(X_train, y_train)
print("training set score : {:.2f}".format(tree.score(X_train, y_train)))
print("test set score: {:.2f}".format(tree.score(X_test, y_test)))

print("feature importances:")
# One row per predictor, with names and importances as regular columns.
# Previously the names were passed as the data and the importances as the
# index, so the table had float index labels and an unnamed column.
feature_importance = pd.DataFrame({
    'feature': ['Level', 'NumBlocks', 'ElapsedTime', 'Accuracy'],
    'importance': tree.feature_importances_,
})
print(feature_importance.sort_values('importance', ascending=False))

- KNN did the best job in this section again, with a test score of 0.93.
- The Decision Tree Regressor also did a good job, with a test score of 0.91. The model ranks ElapsedTime and NumBlocks as its most important features.