In [21]:
import pandas as pd
import numpy as np

%matplotlib inline
import pylab as plt
plt.style.use('fivethirtyeight')

import altair as alt
alt.renderers.enable('default')

RendererRegistry.enable('default')

## Reading data

In [22]:
data = pd.read_csv('./data/EF_battles_corrected.csv', parse_dates=['start', 'end'])

In [23]:
data['end_num'] = (data['end'].dt.year -1938) * 12 +  data['end'].dt.month
data['start_num'] = (data['start'].dt.year -1938) * 12 +  data['start'].dt.month
data['duration'] = (data['end_num'] - data['start_num']).clip(lower=1)

### Data types and missing values

In [24]:
data.dtypes

name                                 object
url                                  object
allies killed                       float64
axis killed                         float64
allies_destroyed_tanks                int64
allies_destroyed_airplane             int64
allies_destroyed_guns                 int64
axis_destroyed_tanks                  int64
axis_destroyed_airplane               int64
axis_destroyed_guns                   int64
belligerents.allies                  object
belligerents.axis                    object
axis_leaders                         object
allies_leaders                       object
result_prop                         float64
result                               object
axis_planes                         float64
axis_guns                           float64
axis_tanks                          float64
axis_infantry                       float64
allies_planes                       float64
allies_guns                         float64
allies_tanks                    

In [25]:
cols = [
    'allies_infantry', 'axis_infantry',
    'allies_tanks', 'axis_tanks',
    'allies_guns', 'axis_guns'
]


In [26]:
mask = data[cols].notnull().all(1)

In [27]:
data_kmeans = data.loc[mask, cols]

## 1. Unsupervised learning: K-means

In [28]:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=5)

In [29]:
labels = model.fit_predict(data_kmeans) + 1  # for naming

In [30]:
print(labels + 1)

[5 3 5 6 3 4 2 5 2 2 5 5 4 5 3 2 5 2 3]


In [31]:
data_kmeans['label'] = ('Cluster ' + pd.Series((labels+1)).astype(str)).values
data_kmeans[['name', 'result', 'start']] = data.loc[mask, ['name', 'result', 'start']]

## Visualise

In [32]:
import altair as alt
# alt.renderers.enable('notebook')


In [33]:
c = alt.Chart(data_kmeans).mark_point().encode(
    color=alt.Color('label:N', legend=alt.Legend(title='Cluster')),
    x='allies_infantry', y='axis_infantry', shape='result',
    tooltip=data_kmeans.columns.tolist()).interactive()

c

<VegaLite 3 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


### Scale

In [34]:
from sklearn.preprocessing import scale

In [35]:
data_scaled = scale(data_kmeans.drop(['label', 'name', 'start', 'result'], axis=1))

In [36]:
labels_scaled = model.fit_predict(data_scaled) + 1

In [37]:
# data_scaled['label'] = ('Cluster ' + pd.Series((labels_scaled +1)).astype(str)).values
# data_scaled[['name', 'result', 'start']] = data.loc[~mask, ['name', 'result', 'start']]

data_kmeans['label 2'] = ('Cluster ' + pd.Series((labels_scaled +1)).astype(str)).values

In [38]:
c.data = data_kmeans

In [39]:
c.encode(color=alt.Color('label 2:N', legend=alt.Legend(title='Cluster')))

<VegaLite 3 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [40]:
data

Unnamed: 0,name,url,allies killed,axis killed,allies_destroyed_tanks,allies_destroyed_airplane,allies_destroyed_guns,axis_destroyed_tanks,axis_destroyed_airplane,axis_destroyed_guns,...,allies_tanks,allies_infantry,level,parent,latlon,start,end,end_num,start_num,duration
0,Battle of Westerplatte,https://en.wikipedia.org/wiki/Battle_of_Wester...,21.0,400.0,0,0,0,0,0,0,...,,240.0,100,German Invasion of Poland,"54.4075,18.67139",1939-09-01,1939-09-01,21,21,1
1,Battle of Mokra,https://en.wikipedia.org/wiki/Battle_of_Mokra,500.0,800.0,1,0,0,50,0,0,...,,,100,German Invasion of Poland,"50.964643,18.917042",1939-09-01,1939-09-01,21,21,1
2,Battle of Mlawa,https://en.wikipedia.org/wiki/Battle_of_Mlawa,1200.0,1800.0,0,0,0,72,0,0,...,,,100,German Invasion of Poland,"53.114139,20.383107",1939-09-01,1939-09-01,21,21,1
3,Battle of Tuchola Forest,https://en.wikipedia.org/wiki/Battle_of_Tuchol...,1600.0,506.0,0,0,0,0,0,0,...,,,100,German Invasion of Poland,"53.664733,18.246738",1939-09-01,1939-09-01,21,21,1
4,Battle of Jordanów,https://en.wikipedia.org/wiki/Battle_of_Jordan...,0.0,0.0,3,0,0,70,0,0,...,58.0,50000.0,100,German Invasion of Poland,"49.650579,19.832310",1939-09-01,1939-09-01,21,21,1
5,Battle of Borowa Góra,https://en.wikipedia.org/wiki/Battle_of_Borowa...,663.0,650.0,0,0,0,0,0,0,...,,2000.0,100,German Invasion of Poland,"51.368929,19.353395",1939-09-01,1939-09-01,21,21,1
6,Battle of Wizna,https://en.wikipedia.org/wiki/Battle_of_Wizna,660.0,900.0,0,0,0,10,1,0,...,,720.0,100,German Invasion of Poland,"53.196206,22.381763",1939-09-01,1939-09-01,21,21,1
7,Battle of Piotrków Trybunalski,https://en.wikipedia.org/wiki/Battle_of_Piotrk...,60.0,60.0,2,0,0,17,0,2,...,,,100,German Invasion of Poland,"51.404803,19.698756",1939-09-01,1939-09-01,21,21,1
8,Battle of Hel,https://en.wikipedia.org/wiki/Battle_of_Hel,,,0,0,0,0,53,0,...,,2800.0,100,German Invasion of Poland,"54.625479,18.791930",1939-09-01,1939-10-01,22,21,1
9,Battle of the Bzura,https://en.wikipedia.org/wiki/Battle_of_the_Bzura,20000.0,8000.0,0,0,0,50,0,20,...,,225000.0,100,German Invasion of Poland,"52.23333,19.36667",1939-09-01,1939-09-01,21,21,1
