In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# linear algebra
import numpy as np 
# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd 
from datetime import datetime
#data visualization
import seaborn as sns 
import plotly.graph_objects as go
import plotly.express as px
#statistical tests
from scipy import stats as ss
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Do people like to play as their home country in videogames? We make a case study on how the nation of the player influences in-game choices by analysing [105000 Age of Empires 2 1v1 Ranked Random Matches](https://www.kaggle.com/slappdun/35000-age-of-empires-2-1v1-ranked-random-matches) played in March 2021.

For the readers who do not know, Age of Empires 2 is a Real Time strategy game set in the medieval period where the player can choose to play as one of 37 different historical civilizations, from Aztecs to Vikings (in alphabetic order). The choice of the civilization to play is not purely cosmetic, but affects significantly the gameplay, with every one having a different roster of units available, plus specific in-game bonuses, like for example Aztecs cannot train cavalry units, but have melee infantry with extra attack.

Normally the choice of the civilization to play is thus an in-game strategic choice, based on your own playstyle, the settings of the game and the kind of civilization you expect the opponent to pick. We investigate whether it is common for players to choose instead the civilization that historically lived in their homeland, and how this affects their success in the game. 

## First look at the database

We check the entries of the database. It has $105624$ rows, each corresponding to a match and $23$ columns, with quite self-explanatory names. We note that some data are missing, in particular, for the study we are interested in, there are $6784$ matches for which we don't know the nationality of the winner and $7211$ for which we miss the nationality of the loser. We note that data about the civilizations played are complete.

In [None]:
# Load the full match table and summarise its columns, dtypes and missing values.
matches_csv = '/kaggle/input/35000-age-of-empires-2-1v1-ranked-random-matches/rankedrm.csv'
df_all = pd.read_csv(matches_csv)
df_all.info()

We take a look at the first rows in the database. What we see is that the countries are represented in the database by their [ISO 3166-1 alpha-2 code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2), a two-letter code created by the International Organization for Standardization to uniquely identify them. Moreover, we see that there is a decent amount of redundant information: the civilizations and the maps are identified both by their names and by a numeric code, and the column 'matchup' just repeats the two civilizations.

In [None]:
# Preview the first five matches.
df_all.head(n=5)

We check exactly what is the timespan covered by the database. We locate the first and the last match. The times of the matches are saved as strings of text, so to safely compute first and last matches to be played (by starting time) we have to convert the columns `started` and `finished` to datetime. We see that all matches were played in the first two weeks of March 2021.

In [None]:
# Parse the textual timestamps in one vectorised pass per column instead of
# assigning row by row, which is extremely slow on a table of this size.
timestamp_format = '%Y-%m-%d %H:%M:%S'
df_all['started.datetime'] = pd.to_datetime(df_all['started'], format=timestamp_format)
df_all['finished.datetime'] = pd.to_datetime(df_all['finished'], format=timestamp_format)

# idxmin/idxmax return index *labels*, which is what .loc expects; argmin/argmax
# return positions and only happen to agree with labels on a default RangeIndex.
first_label = df_all['started.datetime'].idxmin()
last_label = df_all['started.datetime'].idxmax()

first_start = df_all.loc[first_label, 'started.datetime']
first_end = df_all.loc[first_label, 'finished.datetime']
last_start = df_all.loc[last_label, 'started.datetime']
last_end = df_all.loc[last_label, 'finished.datetime']

print("First game in the database started on ",first_start.date(),'at',first_start.time(),' and finished on', first_end.date(),'at',first_end.time(),)
print("Last game in the database started on",last_start.date(),'at',last_start.time(),  'and finished on', last_end.date(),'at',last_end.time(),)



We look at all the nations from which we had a player participating in any of the matches in the database. There is a total of $127$ countries represented. Also, all of the $37$ civilizations (civs in the slang of the game, used also in the database column names) present in the game have been played at least once in the matches considered in the database. 

In [None]:
# Pool the winner and loser columns so every player slot of every match is counted.
countries = pd.concat([df_all['country.win'], df_all['country.lose']])
# nunique() ignores missing values by construction, so the count stays right
# whether or not a NaN happens to be present (unlike len(unique()) - 1).
print(countries.unique(), " , ", countries.nunique())
civs = pd.concat([df_all['civ.win.name'], df_all['civ.lose.name']])
print(civs.unique(), ',', civs.nunique())

## Data about countries and civilizations

We create a dictionary that maps modern countries to the civilization that represents them most closely in the historical setting of the game. We do this for the civilisations that from their name and in-game description clearly correspond to a modern nation. There are many other less-obvious correspondences we will describe later, but we do not want this to be a controversial discussion on who is the legitimate heir of a specific empire.

In [None]:
# ISO 3166-1 alpha-2 country code -> in-game civilization that most clearly
# corresponds to that modern nation. Insertion order is kept because it
# determines the row order of the per-nation tables built from this mapping.
civ_dic = dict([
    ('IT', 'Italians'),
    ('FR', 'Franks'),
    ('BG', 'Bulgarians'),
    ('LT', 'Lithuanians'),
    ('TR', 'Turks'),
    ('VN', 'Vietnamese'),
    ('JP', 'Japanese'),
    ('ES', 'Spanish'),
    ('CN', 'Chinese'),
    ('PT', 'Portuguese'),
    ('DE', 'Teutons'),
    ('RU', 'Slavs'),
    ('MY', 'Malay'),
    ('HU', 'Magyars'),
    ('KR', 'Koreans'),
    ('ET', 'Ethiopians'),
    ('MN', 'Mongols'),
    ('IR', 'Persians'),
    ('IN', 'Indians'),
    ('KH', 'Khmer'),
    ('ML', 'Malians'),
    ('MM', 'Burmese'),
    ('GB', 'Britons'),
    ('PE', 'Incas'),
])

Despite the fact that the game is designed around making the civilizations as balanced as possible across all the various game settings, we see that there is a significant discrepancy in the frequency with which they are played. Franks are by far the most common being played $20106$ times, more than $10$ times more common than the least picked civilization, Burmese (played $1979$ times), as shown in the following pie chart.

In [None]:
# Share of all player slots taken by each civilization.
civ_matches = civs.value_counts()
civ_pie = go.Pie(
    labels=civ_matches.index,
    values=civ_matches,
    showlegend=True,
    name='quantities',
)
fig = go.Figure(data=[civ_pie])
fig.update_layout(title_text='Players count by civilization played')
fig.show()


We also plot the $10$ nations with the highest number of matches played. Unfortunately, US and Argentina, first and third, have no civilisation in-game that was based there. 

In [None]:
# The ten countries with the most player slots across all matches.
nation_matches = countries.value_counts().head(10)
nation_bars = go.Bar(
    x=nation_matches.index,
    y=nation_matches,
    showlegend=True,
    name='Matches played',
    marker_color='green',
)
fig = go.Figure(data=[nation_bars])
fig.update_layout(title_text='Players count by nation')
fig.show()

## Civilisations national and global playing frequencies

The following code extracts from the database the data we are the most interested in. We write it as the function `extract_nation` so that it can be run on any subset of the database, in case we want to apply a filter to the matches, like we will do later by skill level. It takes as input a portion of the database and the country-civilisation dictionary and for each pair computes several pieces of information we need for our following analysis and saves them into a new table. We exclude the nations where less than 100 total games have been played, as that would not be a large enough sample size to draw any conclusion. Unfortunately this means excluding Iran, Mongolia, Malaysia, Myanmar, Cambodia, Ethiopia and Mali, the last two of which had nobody playing ranked AoE2 in the period considered. 
 
We also run the process on the entire database first and show the first rows of the output database:


In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else float('nan')


def extract_nation(df, civ_dict, min_matches=100):
    """Compare how often each nation plays its home civilization with the global rate.

    Parameters
    ----------
    df : pandas.DataFrame
        Matches, one per row, with the columns 'country.win', 'country.lose',
        'civ.win.name' and 'civ.lose.name'.
    civ_dict : dict
        Maps a country code to the name of its home civilization.
    min_matches : int, default 100
        Nations with fewer player slots than this (wins plus losses) are left
        out of the result and reported with a printed message instead.

    Returns
    -------
    (pandas.DataFrame, int)
        The table is indexed by country code, in ``civ_dict`` order, with the
        columns 'Civ', 'National Civ Matches', 'National Civ Wins',
        'National Frequency', 'National Win %', 'National Total Matches',
        'Global Civ Matches', 'Global Civ Wins', 'Global Frequency' and
        'Global Win %'. The integer is the number of player slots in ``df``
        whose country is known (winner slots plus loser slots).

    Notes
    -----
    "Global" figures only count player slots with a known country, and they
    include the nation's own matches.
    """
    columns = ['Civ',
               'National Civ Matches', 'National Civ Wins', 'National Frequency',
               'National Win %', 'National Total Matches',
               'Global Civ Matches', 'Global Civ Wins', 'Global Frequency',
               'Global Win %']

    win_country, lose_country = df['country.win'], df['country.lose']
    win_civ, lose_civ = df['civ.win.name'], df['civ.lose.name']

    # Loop-invariant: player slots whose nationality is known.
    win_known, lose_known = win_country.notna(), lose_country.notna()
    total = int(win_known.sum()) + int(lose_known.sum())

    kept_nations, rows = [], []
    for nation, civ in civ_dict.items():
        nation_won = win_country == nation
        nation_lost = lose_country == nation
        nation_total = int(nation_won.sum()) + int(nation_lost.sum())
        if nation_total < min_matches:
            print(nation + ' has only ' + str(nation_total) + ' total matches played')
            continue

        # Home civ played by players of this nation.
        home_wins = int((nation_won & (win_civ == civ)).sum())
        home_losses = int((nation_lost & (lose_civ == civ)).sum())
        home_matches = home_wins + home_losses

        # Same civ played by anyone whose nationality is known.
        global_wins = int((win_known & (win_civ == civ)).sum())
        global_losses = int((lose_known & (lose_civ == civ)).sum())
        global_matches = global_wins + global_losses

        kept_nations.append(nation)
        rows.append([
            civ,
            home_matches, home_wins,
            _safe_ratio(home_matches, nation_total),
            _safe_ratio(home_wins, home_matches),
            nation_total,
            global_matches, global_wins,
            _safe_ratio(global_matches, total),
            _safe_ratio(global_wins, global_matches),
        ])

    # Build the table in one go; passing `columns` keeps the schema even when
    # no nation reaches the threshold, so downstream column lookups still work.
    df_n = pd.DataFrame(rows, index=kept_nations, columns=columns)
    # Numeric columns are stored as floats so undefined ratios can be NaN.
    df_n = df_n.astype({name: float for name in columns[1:]})
    return (df_n, total)

# Run the extraction on every match and preview the resulting per-nation table.
df_nation, tot = extract_nation(df=df_all, civ_dict=civ_dic)
df_nation.head(n=5)

We can now finally plot the frequency with which each civilisation is played in its home country and globally. We find that indeed players are much more likely to choose their home civ: the national pick rate is always higher than the global one. The largest difference between national and global frequency is seen in Lithuania ($+13.6\%$) followed by Bulgaria ($+8.8\%$). These could be outliers due to the relatively small sample sizes, but we can also imagine that as these countries are rarely prominently featured in videogames or in general in media with global diffusion, it might be very exciting for Lithuanians and Bulgarians to be finally able to play as their own people.

In [None]:
# Grouped bars: home-civ pick rate inside the nation vs. among all players.
labels_all = df_nation['Civ'] + ', ' + df_nation.index
fig = go.Figure(
    data=[
        go.Bar(x=labels_all, y=df_nation['National Frequency'], name='National'),
        go.Bar(x=labels_all, y=df_nation['Global Frequency'], name='Global'),
    ]
)
fig.update_layout(barmode='group', title_text='Civilization play rate nationally and globally')
fig.show()

The results look quite evident from the histogram, but to be safe we run a contingency chi-square test for each nation to confirm that the national frequency of the home civs is different (in this case higher) from the frequency of the civilisation elsewhere by a statistically significant margin. We see that almost everywhere the result is significant by a huge margin (South Korea is the main exception, with Japan and India close to being significant, but not quite enough).

In [None]:
def chisquare_nation(df, tot):
    """Print a chi-square contingency test of home-civ pick rate for each nation.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by nation, with the columns 'National Civ Matches',
        'National Total Matches' and 'Global Civ Matches'.
    tot : int
        Total number of player slots the global counts were taken from.

    For every nation the 2x2 table is
    [[national home-civ, national other], [global home-civ, global other]],
    and one line with the p-value and the test statistic is printed.
    """
    # NOTE(review): the "global" row appears to include the nation's own matches,
    # so the two rows would not be disjoint samples -- confirm whether a
    # nation-vs-rest-of-world table was intended.
    for nation in df.index:
        national_civ = df.loc[nation, 'National Civ Matches']
        national_other = df.loc[nation, 'National Total Matches'] - national_civ
        global_civ = df.loc[nation, 'Global Civ Matches']
        global_other = tot - global_civ
        table = np.array([[national_civ, national_other], [global_civ, global_other]])
        # Run the test once and reuse the result for both printed values.
        result = ss.chi2_contingency(table)
        print(nation, "Chi-square test p-value:", result[1], ', statistic:', result[0])

# Test every nation's home-civ pick rate against the global rate.
chisquare_nation(df=df_nation, tot=tot)

We also check if the propensity for players to use the local civ affects the win rate of said civ. We find that there is no significant effect either way, we again run a chi-square test to make sure that the differences (which appear in both directions) are not significant, with the only exception of the Portuguese who seem to be particularly good at playing their home civ.
This is a bit surprising, as we mentioned how picking a civ based on your own nationality is definitely not a gameplay-based strategy and thus we would expect such players to have worse results than the ones who actually pick the civ for an in-game reason. It could be the case that they get more experience to play the specific civ or that in general most players are not really able to get the best out of every civ no matter why they had chosen it.

In [None]:
# Grouped bars: home-civ win rate inside the nation vs. among all players.
win_labels = df_nation['Civ'] + ', ' + df_nation.index
fig = go.Figure(
    data=[
        go.Bar(x=win_labels, y=df_nation['National Win %'], name='National'),
        go.Bar(x=win_labels, y=df_nation['Global Win %'], name='Global'),
    ]
)
fig.update_layout(barmode='group', title_text='Civilization win rate nationally and globally')
fig.show()

def chisquare_win(df, total):
    """Print a chi-square contingency test of home-civ win rate for each nation.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by nation, with the columns 'National Civ Wins',
        'National Civ Matches', 'Global Civ Wins' and 'Global Civ Matches'.
    total : int
        Not used by the test; kept so existing callers keep working.

    For every nation the 2x2 table is
    [[national wins, national losses], [global wins, global losses]]
    for the home civilization, and one line with the p-value and the test
    statistic is printed.
    """
    for nation in df.index:
        national_wins = df.loc[nation, 'National Civ Wins']
        national_losses = df.loc[nation, 'National Civ Matches'] - national_wins
        global_wins = df.loc[nation, 'Global Civ Wins']
        global_losses = df.loc[nation, 'Global Civ Matches'] - global_wins

        table = np.array([[national_wins, national_losses], [global_wins, global_losses]])
        # Run the test once and reuse the result for both printed values.
        result = ss.chi2_contingency(table)
        print(nation, "Chi-square test p-value:", result[1], ', statistic:', result[0])

# Test every nation's home-civ win rate against the global win rate.
chisquare_win(df=df_nation, total=tot)

## Divisions by level of the matches

As we said in the introduction, choosing to play your home civilisation instead of making a decision based on in-game reasons or going random is definitely not something done with hard core competition in mind. We thus would expect that this would be more frequent among casual players than among the more dedicated ones, which would like both to learn how to play all civilisations and to make decisions more based on the metagame. 

Skill level in AoE 2 is measured mainly by an [Elo type](https://www.youtube.com/watch?v=GTaAWtuLHuo) rating, there are no official leagues distinctions like in other competitive games, so we will divide arbitrarily the matches based on the average rating of the players involved.

Here is the distribution of the average rating of the matches. In the further analysis we will consider matches below $1000$ as low Elo, between $1000$ and $1500$ as mid Elo and above $1500$ as high Elo.

In [None]:
# Average rating of the two players; missing whenever either rating is missing.
df_all['rank.avg'] = (df_all['rating.win'] + df_all['rating.lose']) / 2

# Histogram of the per-match average rating, with the two tier boundaries marked.
fig = go.Figure(data=[go.Histogram(x=df_all['rank.avg'])])
for elo_boundary in (1000, 1500):
    fig.add_vline(x=elo_boundary)
fig.update_layout(barmode='group', title_text='Distribution of the ranking across matches')
fig.show()


We run again the function `extract_nation`, now on the different elements of the partition of the database by Elo rating, to see if the behaviour is significantly different across different skill levels. We note that more nations than the original $7$ now do not reach in some categories the threshold of $100$ played matches to be considered. This is obviously more common in the high-Elo group, which has fewer matches than the others. 

In [None]:
# Only matches with a computable average rating can be assigned to a tier.
df_ranked = df_all.dropna(subset=['rank.avg'])
avg_elo = df_ranked['rank.avg']

# Low Elo: average rating strictly below 1000.
df_low = df_ranked.loc[avg_elo < 1000]
print(f'Number of low-Elo matches= {len(df_low)}')
df_nation_low, tot_low = extract_nation(df_low, civ_dic)

# Mid Elo: average rating from 1000 (included) up to 1500 (excluded).
df_mid = df_ranked.loc[(avg_elo >= 1000) & (avg_elo < 1500)]
print(f'Number of mid-Elo matches= {len(df_mid)}')
df_nation_mid, tot_mid = extract_nation(df_mid, civ_dic)

# High Elo: average rating of 1500 or more.
df_high = df_ranked.loc[avg_elo >= 1500]
print(f'Number of high-Elo matches= {len(df_high)}')
df_nation_high, tot_high = extract_nation(df_high, civ_dic)


We start analysing low-Elo matches. Indeed at low elo all countries still play their home civilisation at least at the same rate as the global one. Again, except Korea, Japan and maybe India, all the other tests show a difference between the national and global frequency that is very statistically significant.

In [None]:
# Low-Elo tier: national vs. global pick rate of the home civ, then the tests.
labels_low = df_nation_low['Civ'] + ', ' + df_nation_low.index
fig = go.Figure(
    data=[
        go.Bar(x=labels_low, y=df_nation_low['National Frequency'], name='National'),
        go.Bar(x=labels_low, y=df_nation_low['Global Frequency'], name='Global'),
    ]
)
fig.update_layout(barmode='group', title_text='Civilization frequency by nation of the player (Elo<1000)')
fig.show()
chisquare_nation(df_nation_low, tot_low)

At mid-Elo level the situation starts to change. We see far fewer significant results for the chi-square test and we even have Korean and Peruvian players using their home civ less than the global average. It is worth noting that French players are the second most likely to play their home civ after Bulgarians, but only because Franks are by far the most picked civ globally; the difference between the two frequencies is not significant.

In [None]:
# Mid-Elo tier: national vs. global pick rate of the home civ, then the tests.
labels_mid = df_nation_mid['Civ'] + ', ' + df_nation_mid.index
fig = go.Figure(
    data=[
        go.Bar(x=labels_mid, y=df_nation_mid['National Frequency'], name='National'),
        go.Bar(x=labels_mid, y=df_nation_mid['Global Frequency'], name='Global'),
    ]
)
fig.update_layout(barmode='group', title_text='Civilization frequency by nation of the player (1000<Elo<1500)')
fig.show()
chisquare_nation(df_nation_mid, tot_mid)

At high level, finally, we see that players prioritising playing with their home civ are a relatively rare occurrence. Only Chinese and, surprisingly given the previous result, Peruvians still pick their home civs at a much higher rate than normal. 

In [None]:
# High-Elo tier: national vs. global pick rate of the home civ, then the tests.
labels_high = df_nation_high['Civ'] + ', ' + df_nation_high.index
fig = go.Figure(
    data=[
        go.Bar(x=labels_high, y=df_nation_high['National Frequency'], name='National'),
        go.Bar(x=labels_high, y=df_nation_high['Global Frequency'], name='Global'),
    ]
)
fig.update_layout(barmode='group', title_text='Civilization frequency by nation of the player (Elo>1500)')
fig.show()
chisquare_nation(df_nation_high, tot_high)

It is worth noting that there are relatively few high-level players from each country, and with $37$ civs available in the game the specific pick frequencies rarely exceed $10\%$, so the behaviour of one or two players can significantly skew the data at this level.

In [None]:
# Collect (country, player id) pairs from both sides of every high-Elo match.
ids_high_w = df_high[['country.win', 'steam_id.win']].rename(
    columns={'country.win': 'country', 'steam_id.win': 'id'})
ids_high_l = df_high[['country.lose', 'steam_id.lose']].rename(
    columns={'country.lose': 'country', 'steam_id.lose': 'id'})
# pd.concat returns a new frame that must be kept; the losers have to be part
# of the pool, otherwise players who never won a high-Elo match go uncounted.
ids_high = pd.concat([ids_high_w, ids_high_l], ignore_index=True)

# Number of distinct players per country, most represented countries first.
unique_high = ids_high.groupby('country')['id'].nunique().sort_values(ascending=False)

# Plain column assignment aligned on the nation index: re-running this cell
# overwrites the column instead of adding a duplicate one.
df_nation_high['Unique players'] = unique_high.reindex(df_nation_high.index)
df_nation_high['Unique players']

Interesting to see that no high-level South Korean player has played Koreans in any of the matches considered, but this is not extremely unusual, as Koreans are overall rarely used and only $104$ high level matches were played by South Korean players, so in total the expected number of times this would have happened choosing according to global frequencies is only $1.6$.

In [None]:
# High-Elo South Korea: show the full row, then the number of Koreans picks
# expected if South Korean players chose the civ at the global rate.
kr_high = df_nation_high.loc['KR', :]
print(kr_high)
kr_high['National Total Matches'] * kr_high['Global Frequency']