In [None]:


# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [None]:
#import required library
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the three CSV files of the dataset: the country/continent lookup table,
# the highest-earning players and the highest-earning teams.
# NOTE(review): with pandas' default NA parsing a literal two-letter code "NA"
# would be read as a missing value — confirm whether that matters for the lookup.
countries = pd.read_csv('../input/esports-earnings-for-players-teams-by-game/country-and-continent-codes-list.csv')
players = pd.read_csv('../input/esports-earnings-for-players-teams-by-game/highest_earning_players.csv')
teams = pd.read_csv('../input/esports-earnings-for-players-teams-by-game/highest_earning_teams.csv')

## The data

In [None]:
# Show the players table read from highest_earning_players.csv
players

In [None]:
# Show the teams table read from highest_earning_teams.csv
teams

# Who are the top 10 players?  Where are they from?

To understand who are the top 10 players, we can see their earnings, the genre they dominated in, the games they dominated in and the country they are from

## Starting with earnings

What I did here basically was sorting the dataframe by the USD Prize won by players and showing the top 10 players who have the highest earnings

In [None]:
# Rank players from highest to lowest prize money; ignore_index renumbers the
# rows so that position 0 is the top earner.
players = players.sort_values('TotalUSDPrize', ascending=False, ignore_index=True)
players.iloc[:10]

It seems that the top 10 highest earning players are all playing Dota 2 and the genre is Multiplayer Online Battle Arena

In [None]:
# Bar per country code, counting how many of the ten highest earners carry it.
top10_players = players.head(10)
sns.countplot(data=top10_players, x='CountryCode')
plt.show()

Among the top 10 players, the largest group originates from "fi" and the rest are from "dk", "au", "fr", "de", "jo", "bg" and "lb".  Let us see if we can find out what these countries actually are.

In [None]:
# Translate each distinct country code among the ten highest earners into a
# country name using the country/continent lookup table.
for country in players.head(10)['CountryCode'].unique():
    code = country.upper()  # codes are upper-cased before matching the lookup table
    matches = countries.loc[countries['Two_Letter_Country_Code'] == code, 'Country_Name']
    # Use the first matching row only: rendering the filtered Series with
    # to_string() yields a multi-line string when a code appears on several rows
    # and the text "Series([], )" when it appears on none.
    name = matches.iloc[0] if not matches.empty else 'Unknown'
    print(f'{code} refers to {name} ')

From this, we can conclude that Finland has the highest proportion of top 10 players and the rest are distributed evenly among Denmark, Australia, France, Germany, Jordan, Bulgaria and Lebanon.

# Conclusion
So basically what we've understood is that among the top 10 highest earning players, 3 of them are from Finland and the rest of them are from different parts of the world.  Their game of choice is Dota 2 and the genre is Multiplayer Online Battle Arena (MOBA).

# Going forward
After understanding what the top 10 players are doing, I want to explore whether these conclusions can be applied to the rest of the ESports world.  Does MOBA bring the highest earnings?  Does Finland produce the highest earnings in the ESports world?  And does Dota 2 bring the highest earnings?

## Starting with Genre

In [None]:
# Distribution of player prize money per genre.
fig, ax = plt.subplots(figsize=(10, 20))
sns.boxplot(x='Genre', y='TotalUSDPrize', data=players, ax=ax)  # draw on the axes created above
# Rotate the genre labels after plotting, so the rotation applies to the
# category ticks that seaborn has just placed.
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Player prize earnings by genre')
plt.show()

While the very top players competing in MOBA do out-earn those in all the other genres, the median for First-Person Shooter is higher than that of every other genre.

In [None]:
# Average earnings per genre. numeric_only=True keeps text columns (names,
# handles, codes) out of the mean, which recent pandas refuses to average.
players.groupby('Genre').mean(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False)

In [None]:
# Median earnings per genre. numeric_only=True keeps text columns out of the
# median, which recent pandas refuses to compute on strings.
players.groupby('Genre').median(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False)

In [None]:
# Total earnings per genre. numeric_only=True stops text columns from being
# "summed" (string concatenation) alongside the prize money.
players.groupby('Genre').sum(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False)

In [None]:
# Number of player rows per genre (non-null PlayerId count), ordered by the
# per-genre count of TotalUSDPrize values, largest first.
genre_counts = players.groupby('Genre').count()
genre_counts.sort_values(by='TotalUSDPrize', ascending=False)['PlayerId']

### Team Perspective

In [None]:
# Median team earnings per genre. numeric_only=True keeps text columns out of
# the median, which recent pandas refuses to compute on strings.
teams.groupby('Genre').median(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False)

In [None]:
# Average team earnings per genre. numeric_only=True keeps text columns out of
# the mean, which recent pandas refuses to average.
teams.groupby('Genre').mean(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False)

In [None]:
# Total team earnings per genre. numeric_only=True stops text columns from
# being "summed" (string concatenation) alongside the prize money.
teams.groupby('Genre').sum(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False)

In [None]:
# Number of team rows per genre, largest first. The original line repeated the
# sum table from the previous cell; the players section shows a count table at
# this point, so this is the team-side counterpart.
teams.groupby('Genre').size().sort_values(ascending=False)

### Based on the various groupby tables, we have
While it's true that MOBA brings in the highest total USD prize money and the highest average earnings, the median earnings for individual FPS players are the highest.  In addition, it should be mentioned that MOBA games have the highest number of competitors within the dataset.  This could signify 1 of 2 things.

1) The dataset is not representative of other genres

2) MOBA is indeed the most popular and thus could explain the higher amount of money earned by pros since there would be more viewership

## What about games within each genre?

In [None]:
# Alphabetically sorted list of the distinct games in the players table.
p_games = sorted(players['Game'].unique())
p_games

In [None]:
# Alphabetically sorted list of the distinct games in the teams table.
t_games = sorted(teams['Game'].unique())
t_games

In [None]:
# Both lists are sorted, so list equality is True only when the two tables
# contain exactly the same set of games.
p_games == t_games #checking if both dataset contains the same games

Within the dataset, we have 10 games in total.  Both the teams and the players sets contain the same games.

In [None]:
# Total player earnings per game, largest first. numeric_only=True stops text
# columns from being "summed" (string concatenation) alongside the prize money.
playerTotalPrize = players.groupby('Game').sum(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False)
playerTotalPrize

In [None]:
# Total team earnings per game, largest first. numeric_only=True stops text
# columns from being "summed" (string concatenation) alongside the prize money.
teamTotalPrize = teams.groupby('Game').sum(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False)
teamTotalPrize

In [None]:
# Two stacked pie charts comparing each game's share of total prize money,
# first from the teams table and then from the players table.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12.5, 12.5))

# Top pie: share of team earnings per game
ax1.pie(teamTotalPrize['TotalUSDPrize'], labels=teamTotalPrize.index, autopct='%1.1f%%')
ax1.set_title('Prize Earnings - Team POV')

# Bottom pie: share of player earnings per game (was mislabelled "Team POV")
ax2.pie(playerTotalPrize['TotalUSDPrize'], labels=playerTotalPrize.index, autopct='%1.1f%%')
ax2.set_title('Prize Earnings - Player POV')

plt.show()

## Team vs Player earnings

For both teams and players, Dota 2 has the lion's share of the earnings.
However, when it comes to other games, there are variations between teams and players.
For example, team-focused games such as Overwatch and League of Legends seem to take a larger slice of the pie when they're analyzed from a team's POV.

## Number of Tourneys by Games

In [None]:
# Add up the TotalTournaments column over all team rows of each game and chart it.
# NOTE(review): presumably a tournament entered by several teams is counted once
# per team in this sum — confirm before reading it as a count of distinct events.
tourneys = teams.groupby('Game')['TotalTournaments'].sum()
tourneys.plot.bar(ylabel='Number of tourneys', title='Number of Tournaments by Game')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Show the per-game TotalTournaments sums behind the bar chart
tourneys

## Tourneys conclusion

From this we can see that Starcraft II actually has the lion's share of tournaments, followed by CSGO, Dota 2 and League of Legends.  In fact, Starcraft II has more than 2X as many tournaments as CSGO.  However, this is likely due to how long the game has been out (it was released in 2010).

## Country and Continent

We have looked at both games and genres thus far.  Let's have a look at country and continent

While analyzing the top 10 players individually, I made use of the country dataset to output the top 10 players' countries.  Now it's time to do some data engineering on the player dataset to see what we can infer about the country factor in ESports.

In [None]:
# Show players again (sorted by TotalUSDPrize earlier) before adding country columns
players

In [None]:
# Show the country/continent lookup table used for the name look-ups
countries

In [None]:
#get country name
def country_name(code):
    """Return the country name for a two-letter country code.

    The code is upper-cased and matched against the ``Two_Letter_Country_Code``
    column of the module-level ``countries`` lookup table.

    Parameters
    ----------
    code : str
        Two-letter country code in any letter case.

    Returns
    -------
    str
        The ``Country_Name`` of the first matching row, or ``'Unknown'`` when
        the code is not a string (e.g. a missing value) or has no match.
    """
    if not isinstance(code, str):
        return 'Unknown'
    matches = countries.loc[
        countries['Two_Letter_Country_Code'] == code.upper(), 'Country_Name'
    ]
    if matches.empty:
        return 'Unknown'
    # Take a single scalar value: rendering the whole filtered Series with
    # to_string() produces a multi-line string when a code appears on several rows.
    return str(matches.iloc[0])

In [None]:
# Derive a CountryName column by looking up every player's country code
players['CountryName'] = players['CountryCode'].map(country_name)

In [None]:
# Show players with the new CountryName column
players

In [None]:
#continent function - similar logic to country function, except get continent name this time
def continent(code):
    """Return the continent name for a two-letter country code.

    The code is upper-cased and matched against the ``Two_Letter_Country_Code``
    column of the module-level ``countries`` lookup table.

    Parameters
    ----------
    code : str
        Two-letter country code in any letter case.

    Returns
    -------
    str
        The ``Continent_Name`` of the first matching row, or ``'Unknown'`` when
        the code is not a string (e.g. a missing value) or has no match.
    """
    if not isinstance(code, str):
        return 'Unknown'
    matches = countries.loc[
        countries['Two_Letter_Country_Code'] == code.upper(), 'Continent_Name'
    ]
    if matches.empty:
        return 'Unknown'
    # A code listed on several rows (for instance under two continents) would
    # make to_string() return a multi-line value such as "Europe\nAsia"; use the
    # first listed continent so every player gets exactly one.
    return str(matches.iloc[0])

In [None]:
# Derive a ContinentName column by looking up every player's country code
players['ContinentName'] = players['CountryCode'].map(continent)
players

In [None]:
# Column dtypes and non-null counts, to check the new columns for missing values
players.info()

So far, we have added country names and continent names successfully without any null values.  However, I am not very happy about how some of the country names are displayed.  For example, 'Australia, Commonwealth of' can be simplified to 'Australia' and it will convey the same information.

In [None]:
# Distinct country names exactly as they came out of the lookup
players['CountryName'].unique()

At first glance, it seems that as long as we split on `,` and keep the first piece of the string, we get what we want.

In [None]:
#country name cleaning
def name_cleaning(country):
    """Return the part of a country name that precedes its first comma.

    A name without a comma is returned unchanged.
    """
    short_name, _comma, _rest = country.partition(',')
    return short_name

In [None]:
# Shorten every country name to the text before its first comma, then list the results
players['CountryName'] = players['CountryName'].map(name_cleaning)
players['CountryName'].unique()

So far so good

In [None]:
# Ten countries with the highest total earnings. numeric_only=True stops text
# columns from being "summed" (string concatenation) alongside the prize money.
players.groupby('CountryName').sum(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False).head(10)

In [None]:
# Ten countries with the highest average earnings. numeric_only=True keeps text
# columns out of the mean, which recent pandas refuses to average.
players.groupby('CountryName').mean(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False).head(10)

In [None]:
# Ten countries with the highest median earnings. numeric_only=True keeps text
# columns out of the median, which recent pandas refuses to compute on strings.
players.groupby('CountryName').median(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False).head(10)

In [None]:
# Total earnings per continent. numeric_only=True stops text columns from being
# "summed" (string concatenation) alongside the prize money.
players.groupby('ContinentName').sum(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False)

In [None]:
# Average earnings per continent. numeric_only=True keeps text columns out of
# the mean, which recent pandas refuses to average.
players.groupby('ContinentName').mean(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False)

In [None]:
# Median earnings per continent. numeric_only=True keeps text columns out of
# the median, which recent pandas refuses to compute on strings.
players.groupby('ContinentName').median(numeric_only=True).sort_values(by='TotalUSDPrize', ascending=False)

## Country and Continent

Interestingly, it seems that China has the highest total ESports earnings.  If we look at continents, Asia and Europe largely dominate in terms of total USD earned.  However, if we look at other dimensions (average and median earnings), countries like Lebanon and Pakistan take the cake.

I have a few hypotheses for this.

1) Countries like China and Korea probably have a much larger population that is into ESports, thus resulting in higher total earnings

2) Countries like Pakistan and other less developed countries probably aren't as accepting of ESports culture, so you would have to be really good to get into it, which explains the higher median and average earnings

In [None]:
# Ten countries with the most player rows (non-null PlayerId count)
country_counts = players.groupby('CountryName').count()
country_counts.sort_values(by='PlayerId', ascending=False)['PlayerId'].head(10)

In [None]:
# Ten countries with the fewest player rows (non-null PlayerId count)
country_counts = players.groupby('CountryName').count()
country_counts.sort_values(by='PlayerId', ascending=False)['PlayerId'].tail(10)

In fact, when we look at the player count of each country, that seems to confirm what I suspected.  Some of the countries with low player counts have "elite" players, while the popular countries have sheer numbers.

## Something Interesting to close with

Recently, I was reading Practical Statistics for Data Scientists: 50+ Essential Concepts Using R and Python by Andrew Bruce, Peter C. Bruce, and Peter Gedeck.  Part of it talks about the use of K Nearest Neighbors as a sort of quasi-probability feature engineering (for example, using predictors with KNN to predict a pseudo credit score).  I think this could be applied in this case too.  As someone who is also a gamer, I want to see what my probability of making it as a player would be, based on my game of choice, continent and country.

To do this, I think I first need to:

1) Delete some of the extraordinary earnings as seen by Dota 2 players.  If you remember the boxplot in the beginning, Dota 2 has the highest amount of outliers

2) Define a cutoff point for "making it as an ESports player" - a certain amount of total prize earnings.

In [None]:
# First five rows of players, as a reminder of the available columns
players.head()

In [None]:
# Prize money per genre again, this time to inspect the outliers.
fig, ax = plt.subplots(figsize=(10, 20))
sns.boxplot(x='Genre', y='TotalUSDPrize', data=players, ax=ax)  # draw on the axes created above
# Rotate the genre labels after plotting, so the rotation applies to the
# category ticks that seaborn has just placed.
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Player prize earnings by genre')
plt.show()

In [None]:
# Histogram plus kernel density estimate of player prize money.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is its
# documented replacement and draws on the axes created here.
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(players['TotalUSDPrize'], kde=True, stat='density', ax=ax)
ax.set_title('Distribution of player prize earnings')
plt.show()

Given this distribution, right now I am thinking that I will probably be setting anything above 2000000 as an outlier so as not to skew my analysis too much

In [None]:
# Keep only players below the 2,000,000 USD outlier cutoff chosen from the
# distribution plot. .copy() makes players_knn an independent frame, so the
# columns added to it later do not trigger SettingWithCopyWarning.
players_knn = players[players['TotalUSDPrize'] < 2000000].copy()
players_knn

So far that's only 29 players lost so that's good

In [None]:
# Column means of the trimmed data; the TotalUSDPrize mean is what the success
# cutoff is based on. numeric_only=True skips the text columns, which recent
# pandas refuses to average.
players_knn.mean(numeric_only=True)

## Predictor for KNN analysis

Let's assume the cutoff point for success to be the mean - $305168.103028.  An ESports personality typically has sponsorship deals to supplement their earnings, so 300,000 or so plus sponsorship deals would serve as sizeable savings for a potential retirement from ESports into a less intensive career.

It's also safe to say that PlayerId and the players' names have no relevance to this exercise.  And based on our previous analysis, let's try to use game, genre, country and continent as our predictors.

In [None]:
#engineering outcome for KNN

def outcome(prize, threshold=305168.103028):
    """Label a prize total as a success (1) or not (0).

    Parameters
    ----------
    prize : float
        Total prize money in USD.
    threshold : float, optional
        Cutoff that ``prize`` must strictly exceed to count as a success.
        Defaults to 305168.103028, the value hard-coded in the original function.

    Returns
    -------
    int
        1 when ``prize`` is strictly greater than ``threshold``, otherwise 0.
    """
    return 1 if prize > threshold else 0

In [None]:
# Flag each remaining player as a success (1) when TotalUSDPrize exceeds the cutoff.
# The flag is computed directly from players_knn and attached with assign():
#  - writing a new column into the filtered frame raised SettingWithCopyWarning;
#  - a later cell rebinds the name `outcome` to a string, so calling outcome()
#    here stopped working whenever this cell was re-run after that one.
SUCCESS_THRESHOLD_USD = 305168.103028  # same cutoff as the outcome() helper
players_knn = players_knn.assign(
    Outcome=(players_knn['TotalUSDPrize'] > SUCCESS_THRESHOLD_USD).astype(int)
)
players_knn

In [None]:
#required library
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Feature columns fed to the KNN model, and the name of the binary target column.
predictors = ['Game','Genre','CountryName','ContinentName']
# NOTE(review): this rebinds `outcome`, which an earlier cell defines as a function;
# any cell that calls outcome() will fail if re-run after this one — consider renaming.
outcome = 'Outcome'

In [None]:
# Feature matrix and target vector for KNN. X is copied because its columns are
# overwritten with encoded values later; writing into a plain column selection
# raises SettingWithCopyWarning.
X = players_knn[predictors].copy()
y = players_knn[outcome]

In [None]:
#for KNN to work, we need to use LabelEncoder to transform the string to numerical values

# One independent encoder per categorical feature, kept by name so the same
# mapping can be reused later to encode a new observation.
game_le, genre_le, country_le, continent_le = (LabelEncoder() for _ in range(4))

In [None]:
# Replace each text feature with the integer codes of its own LabelEncoder.
# X is rebuilt from players_knn first, so that
#  - the writes go to an independent copy (no SettingWithCopyWarning), and
#  - re-running this cell re-fits the encoders on the original strings instead
#    of on already-encoded integers.
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# which affects KNN distances — one-hot encoding may be more appropriate.
encoders = {
    'Game': game_le,
    'Genre': genre_le,
    'CountryName': country_le,
    'ContinentName': continent_le,
}
X = players_knn[predictors].copy()
for column, encoder in encoders.items():
    X[column] = encoder.fit_transform(X[column])
X

In [None]:
# Fit KNN on every row with no train/test split: the model is not being
# evaluated as a predictor here, it is only used to derive a pseudo-probability.

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X, y)

In [None]:
# Column 1 of predict_proba is the probability of Outcome == 1. assign() returns
# a new frame, which avoids SettingWithCopyWarning when players_knn is a filtered
# view of players.
players_knn = players_knn.assign(PlayerScore=knn.predict_proba(X)[:, 1])

In [None]:
# Show players_knn including the new PlayerScore column
players_knn

Now we have a column showing how likely someone would make it as an ESports player given the set of parameters - genre, country, game and continent

## Just For Fun

Let's say I am someone from Canada.  I love Overwatch and First Person Shooter.  Let's see my score!

In [None]:
# Describe a hypothetical player. The my_ prefix keeps these lists from
# overwriting other notebook names — in particular `continent`, which an
# earlier cell defines as a helper function.
my_game = ['Overwatch']
my_genre = ['First-Person Shooter']
my_country = ['Canada']
my_continent = ['North America']

# Encode each value with the encoder fitted on the matching training column,
# in the same order as the predictor columns.
me = np.array([
    game_le.transform(my_game)[0],
    genre_le.transform(my_genre)[0],
    country_le.transform(my_country)[0],
    continent_le.transform(my_continent)[0],
])
me

In [None]:
# Probability of Outcome == 1 for the hypothetical player. The single row is
# wrapped in a DataFrame with the training column names because the model was
# fitted on a DataFrame; a bare array triggers scikit-learn's feature-name warning.
knn.predict_proba(pd.DataFrame(me.reshape(1, -1), columns=predictors))[:, 1]

Only 0.1 chance.  Oh well, back to working on my Python and Data Science skills then :p