In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from unidecode import unidecode
import seaborn as sns
import warnings


# Select the interactive notebook backend for matplotlib figures.
%matplotlib notebook
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation and chained-assignment warnings raised by later cells.
# Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## Getting a List of All Right Backs

In [2]:
# Read the raw listing saved in RB.txt. The file has no header row, so the
# columns are labelled by position (0, 1, 2, ...).
df = pd.read_table('RB.txt', header = None)
# Fill empty cells downwards from the row above. DataFrame.ffill() is the
# supported spelling of the deprecated fillna(method='ffill').
df = df.ffill(axis = 'index')
# Column 0 is later renamed to 'Player'; keep only the last row seen for each
# distinct value in it.
df = df.drop_duplicates(subset = 0, keep = 'last')

In [3]:
# Positional columns of the raw table that are kept, paired with the names
# they are given.
chng = dict([
    (0, 'Player'),
    (1, 'Country'),
    (5, 'Age'),
    (7, 'Club'),
])

# Column order of the cleaned table follows the mapping's insertion order.
cols = [label for label in chng.values()]

In [4]:
# Give the kept columns their readable names, then drop every other column.
renamed = df.rename(columns = chng)
df = renamed[cols]
# Save the cleaned list; the frame's index is written as the first CSV column.
df.to_csv('RB2019.csv')

In [5]:
# To include more players, continue from the next page of
# https://www.fifaindex.com/players/8/?position=3&order=desc
# and paste it into RB.txt

## Getting and Cleaning the Statistics

In [6]:
# Load the statistics table and key it by player name so that it can be
# joined onto the right-back list.
stats = pd.read_csv('RBStats2019.csv').set_index('Player Name')

In [7]:
# First-pass left join on the raw player names: every right-back row is kept,
# and the stats columns are empty wherever no name matched.
fdf = df.join(stats, on = 'Player', how = 'left')

Player names containing accented (non-English) characters need to be converted to plain English letters

In [8]:
names = df['Player'].values

# Decode the names to plain ASCII text and keep dictionaries to convert in
# both directions.
plydict = {original: unidecode(original) for original in names}   # original -> ASCII
realdict = {unidecode(original): original for original in names}  # ASCII -> original
# Map by value rather than assigning list(plydict.values()) by position: the
# dict holds one entry per distinct name, so positional assignment would fail
# with a length mismatch if a name ever appeared twice.
df['Player'] = df['Player'].map(plydict)

Mapping single-named players to their full names in the stats data

In [9]:
# Rows of the right-back list for which the join found no statistics
# (the 'Team' column coming from the stats table is empty).
missing = fdf['Team'].isna()

# Every player name in the stats table, as plain strings.
stat_index = stats.index.values
substr_list = stat_index.astype('str')

# Every right-back name after conversion to plain text.
names = [plain for plain in plydict.values()]

In [10]:
# conv maps a right-back name to the full name used in the stats table
# whenever the right-back name equals the last word of that full name.
conv = {}

# Set membership replaces the original inner scan over every right-back name.
known_names = set(names)

# For all players in the stats dataset
for full_name in substr_list:
    parts = full_name.split()
    if not parts:
        # Blank entry: there is no last name to compare against.
        continue

    # Get the last name
    last_name = parts[-1]

    if last_name in known_names:
        # If several stats players share this last name, the later one
        # overwrites the earlier one, exactly as the nested loop did.
        conv[last_name] = full_name

In [11]:
# Swap in the full names found above, then redo the join with the repaired keys.
player_col = df['Player']
df['Player'] = player_col.replace(conv)
fdf = df.join(stats, on = 'Player', how = 'left')

Keeping only the players that have statistics

In [12]:
# An empty 'Team' means the stats table had no row for that player.
missing = fdf['Team'].isna()
has_stats = ~missing
fdf = fdf[has_stats]

Combining league and UCL data

In [13]:
# Display the column labels of the joined table.
fdf.columns

Index(['Player', 'Country', 'Age', 'Club', 'Team', 'League', 'POS', 'GP',
       'MIN', 'Y', 'YR', 'R', 'A', 'SA', 'ACR', 'FS', 'AP', 'ACRO', 'BCC',
       'DR', 'APW', 'TBOX', 'ECL'],
      dtype='object')

In [653]:
# Columns used by the aggregation. The order matters: the aggregation cell
# slices this list by position (req[:4] and req[5:]).
req = (
    ['Player', 'Country', 'Age', 'Club']
    + ['League']
    + ['GP', 'MIN', 'Y']
    + ['A', 'SA', 'ACR', 'FS']
    + ['AP', 'ACRO', 'BCC']
    + ['DR', 'APW', 'TBOX']
)

In [654]:
# The first four entries of req identify a player; everything after 'League'
# (index 4) is a statistic. Leaving 'League' out of both lists means a player's
# rows from different competitions are added together.
id_cols = req[:4]
stat_cols = req[5:]
adf = fdf.groupby(id_cols)[stat_cols].sum().reset_index()

## Interpreting the Data

In [803]:
# Descriptive columns carried over from the aggregated table into the scoring table.
reqs = 'Player Club Country MIN Age'.split()

def per90(df1, df2, x):
    """Add per-90-minute versions of the given statistics to ``df1``.

    For every name in ``x`` the column ``df2[name]`` is divided by
    ``df2['MIN']``, multiplied by 90 and stored in ``df1`` under
    ``name + '90'``.

    Parameters
    ----------
    df1 : DataFrame that receives the new columns (modified in place).
    df2 : DataFrame providing the raw statistics and the 'MIN' column.
    x : iterable of column names to convert.

    Returns
    -------
    The same ``df1`` object, with the extra columns.
    """
    for stat in x:
        per_minute = df2[stat] / df2['MIN']
        df1[stat + '90'] = per_minute * 90

    return df1

def sum_vals(d, maxima=None):
    """Return the weighted sum of the values in ``d``.

    Each value's weight is the value divided by the matching entry of
    ``maxima``; the result is the sum of ``weight * value``.

    Parameters
    ----------
    d : row of values (Series, list or array), read in positional order.
    maxima : optional sequence of per-column maxima, in the same order as
        ``d``. When omitted, the notebook-level ``mxs`` list is used, which is
        how ``DataFrame.apply(sum_vals, axis=1)`` calls this function.

    Returns
    -------
    The weighted sum, or 0 when there is nothing to add.
    """
    if maxima is None:
        # Default to the maxima computed at notebook level.
        maxima = mxs

    # Iterate instead of indexing with d[i]: on a row whose index holds column
    # labels, integer keys are a deprecated positional fallback in pandas.
    sm = 0
    for value, peak in zip(d, maxima):
        wt = value / peak
        sm = sm + wt * value
    return sm

def sum_wts(d, maxima=None):
    """Return the sum of the weights of the values in ``d``.

    Each value's weight is the value divided by the matching entry of
    ``maxima``.

    Parameters
    ----------
    d : row of values (Series, list or array), read in positional order.
    maxima : optional sequence of per-column maxima, in the same order as
        ``d``. When omitted, the notebook-level ``mxs`` list is used, which is
        how ``DataFrame.apply(sum_wts, axis=1)`` calls this function.

    Returns
    -------
    The sum of the weights, or 0 when there is nothing to add.
    """
    if maxima is None:
        # Default to the maxima computed at notebook level.
        maxima = mxs

    # Iterate instead of indexing with d[i]: on a row whose index holds column
    # labels, integer keys are a deprecated positional fallback in pandas.
    sm = 0
    for value, peak in zip(d, maxima):
        wt = value / peak
        sm = sm + wt
    return sm



# Take an independent copy: adf[reqs] is a slice of adf, and the column
# assignments below would otherwise be chained assignments onto that slice.
df = adf[reqs].copy()

# Statistics that go into the score.
# NOTE(review): none of these four columns appears in `req`, so `adf` as built
# by the aggregation cell does not contain them and per90() would raise
# KeyError on a fresh run. Confirm which stats file / `req` list produced the
# saved outputs.
sts = ['BR','TKLW', 'INT', 'BLK']
sts90 = [i + '90' for i in sts]

#Convert to per 90 stats
df = per90(df, adf, sts)

# Largest per-90 value of each statistic; sum_vals/sum_wts divide by these.
mxs = [df[i].max() for i in sts90]

#Get Sum of Values, and Sum of their weights
df['SumVal'] = df[sts90].apply(sum_vals, axis = 1)
df['SumWt'] = df[sts90].apply(sum_wts, axis = 1)


In [804]:
# Include only players who have played more than 900 minutes.
logic = 900 < df['MIN']
xdf = df.loc[logic]

In [805]:
# Highest weighted score first.
xdf = xdf.sort_values('SumVal', ascending = False)

## Visualizing Data

In [806]:
# How many players to show: the first 30 rows of xdf, which the previous cell
# sorted by 'SumVal' in descending order.
vdf = xdf.iloc[:30]

In [807]:
# Apply seaborn's default styling to the figure drawn below.
sns.set()

fig = plt.figure(figsize = (10, 10))

# Order the players left to right so that x-axis neighbours are adjacent rows.
vdf = vdf.sort_values(by = 'SumWt', ascending=True)
n = vdf['Player'].values
x = vdf['SumWt'].values
y = vdf['SumVal'].values

# Alignment shared by every player label (passed to plt.text as its fontdict).
config = {
    'ha': 'center',
    'va': 'center'
}

for i, txt in enumerate(n):
    if i != 0:
        disx = x[i] - x[i-1]
        # Only draw a point that lies more than 0.0065 to the right of the
        # previous row; closer points are left out of the plot entirely.
        if not disx > 0.0065:
            continue
    # The first row has no left-hand neighbour and is always drawn.
    plt.scatter(x[i], y[i], color = 'red')
    plt.text(x[i], y[i] + 0.1, txt, config, fontsize = 9, rotation = 0)

# Hand-placed direction markers; the coordinates are in data units.
plt.arrow(2.85, 8.8, 0.1, 0, width = 0.07, head_length = 0.01, color = 'black')
plt.text(2.78, 8.9, "Increasing Defensive Traits", fontsize = 10)

plt.arrow(1.93, 15.5, 0, 1, width = 0.01, head_length = 0.08, color = 'black' )
plt.text(1.95, 16, "Increasing Defensive\n Performances", fontsize = 10)

# Label the axes, add the title and write the figure to disk.
fig = plt.gcf()
ax = plt.gca()
ax.set_xlabel('Sum Of Weights')
ax.set_ylabel('Sum of Weighted Values')
ax.set_title('Best Defensive Right Backs in Europe (Minimum 900 Minutes)')
fig.savefig('rb.jpg', dpi = 300, facecolor = 'white')

<IPython.core.display.Javascript object>

## Provide Ranking System

In [802]:
# Show the full row for one player.
# NOTE(review): the saved output lists '... Rank' columns, which are only added
# by get_ranks() in the cell placed after this one, and a 'DW90' column that no
# visible cell creates. The execution counts are out of order, so the output
# reflects an earlier kernel state. Restart and run all to confirm.
xdf.set_index('Player').loc['Thomas Meunier']

Club              Paris
Country         Belgium
MIN                1888
Age                  27
BR90            6.00636
DW90            4.43326
TKLW90          1.47775
INT90            1.0964
BLK90          0.143008
SumVal          6.31458
SumWt           1.79415
BR90 Rank            36
DW90 Rank           100
TKLW90 Rank          50
INT90 Rank          102
BLK90 Rank          109
Name: Thomas Meunier, dtype: object

In [760]:
def get_ranks(df, sts90):
    """Add a descending rank column for each listed statistic.

    For every column name in ``sts90`` a new column ``name + ' Rank'`` is
    written to ``df``, where the largest value gets rank 1. Ties use pandas'
    default ranking method.

    Parameters
    ----------
    df : DataFrame holding the statistic columns (modified in place).
    sts90 : iterable of column names to rank.

    Returns
    -------
    The same ``df`` object, with the rank columns added.
    """
    for col in sts90:
        ranks = df[col].rank(ascending = False)
        df[col + ' Rank'] = ranks

    return df
# Add a '<stat> Rank' column to the filtered table for every per-90 statistic.
# NOTE(review): the player lookup placed before this cell already displays
# these rank columns, so on a top-to-bottom run this cell would need to come first.
xdf = get_ranks(xdf, sts90)