In [1]:
from pandas import (
    read_csv,
    DataFrame,
    merge,
)
from numpy import (
    std,
    nan,
)

In [2]:
df = read_csv(
    'ratings_fide_november_2020.csv',
    low_memory=False,
    encoding='latin-1',
)

Check 'descriptions.txt' for details on the fields. This was taken from here: https://ratings.fide.com/download_lists.phtml 
Field names in the XML file do not exactly match descriptions on the ratings page in FIDE site.
We will assume that 'games' mean the number of standard rated games and 'rating' means the rating of standard games

In [3]:
df.head()

Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,rating,games,k,rapid_rating,rapid_games,rapid_k,blitz_rating,blitz_games,blitz_k,birthday,flag
0,10688862,"A Abdel Maabod, Hoda",EGY,F,,,,,,,,,,,,,,2009.0,w
1,10224084,"A B M Hasibuzzaman, Tapan",BAN,M,,,,,,,,,,,,,,1977.0,
2,10245154,"A B M Jobair, Hossain",BAN,M,,,,,,,,1599.0,0.0,20.0,,,,1998.0,
3,10243054,"A B M Mustakim, Chowdhury",BAN,M,,,,,,,,,,,,,,2013.0,
4,25121731,A C J John,IND,M,,,,,1063.0,0.0,40.0,,,,,,,1987.0,


In [4]:
df[df['country'] == 'CHN']

Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,rating,games,k,rapid_rating,rapid_games,rapid_k,blitz_rating,blitz_games,blitz_k,birthday,flag
21,8605360,"A La, Teng Hua",CHN,F,,,,,1915.0,0.0,40.0,,,,,,,1993.0,wi
37,8635170,"A, Sihan",CHN,M,,,,,,,,,,,,,,2007.0,
9681,8620270,"Abudureheman, Namaiti",CHN,M,,,,,2169.0,0.0,40.0,1994.0,0.0,20.0,,,,1986.0,
20186,8622930,Ai Kebaier Aikelamu,CHN,M,,,,,1843.0,0.0,40.0,,,,,,,1967.0,i
20188,8610010,"Ai, Erkengjiang",CHN,M,,,,,,,,,,,,,,2000.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982297,8630119,"Zuo, Shengyuan",CHN,M,,,,,,,,,,,,,,2010.0,
982299,8614610,"Zuo, Yifan",CHN,M,CM,,,,2330.0,0.0,20.0,2060.0,0.0,20.0,2055.0,0.0,20.0,2005.0,
982300,8620490,"Zuo, YiMing",CHN,F,,,,,1055.0,0.0,40.0,,,,,,,1990.0,wi
982302,8619832,"Zuo, Zhibo",CHN,M,,,,,1679.0,0.0,40.0,,,,,,,2003.0,i


In [5]:
df.shape

(983184, 19)

In [6]:
df['sex'].value_counts()

M    837331
F    145853
Name: sex, dtype: int64

# Cleaning

In [7]:
"""
Add a column with total games
"""
columns = [
    'games',
    'rapid_games',
    'blitz_games',
    'rating',
    'blitz_rating',
    'rapid_rating',
    'k',
    'rapid_k',
    'blitz_k',
]
"""
Replace nan values in these columns by 0
"""
for col in columns:
    df[col] = df[col].fillna(0)
"""
Replace nan values in titles with ''
"""
columns_title = [
    'title',
    'w_title',
    'o_title',
    'foa_title',
]
for col in columns_title:
    df[col] = df[col].fillna('')
"""
For flags, change nan to 'a' to denote active
"""
df['flag'] = df['flag'].fillna('a')

In [8]:
df['total_games'] = df['games']+df['rapid_games']+df['blitz_games']
df['average_rating'] = (df['rating']+df['rapid_rating']+df['blitz_rating'])/3.0
df['average_games'] = df['total_games']/3.0

In [9]:
df.head()

Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,rating,games,...,rapid_games,rapid_k,blitz_rating,blitz_games,blitz_k,birthday,flag,total_games,average_rating,average_games
0,10688862,"A Abdel Maabod, Hoda",EGY,F,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2009.0,w,0.0,0.0,0.0
1,10224084,"A B M Hasibuzzaman, Tapan",BAN,M,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1977.0,a,0.0,0.0,0.0
2,10245154,"A B M Jobair, Hossain",BAN,M,,,,,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,1998.0,a,0.0,533.0,0.0
3,10243054,"A B M Mustakim, Chowdhury",BAN,M,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2013.0,a,0.0,0.0,0.0
4,25121731,A C J John,IND,M,,,,,1063.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1987.0,a,0.0,354.333333,0.0


In [10]:
"""
Drop players who have no rating
"""
df = df.dropna(
    subset=[
        'rating',
    ],
)
df.shape

(983184, 22)

In [11]:
df.head()

Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,rating,games,...,rapid_games,rapid_k,blitz_rating,blitz_games,blitz_k,birthday,flag,total_games,average_rating,average_games
0,10688862,"A Abdel Maabod, Hoda",EGY,F,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2009.0,w,0.0,0.0,0.0
1,10224084,"A B M Hasibuzzaman, Tapan",BAN,M,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1977.0,a,0.0,0.0,0.0
2,10245154,"A B M Jobair, Hossain",BAN,M,,,,,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,1998.0,a,0.0,533.0,0.0
3,10243054,"A B M Mustakim, Chowdhury",BAN,M,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2013.0,a,0.0,0.0,0.0
4,25121731,A C J John,IND,M,,,,,1063.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1987.0,a,0.0,354.333333,0.0


In [12]:
df['sex'].value_counts()

M    837331
F    145853
Name: sex, dtype: int64

Drop players who are inactive. Also, why are there two different flags for inactivity based on gender, FIDE? What would be wrong with just one flag that says if the player is active or not?

In [13]:
df = df[~df['flag'].isin(['i','wi'])]
df = df.reset_index(drop=True)
df.shape

(814880, 22)

In [14]:
df.head()

Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,rating,games,...,rapid_games,rapid_k,blitz_rating,blitz_games,blitz_k,birthday,flag,total_games,average_rating,average_games
0,10688862,"A Abdel Maabod, Hoda",EGY,F,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2009.0,w,0.0,0.0,0.0
1,10224084,"A B M Hasibuzzaman, Tapan",BAN,M,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1977.0,a,0.0,0.0,0.0
2,10245154,"A B M Jobair, Hossain",BAN,M,,,,,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,1998.0,a,0.0,533.0,0.0
3,10243054,"A B M Mustakim, Chowdhury",BAN,M,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2013.0,a,0.0,0.0,0.0
4,25121731,A C J John,IND,M,,,,,1063.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1987.0,a,0.0,354.333333,0.0


In [15]:
df['sex'].value_counts()

M    687742
F    127138
Name: sex, dtype: int64

In [16]:
"""
Sort the rows by country
"""
df = df.sort_values(
    by=[
        'country',
    ],
)
df = df.reset_index(drop=True)

In [17]:
df.head()

Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,rating,games,...,rapid_games,rapid_k,blitz_rating,blitz_games,blitz_k,birthday,flag,total_games,average_rating,average_games
0,11701242,"Samim, Muhammad Mustafa",AFG,M,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1994.0,a,0.0,0.0,0.0
1,11702052,"Yaqeen, Madina",AFG,M,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2007.0,a,0.0,0.0,0.0
2,11701900,"Yaqeen, Muizzuddin",AFG,M,,,,,1536.0,0.0,...,0.0,20.0,1580.0,0.0,20.0,1980.0,a,0.0,1595.333333,0.0
3,11700440,"Zuhur Razmjo, Abdul",AFG,M,,,IA,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,a,0.0,0.0,0.0
4,11700327,"Yaqoubi, Naqib Ullah",AFG,M,,,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2000.0,a,0.0,0.0,0.0


In [18]:
"""
Take players who has at least one standard rated game
"""
df = df[df['total_games'] > 0]
df = df.reset_index(drop=True)
df.shape

(21642, 22)

In [19]:
df.head()

Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,rating,games,...,rapid_games,rapid_k,blitz_rating,blitz_games,blitz_k,birthday,flag,total_games,average_rating,average_games
0,11701552,"Mohtaat, Homayoun",AFG,M,,,,AIM,1791.0,8.0,...,0.0,20.0,1796.0,0.0,20.0,1972.0,a,8.0,1755.666667,2.666667
1,4705114,"Xhembulla, Aleksander",ALB,M,,,,,1739.0,0.0,...,8.0,20.0,0.0,0.0,0.0,2006.0,a,8.0,1143.333333,2.666667
2,4701984,"Shqau, Olta",ALB,F,,,,,1340.0,4.0,...,0.0,0.0,0.0,0.0,0.0,2002.0,w,4.0,446.666667,1.333333
3,4701585,"Shuqja, Klean",ALB,F,WCM,WCM,,,1905.0,12.0,...,0.0,20.0,1760.0,0.0,20.0,2006.0,w,12.0,1743.666667,4.0
4,4703669,"Ramaj, Ergit",ALB,M,,,,,1482.0,5.0,...,0.0,20.0,0.0,0.0,0.0,2008.0,a,5.0,972.0,1.666667


In [20]:
df['sex'].value_counts()

M    19557
F     2085
Name: sex, dtype: int64

In [21]:
genders = {
    'woman': 'F',
    'man': 'M',
}
dfs = {}
for gender in genders:
    dfs[gender] = df[df['sex'] == genders[gender]]
    dfs[gender] = dfs[gender].reset_index(drop=True)
    print(gender, dfs[gender].shape[0])

woman 2085
man 19557


In [22]:
dfs['woman'].head()

Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,rating,games,...,rapid_games,rapid_k,blitz_rating,blitz_games,blitz_k,birthday,flag,total_games,average_rating,average_games
0,4701984,"Shqau, Olta",ALB,F,,,,,1340.0,4.0,...,0.0,0.0,0.0,0.0,0.0,2002.0,w,4.0,446.666667,1.333333
1,4701585,"Shuqja, Klean",ALB,F,WCM,WCM,,,1905.0,12.0,...,0.0,20.0,1760.0,0.0,20.0,2006.0,w,12.0,1743.666667,4.0
2,4700708,"Pasku, Roela",ALB,F,WFM,WFM,,,1907.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1990.0,w,2.0,635.666667,0.666667
3,4705807,"Ndreko, Ailin",ALB,F,,,,,0.0,0.0,...,5.0,20.0,0.0,0.0,0.0,2012.0,w,5.0,354.333333,1.666667
4,7900414,"Nouali, Djouher",ALG,F,,,,,1748.0,1.0,...,0.0,20.0,1728.0,0.0,20.0,1987.0,w,1.0,1713.333333,0.333333


In [23]:
for gender in genders:
    column = gender+'_country'
    dfs[column] = dfs[gender].groupby(
        by=[
            'country',
        ],
        as_index=False,
    )[['rating', 'title']].agg(lambda x: list(x))
    dfs[column]['rating'] = dfs[column]['rating'].apply(lambda x: sorted(x))
    dfs[column]['count'] = dfs[column]['rating'].apply(lambda x: len(x))
    dfs[column]['total'] = dfs[column]['rating'].apply(lambda x: sum(x))
    dfs[column]['average'] = dfs[column]['total']/dfs[column]['count']
    dfs[column]['std'] = dfs[column]['rating'].apply(lambda x: std(x))
    dfs[column] = round(dfs[column], 2)
    dfs[column] = dfs[column].sort_values(
        by=[
            'count',
            'average',
            'country',
        ],
    )
    dfs[column] = dfs[column].reset_index(drop=True)
    print(gender, dfs[column].shape[0])

woman 69
man 109


In [24]:
dfs['man_country'].head()

Unnamed: 0,country,rating,count,total,average,std
0,CHN,[0.0],1,0.0,0.0,0.0
1,HKG,[0.0],1,0.0,0.0,0.0
2,TPE,[1110.0],1,1110.0,1110.0,0.0
3,JPN,[1201.0],1,1201.0,1201.0,0.0
4,FIJ,[1568.0],1,1568.0,1568.0,0.0


In [25]:
dfs['woman_country'].head()

Unnamed: 0,country,rating,count,total,average,std
0,USA,[1105.0],1,1105.0,1105.0,0.0
1,FIJ,[1148.0],1,1148.0,1148.0,0.0
2,SWZ,[1368.0],1,1368.0,1368.0,0.0
3,TUN,[1463.0],1,1463.0,1463.0,0.0
4,SYR,[1581.0],1,1581.0,1581.0,0.0
