In [1]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Ranking Data

In [2]:
players = pd.read_pickle('../Resources/tennis_clean/atp_players.pkl')

In [3]:
ranking_files = glob.glob('../Resources/tennis_atp-master/atp_rankings_[0-9]*')

In [4]:
rankings_list = [pd.read_csv(ranking_file, parse_dates=['ranking_date']) for ranking_file in ranking_files]

In [5]:
rankings = pd.concat(rankings_list)

# 'rank' is a pandas function, so take care not to name any columns as such.
rankings.columns = ['ranking_date', 'player_rank', 'player_id', 'ranking_points']

rankings.player_rank = rankings.player_rank.astype('Int64')
rankings.ranking_points = rankings.ranking_points.astype('Int64')

rankings.set_index('player_id', inplace=True)

rankings.info()
rankings.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2920823 entries, 101736 to 100368
Data columns (total 3 columns):
 #   Column          Dtype         
---  ------          -----         
 0   ranking_date    datetime64[ns]
 1   player_rank     Int64         
 2   ranking_points  Int64         
dtypes: Int64(2), datetime64[ns](1)
memory usage: 94.7 MB


Unnamed: 0_level_0,ranking_date,player_rank,ranking_points
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101736,2000-01-10,1,4135
102338,2000-01-10,2,2915
101948,2000-01-10,3,2419
103017,2000-01-10,4,2184
102856,2000-01-10,5,2169


In [6]:
rankings.duplicated().value_counts()

False    1661437
True     1259386
dtype: int64

In [7]:
rankings['player_age'] = rankings.ranking_date - players.loc[rankings.index].birthdate

rankings.reset_index(inplace=True)
rankings.set_index(['player_id', 'ranking_date'], inplace=True)
rankings.sort_index(inplace=True)

rankings.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2920823 entries, (100001, Timestamp('1977-07-04 00:00:00')) to (209939, Timestamp('2020-12-28 00:00:00'))
Data columns (total 3 columns):
 #   Column          Dtype          
---  ------          -----          
 0   player_rank     Int64          
 1   ranking_points  Int64          
 2   player_age      timedelta64[ns]
dtypes: Int64(2), timedelta64[ns](1)
memory usage: 84.3 MB


In [8]:
rankings['best_rank'] = (rankings.reset_index()
                                .groupby('player_id')
                                .player_rank.cummin()
                                .astype('Int64')
                                .values)

rankings.reset_index('ranking_date', inplace=True)
rankings.head()

Unnamed: 0_level_0,ranking_date,player_rank,ranking_points,player_age,best_rank
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100001,1977-07-04,366,,23235 days,366
100001,1978-01-02,414,,23417 days,366
100001,1978-01-16,397,,23431 days,366
100002,1975-03-10,355,,19621 days,355
100002,1975-04-14,383,,19656 days,355


In [10]:
rankings.to_pickle('../Resources/tennis_clean/atp_rankings.pkl')