In [11]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from scipy import stats
import matplotlib.pyplot as plt

## import the data (1991 - 2021)

In [12]:
df_atp = pd.read_csv('tennis_atp-master/atp_matches_1991.csv')
for i in range(2,10):
    df = pd.read_csv(f'tennis_atp-master/atp_matches_199{i}.csv')
    df_atp = pd.concat([df_atp,df], ignore_index=True)
for i in range(0, 23):
    b = ''
    if i < 10:
        b = 0
    df = pd.read_csv(f'tennis_atp-master/atp_matches_20{b}{i}.csv')
    df_atp = pd.concat([df_atp,df], ignore_index=True)

In [13]:
df_atp = df_atp.sort_values(by = ['tourney_date', 'match_num'], ignore_index=True)
#delete samll tournaments, davis, olympics, retirements and walkovers
df_atp = df_atp.loc[(df_atp['draw_size']>=16) & (df_atp['round']!= 'BR') & 
                    (df_atp['tourney_name'].str.contains("Olympics") == False) &
                    (df_atp['score'].str.contains("W/O") == False) &
                    (df_atp['score'].str.contains("RET") == False) &
                    (df_atp['score'].str.contains("RET") == False) &
                    (df_atp['score'].str.contains("Played and unfinished") == False)]

In [14]:
def get_smaller_than(data, date): # date: year month day
    I = data.loc[:, 'tourney_date'] <= int(f'{date}')
    I = np.asarray(I)
    return data.loc[I]
def get_bigger_than(data, date): # date: year month day
    I = data.loc[:, 'tourney_date'] >= int(f'{date}')
    I = np.asarray(I)
    return data.loc[I]

In [15]:
df_atp = df_atp.drop(['winner_seed', 'winner_entry', 'loser_seed','loser_entry'], axis=1)
df_atp = df_atp.dropna()
df_atp = df_atp.reset_index(drop=True)
print(df_atp)

      tourney_id    tourney_name surface  draw_size tourney_level  \
0       1991-301        Auckland    Hard         32             A   
1       1991-301        Auckland    Hard         32             A   
2       1991-338  Sydney Outdoor    Hard         32             A   
3       1991-301        Auckland    Hard         32             A   
4       1991-338  Sydney Outdoor    Hard         32             A   
...          ...             ...     ...        ...           ...   
78549  2021-0429       Stockholm    Hard         32             A   
78550  2021-0429       Stockholm    Hard         32             A   
78551  2021-0429       Stockholm    Hard         32             A   
78552  2021-0429       Stockholm    Hard         32             A   
78553  2021-0429       Stockholm    Hard         32             A   

       tourney_date  match_num  winner_id             winner_name winner_hand  \
0          19910107          1     101142          Emilio Sanchez           R   
1        

In [16]:
df_2021 = get_bigger_than(df_atp, 20201231)
df_2021 = df_2021.reset_index(drop=True)

In [17]:
# fix the dataset
df_tennis = df_atp

In [18]:
#Balanced dataset
# indexes of won matches player0 -> I_0 and player -> I_1
def split_target(data):
    I_0 = np.sort(data.sample(frac=0.50).index)
    I_1 = np.arange(data.index[0], data.index[-1] + 1)
    I_1 = np.delete(I_1, I_0) 
    return I_0, I_1

In [19]:
I_0,I_1 = split_target(df_tennis)

In [20]:
def generate_data_on_Nan(column):
    m = df_tennis[column].mean()
    std = df_tennis[column].std()
    print(m, std)
    for ind in df_tennis.index:
        if np.isnan(df_tennis.at[ind, column]):
            df_tennis.at[ind, column] = np.random.normal(m, std)

We will deal with Nans only then, when we are conducting some function on them. When Nan is for example in height, we will take care of it later.

# Extracting features

#### abbreviation (dopisz tu swoje zmiany + zaczalem robic points_features (pozbylem nanow) to mozesz tam dokonczyc
Player_features: (sum = 88)

straightforward: 2 + 2 + 2 + 2 +2 = 10, wins&losses: 2 x 2 x 3 x 3 = 36, 2 x 2 x 2 x 3 = 25, 2 x 2 = 4, 2 x 2 = 4, activity: 2 x 3 = 6, 2 x 1 = 2, elo: 2 x 1 = 2, points_statistics: KRZYCHUKRZYCHUKRZYCHU

Player_percents: (sum = 36)

3 x 3 x 2 = 18, 2 x 3 x 2 = 12, 2 x 2 = 4, 2 x 1 = 2

Match_features: (sum = 6)

1 + 1 + 1 + 1 + 1 + 1 = 6

Summing up: 130 + 1 (target column)

# Player features

In [21]:
player0_features = dict()
player1_features = dict()

In [47]:
def check_target(ind, ind_boost, player0_features, player1_features):
    if ind_boost < len(I_1) and ind == I_0[ind_boost]:
        ind_boost += 1
        winner = player1_features
        loser = player0_features
    else:
        winner = player0_features
        loser = player1_features 
    return ind_boost, winner,loser

def normalize(data):
    return (data-data.mean()) / data.std()

# get period of time
def before_date_match(data, date, match_num): 
    I = data.loc[:, 'tourney_date'] <= int(date)
    
    I_delete = data.loc[:, 'tourney_date'] == int(date)
    I_delete &= data.loc[:, 'match_num'] >= match_num
    I =  I^I_delete
    I = np.asarray(I)
    return data.loc[I]

def semester_back(date):
    year = date // 10000
    month = (date // 100) % 100
    day = date % 100
    if month < 7:
        year -= 1
    month = (month - 6) % 12 #(-1 % 12 = 11)
    if month == 0:
        month = 12
    ans = ("%04d%02d%02d" % (year,month,day))
    return int(ans)

def get_wins(data, time, date, num, player_id):
    date = int(date)
    if time == 'semester':
        new_date = semester_back(date)
        period = get_bigger_than(data, new_date)
    if time == 'year':
        period = get_bigger_than(data, date - 10000)
    if time == 'career':
        period = data
        
    period = before_date_match(period, date, num)
    period = period['winner_id'].value_counts()
    if player_id in period:
        return period[player_id]
    else:
        return 0

    
def get_losses(data, time, date, num, player_id):
    date = int(date)
    if time == 'semester':
        new_date = semester_back(date)
        period = get_bigger_than(data, new_date)
    if time == 'year':
        period = get_bigger_than(data, date - 10000)
    if time == 'career':
        period = data
        
    period = before_date_match(period, date, num)
    period = period['loser_id'].value_counts()
    if player_id in period:
        return period[player_id]
    else:
        return 0
    
def get_round_types(data, types, draw):
    I = data.loc[:, 'draw_size'] == draw
    I_t = data.loc[:, 'round'] == types[0]
    for t in types[1:]:
        I_t |= data.loc[:, 'round'] == t
    return I & I_t

def get_level(data, level):
    if level == 'level1':
        I_128 = get_round_types(data, ['F','SF','QF'], 128)
        I_64 = get_round_types(data, ['F','SF'], 64)
        I_32 = get_round_types(data, ['F'], 32) | get_round_types(data, ['F'], 16)
        I = I_128 | I_64 | I_32
    if level == 'level2':
        I_128 = get_round_types(data, ['R16','R32'], 128)
        I_64 = get_round_types(data, ['QF','R16'], 64)
        I_32 = get_round_types(data, ['SF','QF'], 32) | get_round_types(data, ['SF','QF'], 16)
        I = I_128 | I_64 | I_32
    if level == 'level3':
        I_128 = get_round_types(data, ['R64','R128'], 128)
        I_64 = get_round_types(data, ['R32','R64'], 64)
        I_32 = get_round_types(data, ['R16', 'R32'], 32) | get_round_types(data, ['R16', 'R32'], 16)
        I = I_128 | I_64 | I_32
    return data.loc[I]


def get_match_w_counter(data,date,num,player_id):
    period = before_date_match(data, date, num)
    period = period['winner_id'].value_counts()
    if player_id in period:
        return period[player_id]
    else:
        return 0
def get_match_l_counter(data,date,num,player_id):
    period = before_date_match(data, date, num)
    period = period['loser_id'].value_counts()
    if player_id in period:
        return period[player_id]
    else:
        return 0

def get_w_set_in_match(score):
    score = score.split(" ")
    counter = 0
    for sets in score:
        if len(sets)==5 and sets[2]=='-':
            if int(sets[0:1]) > int(sets[3:4]):
                counter = counter + 1
        elif len(sets)==3:
            if sets[0].isnumeric() and sets[2].isnumeric():
                if int(sets[0]) > int(sets[2]):
                    counter = counter + 1
    return counter

def get_l_set_in_match(score):
    score = score.split(" ")
    counter = 0
    for sets in score:
        if len(sets)==5 and sets[2]=='-':
            if int(sets[0:1]) < int(sets[3:4]):
                counter = counter + 1
        elif len(sets)==3:
            if sets[0].isnumeric() and sets[2].isnumeric():
                if int(sets[0]) < int(sets[2]):
                    counter = counter + 1
    return counter

def get_w_game_in_match(score):
    score = score.split(" ")
    counter = 0
    for sets in score:
        if len(sets)==3:
            if sets[0].isnumeric() and sets[2].isnumeric():
                counter = counter + int(sets[0])
    return counter

def get_l_game_in_match(score):
    score = score.split(" ")
    counter = 0
    for sets in score:
        if len(sets)==3:
            if sets[0].isnumeric() and sets[2].isnumeric():
                counter = counter + int(sets[2])
    return counter

def get_w_game(data, time, date, num, player_id):
    date = int(date)
    if time == 'semester':
        new_date = semester_back(date)
        period = get_bigger_than(data, new_date)
    if time == 'year':
        period = get_bigger_than(data, date - 10000)
    if time == 'career':
        period = data
        
    period = before_date_match(period, date, num)
    period = period.loc[period['winner_id']== player_id]
    return sum(period['score'].apply(get_w_game_in_match))

def get_l_game(data, time, date, num, player_id):
    date = int(date)
    if time == 'semester':
        new_date = semester_back(date)
        period = get_bigger_than(data, new_date)
    if time == 'year':
        period = get_bigger_than(data, date - 10000)
    if time == 'career':
        period = data
        
    period = before_date_match(period, date, num)
    period = period.loc[period['loser_id']== player_id]
    return sum(period['score'].apply(get_l_game_in_match))


def get_w_set(data, time, date, num, player_id):
    date = int(date)
    if time == 'semester':
        new_date = semester_back(date)
        period = get_bigger_than(data, new_date)
    if time == 'year':
        period = get_bigger_than(data, date - 10000)
    if time == 'career':
        period = data
        
    period = before_date_match(period, date, num)
    period = period.loc[period['winner_id']== player_id]
    return sum(period['score'].apply(get_w_set_in_match))

def get_l_set(data, time, date, num, player_id):
    date = int(date)
    if time == 'semester':
        new_date = semester_back(date)
        period = get_bigger_than(data, new_date)
    if time == 'year':
        period = get_bigger_than(data, date - 10000)
    if time == 'career':
        period = data
        
    period = before_date_match(period, date, num)
    period = period.loc[period['loser_id']== player_id]
    return sum(period['score'].apply(get_l_set_in_match))

$$\{Player_1, Player_2\} \times \{winsGame, lossesGame\} \times \{semester,year,career\} \times \{level1, level2, level3\}$$

In [23]:
for result in ['wins_game', 'losses_game']:
    for time in ['semester', 'year', 'career']:
        for level in ['level1', 'level2', 'level3']:
            player0_features[f'{result}_{time}_{level}'] = []
            player1_features[f'{result}_{time}_{level}'] = []
ind_boost = 0
for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    for time in ['semester', 'year', 'career']:
        for level in ['level1', 'level2', 'level3']:
            spec_data = get_level(df_tennis, level)
            winner[f'wins_game_{time}_{level}'].append(get_w_game(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
            loser[f'wins_game_{time}_{level}'].append(get_w_game(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))
            winner[f'losses_game_{time}_{level}'].append(get_l_game(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
            loser[f'losses_game_{time}_{level}'].append(get_l_game(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))
            
            

HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




$$\{Player_1, Player_2\} \times \{winsSet, lossesSet\} \times \{semester,year,career\} \times \{level1, level2, level3\}$$

In [49]:
for result in ['wins_set', 'losses_set']:
    for time in ['semester', 'year', 'career']:
        for level in ['level1', 'level2', 'level3']:
            player0_features[f'{result}_{time}_{level}'] = []
            player1_features[f'{result}_{time}_{level}'] = []
ind_boost = 0
for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    for time in ['semester', 'year', 'career']:
        for level in ['level1', 'level2', 'level3']:
            spec_data = get_level(df_tennis, level)
            winner[f'wins_set_{time}_{level}'].append(get_w_set(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
            loser[f'wins_set_{time}_{level}'].append(get_w_set(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))
            winner[f'losses_set_{time}_{level}'].append(get_l_set(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
            loser[f'losses_set_{time}_{level}'].append(get_l_set(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))

HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




$$\{Player_1, Player_2\} \times \{matchCounter\} $$

In [None]:

player0_features[f'match_counter'] = []
player1_features[f'match_counter'] = []
ind_boost = 0
for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    winner[f'match_counter'].append(get_match_w_counter(df_tennis, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
    loser[f'match_counter'].append(get_match_l_counter(df_tennis, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))


#### straightforward data
$$\{Player_1, Player_2\} \times \{age\}$$
$$\{Player_1, Player_2\} \times \{height\}$$
$$\{Player_1, Player_2\} \times \{hand\}$$
$$\{Player_1, Player_2\} \times \{rank\}$$
$$\{Player_1, Player_2\} \times \{rank\_points\}$$

In [None]:
#adding features
for feature in ['age', 'ht', 'hand', 'rank', 'rank_points']:
    player0_features[feature] = []
    player1_features[feature] = []

target = []
ind_boost = 0 # boost chechikng which player was assigned to winner
for ind in df_tennis.index: 
    ind_boost_b = ind_boost
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)   
    
    if ind_boost > ind_boost_b:
        target.append(1)
    else:
        target.append(0)
    for feature in ['age', 'ht', 'hand', 'rank', 'rank_points']:
        if feature == 'hand':
            winner[feature].append(int(df_tennis.at[ind, f'winner_{feature}'] == 'R'))
            loser[feature].append(int(df_tennis.at[ind, f'loser_{feature}'] == 'R'))
        else:
            winner[feature].append(df_tennis.at[ind, f'winner_{feature}'])
            loser[feature].append(df_tennis.at[ind, f'loser_{feature}'])
# Y[i] is result of X[i] match
Y = pd.DataFrame(target, columns=['target'])


#### wins in semester, year and career
$$\{Player_1\}\times\{Player_2\}\times\{wins,losses\}\times\{level_1,
level_2,level_3\}\times\{semester,year,career\}$$
level1,level2,level3 - macthes types: level1: {F,SF,QF}slam ,  {F,SF} 1000, {F}500
level2: {R16, R32} slam, {QF, R16} 1000, {SF, QF}500 
level3: {R64, R128} slam, {R32, R64} 1000, {R16, R32}500

In [25]:
# adding features
for result in ['wins', 'losses']:
    for time in ['semester', 'year', 'career']:
        for level in ['level1', 'level2', 'level3']:
            player0_features[f'{result}_{time}_{level}'] = []
            player1_features[f'{result}_{time}_{level}'] = []
ind_boost = 0
for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    for time in ['semester', 'year', 'career']:
        for level in ['level1', 'level2', 'level3']:
            spec_data = get_level(df_tennis, level)
            winner[f'wins_{time}_{level}'].append(get_wins(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
            loser[f'wins_{time}_{level}'].append(get_wins(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))
            winner[f'losses_{time}_{level}'].append(get_losses(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
            loser[f'losses_{time}_{level}'].append(get_losses(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))

HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




#### wins on surfaces
$$\{Player_1, Player_2\} \times \{wins,losses\}\times\{semester, career\}\times\{clay,hardcourt,grass\}$$ carpet is no more used in ATP 

In [26]:
def get_surface(data, surface):
    I = data.loc[:, 'surface'] == surface
    return data.loc[I]

In [27]:
# adding features
for result in ['wins', 'losses']:
    for time in ['semester', 'career']:
        for surface in ['Hard', 'Clay', 'Grass']:
            player0_features[f'{result}_{time}_{surface}'] = []
            player1_features[f'{result}_{time}_{surface}'] = []
ind_boost = 0
for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    for time in ['semester', 'career']:
        for surface in ['Hard', 'Clay', 'Grass']:
            spec_data = get_surface(df_tennis, surface)
            winner[f'wins_{time}_{surface}'].append(get_wins(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                        df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
            loser[f'wins_{time}_{surface}'].append(get_wins(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                        df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))
            winner[f'losses_{time}_{surface}'].append(get_losses(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                        df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
            loser[f'losses_{time}_{surface}'].append(get_losses(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                                        df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))

HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




#### head-to-head statistics
$$\{Player_1, Player_2\} \times \{year,career\}\times\{wins\}$$

In [50]:
# calculate  #wins head-to-head in period
def get_hth_wins(data, time, date, num, player_id, against_id):
    date = int(date)
    if time == 'semester':
        new_date = semester_back(date)
        period = get_bigger_than(data, new_date)
    if time == 'year':
        period = get_bigger_than(data, date - 10000)
    if time == 'career':
        period = data
        
    period = before_date_match(period, date, num)
    
    groups = period.groupby(by=['winner_id', 'loser_id']).size()
    if (player_id, against_id) in groups:
        return groups[(player_id, against_id)]
    else:
        return 0

In [51]:
# adding features
for time in ['year', 'career']:
    player0_features[f'hth_wins_{time}'] = []
    player1_features[f'hth_wins_{time}'] = []
ind_boost = 0
for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    for time in ['year', 'career']:
        winner[f'hth_wins_{time}'].append(get_hth_wins(df_tennis, time, df_tennis.at[ind, 'tourney_date'], 
                            df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id'], df_tennis.at[ind, 'loser_id']))
        loser[f'hth_wins_{time}'].append(get_hth_wins(df_tennis, time, df_tennis.at[ind, 'tourney_date'], 
                            df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id'], df_tennis.at[ind, 'winner_id']))

HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




#### wins in current tournament
$$\{Player_1, Player_2\}\times\{wins,losses\}$$

In [52]:
def get_tournament(data, tourney):
    I = data.loc[:, 'tourney_name'] == tourney
    return data.loc[I]   

In [53]:
# adding features
for result in ['wins', 'losses']:
    player0_features[f'tournament_{result}'] = []
    player1_features[f'tournament_{result}'] = []
ind_boost = 0
for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    spec_data = get_tournament(df_tennis, df_tennis.at[ind, 'tourney_name'])
    time = 'career' # num of wins during whole career
    winner[f'tournament_wins'].append(get_wins(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
    loser[f'tournament_wins'].append(get_wins(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))
    winner[f'tournament_losses'].append(get_losses(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
    loser[f'tournament_losses'].append(get_losses(spec_data, time, df_tennis.at[ind, 'tourney_date'], 
                df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))

HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




#### minutes played recently
$$\{Player_1, Player_2\}\times\{last\_match, in\_tournament, semester\}\times\{minutes\_played\}$$

In [54]:
# generate gaussian minutes if there is nan
print(df_tennis['minutes'].isna().sum())
#generate_data_on_Nan('minutes')

0


In [55]:
def get_minutes(data, time, date, num, player_id, toruney_name):
    date = int(date)
    if time == 'semester':
        new_date = semester_back(date)
        period = get_bigger_than(data, new_date)
    if time == 'in_tournament':
        period = get_bigger_than(data, date)
    if time == 'last_match':
        period = get_bigger_than(data, date)
        period = before_date_match(period, date, num)
        for ind in period.index[::-1]:
            if period.at[ind, 'winner_id'] == player_id:
                return period.at[ind, 'minutes']
        
    period = before_date_match(period, date, num)
    g1 = period.groupby(['winner_id'])['minutes'].agg('sum')
    g2 = period.groupby(['loser_id'])['minutes'].agg('sum')
    ans = 0
    
    if player_id in g1:
        ans += g1[player_id]
    if player_id in g2:
        ans += g2[player_id]
        
    return ans

In [56]:
# adding features
for time in ['last_match', 'in_tournament', 'semester']:
    player0_features[f'minutes_played_{time}'] = []
    player1_features[f'minutes_played_{time}'] = []

ind_boost = 0
for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    for time in ['last_match', 'in_tournament', 'semester']:
        toruney_name = df_tennis.at[ind, 'tourney_name']
        winner[f'minutes_played_{time}'].append(get_minutes(df_tennis, time, df_tennis.at[ind, 'tourney_date'], 
            df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id'], toruney_name))
        loser[f'minutes_played_{time}'].append(get_minutes(df_tennis, time, df_tennis.at[ind, 'tourney_date'], 
            df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id'], toruney_name))

HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




#### number of weeks inactive
$$\{Player_1, Player_2\}\times\{inactive\_weeks\}$$

In [57]:
from datetime import date
def get_break_time(date1, date2):
    year1 = date1 // 10000
    month1 = (date1 // 100) % 100
    day1 = date1 % 100
    year2 = date2 // 10000
    month2 = (date2 // 100) % 100
    day2 = date2 % 100
    d1 = date(year1, month1, day1)
    d2 = date(year2, month2, day2)
    return (d1 - d2).days / 7
    

In [58]:
def get_last_activity(data, date, num, player_id):
    date = int(date)
    # cut data to 6months year
    new_date = semester_back(date)
    period = get_bigger_than(data, new_date)
    period = before_date_match(period, date, num)
    g1 = period.groupby('winner_id')['tourney_date'].agg(min)
    g2 = period.groupby('loser_id')['tourney_date'].agg(min)
    last = date
    if player_id in g1:
        last = g1[player_id]
    if player_id in g2:
        last = min(last, g2[player_id])
    return get_break_time(date, int(last))

In [59]:
# adding features
player0_features[f'inactive_weeks'] = []
player1_features[f'inactive_weeks'] = []

ind_boost = 0
for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    winner[f'inactive_weeks'].append(get_last_activity(df_tennis, df_tennis.at[ind, 'tourney_date'], 
        df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
    loser[f'inactive_weeks'].append(get_last_activity(df_tennis, df_tennis.at[ind, 'tourney_date'], 
        df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))

HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




#### Serve and return stats
$$\{Player_1, Player_2\} \times \{PercentServeWins, PercentServeIn, PercentReturnWins\} \times \{semester,year,career\}$$

In [None]:
# generate gaussian points if there is nan
print(df_tennis['w_ace'].isna().sum(),df_tennis['l_ace'].isna().sum())
print(df_tennis['w_svpt'].isna().sum(),df_tennis['l_svpt'].isna().sum())
print(df_tennis['w_1stWon'].isna().sum(),df_tennis['l_1stWon'].isna().sum())
print(df_tennis['w_2ndWon'].isna().sum(),df_tennis['l_2ndWon'].isna().sum())
for c in ['w_ace', 'l_ace', 'w_1stWon', 'w_2ndWon', 'l_1stWon', 'l_2ndWon'
         'w_svpt', 'l_svpt']:
    pass
    #generate_data_on_Nan(c)

In [60]:
def get_percent_svpt(data, time, date, num, player_id):
    date = int(date)
    if time == 'semester':
        new_date = semester_back(date)
        period = get_bigger_than(data, new_date)
    if time == 'year':
        period = get_bigger_than(data, date - 10000)
    if time == 'career':
        period = data
        
    period = before_date_match(period, date, num)
    period_1 = period.loc[period['winner_id']==player_id]
    period_2 = period.loc[period['loser_id']==player_id]
    if not period_1.empty and not period_2.empty:
        return (sum(period_2['l_1stWon']) + sum(period_2['l_2ndWon']) + sum(period_1['w_1stWon'])
                + sum(period_1['w_2ndWon']) ) / (sum(period_1['w_svpt']) + sum(period_2['l_svpt']))
    else:
        return 0

def get_percent_rtpt(data, time, date, num, player_id):
    date = int(date)
    if time == 'semester':
        new_date = semester_back(date)
        period = get_bigger_than(data, new_date)
    if time == 'year':
        period = get_bigger_than(data, date - 10000)
    if time == 'career':
        period = data
        
    period = before_date_match(period, date, num)
    period_1 = period.loc[period['winner_id']==player_id]
    period_2 = period.loc[period['loser_id']==player_id]
    if not period_1.empty and not period_2.empty: 
        return (sum(period_1['l_svpt']) - sum(period_1['l_1stWon']) - sum(period_1['l_2ndWon']) + 
                sum(period_2['w_svpt']) - sum(period_2['w_1stWon']) - sum(period_2['w_2ndWon'])) / (sum(period_2['w_svpt']) + sum(period_1['l_svpt']))
    else:
        return 0
    
def get_percent_insv(data, time, date, num, player_id):
    date = int(date)
    if time == 'semester':
        new_date = semester_back(date)
        period = get_bigger_than(data, new_date)
    if time == 'year':
        period = get_bigger_than(data, date - 10000)
    if time == 'career':
        period = data
        
    period = before_date_match(period, date, num)
    period_1 = period.loc[period['winner_id']==player_id]
    period_2 = period.loc[period['loser_id']==player_id]
    if not period_1.empty and not period_2.empty:
        return (sum(period_1['w_1stIn']) + sum(period_2['l_1stIn']) ) / (sum(period_1['w_svpt']) + sum(period_2['l_svpt']))
    else:
        return 0


for time in ['semester', 'year', 'career']:
        player0_features[f'percent_svpt_{time}'] = []
        player1_features[f'percent_svpt_{time}'] = []
ind_boost = 0

for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    for time in ['semester', 'year', 'career']:
        winner[f'percent_svpt_{time}'].append(get_percent_svpt(df_tennis, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
        loser[f'percent_svpt_{time}'].append(get_percent_svpt(df_tennis, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))
        
for time in ['semester', 'year', 'career']:
        player0_features[f'percent_rtpt_{time}'] = []
        player1_features[f'percent_rtpt_{time}'] = []
ind_boost = 0

for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    for time in ['semester', 'year', 'career']:
        winner[f'percent_rtpt_{time}'].append(get_percent_rtpt(df_tennis, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
        loser[f'percent_rtpt_{time}'].append(get_percent_rtpt(df_tennis, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))

for time in ['semester', 'year', 'career']:
        player0_features[f'percent_insv_{time}'] = []
        player1_features[f'percent_insv_{time}'] = []
ind_boost = 0

for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, player0_features, player1_features)
    for time in ['semester', 'year', 'career']:
        winner[f'percent_insv_{time}'].append(get_percent_insv(df_tennis, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'winner_id']))
        loser[f'percent_insv_{time}'].append(get_percent_insv(df_tennis, time, df_tennis.at[ind, 'tourney_date'], 
                                           df_tennis.at[ind, 'match_num'], df_tennis.at[ind, 'loser_id']))

HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




In [62]:
# create dataframe
keys = player0_features.keys()
print(keys)
p0_d_f = dict()
for feature in list(keys):  
    p0_d_f[f'{feature}_0'] = player0_features[f'{feature}']
        
keys = player1_features.keys()
p1_d_f = dict()
for feature in list(keys):
    p1_d_f[f'{feature}_1'] = player1_features[f'{feature}']
        
player0_df = pd.DataFrame(p0_d_f)
player1_df = pd.DataFrame(p1_d_f)

dict_keys(['wins_game_semester_level1', 'wins_game_semester_level2', 'wins_game_semester_level3', 'wins_game_year_level1', 'wins_game_year_level2', 'wins_game_year_level3', 'wins_game_career_level1', 'wins_game_career_level2', 'wins_game_career_level3', 'losses_game_semester_level1', 'losses_game_semester_level2', 'losses_game_semester_level3', 'losses_game_year_level1', 'losses_game_year_level2', 'losses_game_year_level3', 'losses_game_career_level1', 'losses_game_career_level2', 'losses_game_career_level3', 'age', 'ht', 'hand', 'rank', 'rank_points', 'wins_semester_level1', 'wins_semester_level2', 'wins_semester_level3', 'wins_year_level1', 'wins_year_level2', 'wins_year_level3', 'wins_career_level1', 'wins_career_level2', 'wins_career_level3', 'losses_semester_level1', 'losses_semester_level2', 'losses_semester_level3', 'losses_year_level1', 'losses_year_level2', 'losses_year_level3', 'losses_career_level1', 'losses_career_level2', 'losses_career_level3', 'wins_semester_Hard', 'wins

In [63]:
player_df = pd.concat([player0_df, player1_df],axis=1)
#player_df['age_0'] = normalize(player_df['age_0'])
player_df

Unnamed: 0,wins_game_semester_level1_0,wins_game_semester_level2_0,wins_game_semester_level3_0,wins_game_year_level1_0,wins_game_year_level2_0,wins_game_year_level3_0,wins_game_career_level1_0,wins_game_career_level2_0,wins_game_career_level3_0,losses_game_semester_level1_0,...,inactive_weeks_1,percent_svpt_semester_1,percent_svpt_year_1,percent_svpt_career_1,percent_rtpt_semester_1,percent_rtpt_year_1,percent_rtpt_career_1,percent_insv_semester_1,percent_insv_year_1,percent_insv_career_1
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78549,0,49,171,0,63,242,12,214,757,9,...,26.0,0.639900,0.635254,0.625753,0.362605,0.373617,0.373126,0.617456,0.624104,0.610792
78550,0,0,140,0,0,148,836,1845,2709,0,...,26.0,0.622794,0.617173,0.620370,0.373569,0.375650,0.366346,0.591798,0.607874,0.607933
78551,0,96,176,12,137,270,18,298,663,32,...,26.0,0.651163,0.658926,0.657471,0.375589,0.370202,0.358417,0.588372,0.586938,0.587202
78552,0,43,140,0,43,180,0,67,410,0,...,23.0,0.652122,0.639755,0.634880,0.365942,0.368883,0.355067,0.612059,0.602699,0.616808


# Player percent of wins/losses

#### percentage of wins
for every feature with wins and losses, we will take percent of it

In [64]:
wins_percent0 = dict()
wins_percent1 = dict()

In [65]:
for time in ['semester', 'year', 'career']:
    for level in ['level1', 'level2', 'level3']:
        wins_percent0[f'percent_{time}_{level}_0'] = []
        wins_percent1[f'percent_{time}_{level}_1'] = []


for time in ['semester', 'career']:
    for surface in ['Hard', 'Clay', 'Grass']:
        wins_percent0[f'percent_{time}_{surface}_0'] = []
        wins_percent1[f'percent_{time}_{surface}_1'] = []
            
for time in ['year', 'career']:
    wins_percent0[f'percent_hth_wins_{time}_0'] = []
    wins_percent1[f'percent_hth_wins_{time}_1'] = []
    

wins_percent0[f'percent_tournament'] = []
wins_percent1[f'percent_tournament'] = []

In [66]:
#adding feature
for ind in tqdm(player_df.index): 
    for time in ['semester', 'year', 'career']:
        for level in ['level1', 'level2', 'level3']:
            den = (player_df.at[ind, f'wins_{time}_{level}_0'] + player_df.at[ind, f'losses_{time}_{level}_0'])
            if den != 0:
                wp0 = player_df.at[ind, f'wins_{time}_{level}_0'] / den
            else:
                wp0 = 0
            wins_percent0[f'percent_{time}_{level}_0'].append(wp0)
            
            den = (player_df.at[ind, f'wins_{time}_{level}_1'] + player_df.at[ind, f'losses_{time}_{level}_1'])
            if den != 0:
                wp1 = player_df.at[ind, f'wins_{time}_{level}_1'] / den
            else:
                wp1 = 0
            wins_percent1[f'percent_{time}_{level}_1'].append(wp1)

    for time in ['semester', 'career']:
        for surface in ['Hard', 'Clay', 'Grass']:
            den = (player_df.at[ind, f'wins_{time}_{surface}_0'] + player_df.at[ind, f'losses_{time}_{surface}_0'])
            if den != 0:
                wp0 = player_df.at[ind, f'wins_{time}_{surface}_0'] / den
            else:
                wp0 = 0
            wins_percent0[f'percent_{time}_{surface}_0'].append(wp0)
            
            den = (player_df.at[ind, f'wins_{time}_{surface}_1'] + player_df.at[ind, f'losses_{time}_{surface}_1'])
            if den != 0:
                wp1 = player_df.at[ind, f'wins_{time}_{surface}_1'] / den
            else:
                wp1 = 0  
            wins_percent1[f'percent_{time}_{surface}_1'].append(wp1)

    for time in ['year', 'career']:
        den = (player_df.at[ind, f'hth_wins_{time}_0'] + player_df.at[ind, f'hth_wins_{time}_1'])
        if den != 0:
            wp0 = player_df.at[ind, f'hth_wins_{time}_0'] / den
        else:
            wp0 = 0
        wins_percent0[f'percent_hth_wins_{time}_0'].append(wp0)
        if den != 0:
            wp1 = player_df.at[ind, f'hth_wins_{time}_1'] / den
        else:
            wp1= 0
        wins_percent1[f'percent_hth_wins_{time}_1'].append(wp1)
        
    den = (player_df.at[ind, f'tournament_wins_0'] + player_df.at[ind, f'tournament_losses_0'])
    if den != 0:
        wp0 = player_df.at[ind, f'tournament_wins_0'] / den
    else:
        wp0 = 0
    wins_percent0[f'percent_tournament'].append(wp0)
    
    den = (player_df.at[ind, f'tournament_wins_1'] + player_df.at[ind, f'tournament_losses_1'])
    if den != 0:
        wp1 = player_df.at[ind, f'tournament_wins_1'] / den
    else:
        wp0 = 0
    wins_percent1[f'percent_tournament'].append(wp1)

HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




In [67]:
print(wins_percent0,wins_percent1)
perc_wins_p0_df = pd.DataFrame(wins_percent0)
perc_wins_p1_df = pd.DataFrame(wins_percent1)
perc_wins_df = pd.concat([perc_wins_p0_df, perc_wins_p1_df], axis=1)
perc_wins_df

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Unnamed: 0,percent_semester_level1_0,percent_semester_level2_0,percent_semester_level3_0,percent_year_level1_0,percent_year_level2_0,percent_year_level3_0,percent_career_level1_0,percent_career_level2_0,percent_career_level3_0,percent_semester_Hard_0,...,percent_career_level3_1,percent_semester_Hard_1,percent_semester_Clay_1,percent_semester_Grass_1,percent_career_Hard_1,percent_career_Clay_1,percent_career_Grass_1,percent_hth_wins_year_1,percent_hth_wins_career_1,percent_tournament
0,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000
1,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000
2,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000
3,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000
4,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78549,0.000000,0.400000,0.777778,0.00,0.384615,0.656250,0.200000,0.410256,0.474074,0.636364,...,0.523364,0.466667,0.000000,0.666667,0.532710,0.333333,0.451613,0.0,0.50,0.666667
78550,0.000000,0.000000,0.600000,0.00,0.000000,0.565217,0.612903,0.785276,0.807531,0.526316,...,0.466667,0.500000,0.555556,0.000000,0.454545,0.428571,0.000000,0.0,0.00,1.000000
78551,0.000000,0.636364,0.750000,0.20,0.555556,0.709677,0.153846,0.577778,0.630435,0.647059,...,0.612903,0.400000,0.666667,0.700000,0.561644,0.527778,0.450000,0.0,0.25,0.833333
78552,0.000000,0.600000,0.521739,0.00,0.500000,0.457143,0.000000,0.357143,0.466667,0.526316,...,0.474074,0.652174,0.000000,0.666667,0.465649,0.363636,0.500000,0.0,1.00,1.000000


# Player ELO feature
calculate elo of player A as follows $R_A = 1500$ i 
$$R_A' = R_A + K(S_A - E_A)$$
$$E_A = \frac{1}{1 + 10^{(R_B - R_A)/400}}$$

In [68]:
from collections import defaultdict
def d_elo():
    return 1500
elo = defaultdict(d_elo)
elo_feature_0 = {'elo':[]}
elo_feature_1 = {'elo':[]}

In [69]:
def calculate_elo(player1_id, player2_id, score):
    K = 24    
    expected_score = 1 / (1 + 10 **((elo[player2_id] - elo[player1_id])/ 400))
    return elo[player1_id] + K * (score - expected_score)

In [70]:
ind_boost = 0
for ind in tqdm(df_tennis.index): 
    ind_boost,winner,loser = check_target(ind, ind_boost, elo_feature_0, elo_feature_1)
    winner_id = df_tennis.at[ind, 'winner_id']
    loser_id  = df_tennis.at[ind, 'loser_id']
    #add actual elo
    winner[f'elo'].append(elo[winner_id])  
    loser[f'elo'].append(elo[loser_id])
    
    #update elo
    winner_elo = calculate_elo(winner_id, loser_id, 1)
    loser_elo = calculate_elo(loser_id, winner_id, 0)
    elo[winner_id] = winner_elo
    elo[loser_id] = loser_elo

HBox(children=(FloatProgress(value=0.0, max=78554.0), HTML(value='')))




In [71]:
# create dataframe
keys = elo_feature_0.keys()
e0_d_f = dict()
for feature in list(keys):  
    e0_d_f[f'{feature}_0'] = elo_feature_0[f'{feature}']
        
keys = elo_feature_1.keys()
e1_d_f = dict()
for feature in list(keys):
    e1_d_f[f'{feature}_1'] = elo_feature_1[f'{feature}']
    
df_elo_0 = pd.DataFrame(e0_d_f)
df_elo_1 = pd.DataFrame(e1_d_f)

In [72]:
elo_df = pd.concat([df_elo_0, df_elo_1], axis = 1)
elo_df

Unnamed: 0,elo_0,elo_1
0,1500.000000,1500.000000
1,1500.000000,1500.000000
2,1500.000000,1500.000000
3,1500.000000,1500.000000
4,1500.000000,1500.000000
...,...,...
78549,1710.301655,1668.744358
78550,1755.131217,1616.662623
78551,1751.181927,1693.073958
78552,1633.207059,1720.873123


# Macth features

In [73]:
match_features = dict()

## surface: 0, 0.5, 1.0
$$\{surface\}$$
$$\{best\_of\}$$
$$\{match\_rank\}$$
$$\{year\}$$
$$\{sin\_day\}$$
$$\{cos\_day\}$$

In [74]:
surfaces = {'Hard':0, 'Clay':0.5, 'Grass':1.0, "Carpet":1.0}

In [75]:
def round_value(tournament_round): # value of round is points for win in round / points for win in tournament
    tmp_dict = {'F':1.0,'SF':0.6,'QF': 0.36, 'R16':0.18, 'R32': 0.09, 'R64': 0.0045, 'R128': 0.00225, 'RR': 0.00125, 'ER': 0.00125}
    return tmp_dict[tournament_round]

def norm_importance(draw_size,tournament_round) : # tournament is correalted with draw size
    MAX_LOG_DRAW_SIZE = 7
    log_draw_size = np.ceil(np.log2(draw_size))
    result = 1 / 2**(MAX_LOG_DRAW_SIZE - log_draw_size) * round_value(tournament_round)
    return result  

In [76]:
def year(t_date):
    return t_date // 10000
def sin_day(t_date):
    year = t_date // 10000
    month = (t_date // 100) % 100
    day = t_date % 100
    d = date(year,month,day)
    day_num = d.timetuple().tm_yday
    return np.sin(day_num * 2 * np.pi / 365)
def cos_day(t_date):
    year = t_date // 10000
    month = (t_date // 100) % 100
    day = t_date % 100
    d = date(year,month,day)
    day_num = d.timetuple().tm_yday
    return np.cos(day_num * 2 * np.pi / 365)

In [77]:
#adding match feature
for feature in ['surface', 'best_of', 'match_rank', 'year', 'sin_day', 'cos_day']:
    match_features[feature] = []

for ind in df_tennis.index:
    for feature in ['surface', 'best_of', 'match_rank', 'date']:#,
        if feature == 'surface':
            match_features[feature].append(surfaces[df_tennis.at[ind, feature]])
        if feature == 'best_of':
            match_features[feature].append(df_tennis.at[ind, feature])
        if feature == 'match_rank':
            match_features[feature].append(norm_importance(df_tennis.at[ind, 'draw_size'], df_tennis.at[ind, 'round']))
        if feature == 'date':
            match_features['year'].append(year(df_tennis.at[ind,'tourney_date']))
            match_features['sin_day'].append(sin_day(df_tennis.at[ind,'tourney_date']))
            match_features['cos_day'].append(cos_day(df_tennis.at[ind,'tourney_date']))

In [78]:
match_df = pd.DataFrame(match_features)
match_df

Unnamed: 0,surface,best_of,match_rank,year,sin_day,cos_day
0,0.0,3,0.0225,1991,0.120208,0.992749
1,0.0,3,0.0225,1991,0.120208,0.992749
2,0.0,3,0.0225,1991,0.120208,0.992749
3,0.0,3,0.0225,1991,0.120208,0.992749
4,0.0,3,0.0225,1991,0.120208,0.992749
...,...,...,...,...,...,...
78549,0.0,3,0.0900,2021,-0.790946,0.611886
78550,0.0,3,0.0900,2021,-0.790946,0.611886
78551,0.0,3,0.1500,2021,-0.790946,0.611886
78552,0.0,3,0.1500,2021,-0.790946,0.611886


In [79]:
final_df = pd.concat([player_df, elo_df, perc_wins_df, match_df, Y], axis=1)
final_df = final_df.loc[(final_df['match_counter_0'] >= 20) & (final_df['match_counter_1'] >= 20)  ]
final_df

Unnamed: 0,wins_game_semester_level1_0,wins_game_semester_level2_0,wins_game_semester_level3_0,wins_game_year_level1_0,wins_game_year_level2_0,wins_game_year_level3_0,wins_game_career_level1_0,wins_game_career_level2_0,wins_game_career_level3_0,losses_game_semester_level1_0,...,percent_hth_wins_year_1,percent_hth_wins_career_1,percent_tournament,surface,best_of,match_rank,year,sin_day,cos_day,target
2157,46,73,108,46,91,156,46,91,156,8,...,0.0,0.00,1.000000,0.0,3,0.1500,1991,-0.741222,-0.671260,0
2232,0,30,177,0,30,177,0,30,177,6,...,0.0,0.00,1.000000,0.0,5,0.0045,1991,-0.816538,-0.577292,1
2293,0,27,62,0,27,81,0,27,81,7,...,0.0,0.00,0.000000,0.5,3,0.0225,1991,-0.930724,-0.365723,1
2367,0,46,189,0,46,189,0,46,189,6,...,0.5,0.50,1.000000,0.5,3,0.2500,1991,-0.930724,-0.365723,1
2418,0,31,132,0,31,144,0,31,144,16,...,0.0,0.00,0.000000,0.5,3,0.0225,1991,-0.991114,-0.133015,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78549,0,49,171,0,63,242,12,214,757,9,...,0.0,0.50,0.666667,0.0,3,0.0900,2021,-0.790946,0.611886,0
78550,0,0,140,0,0,148,836,1845,2709,0,...,0.0,0.00,1.000000,0.0,3,0.0900,2021,-0.790946,0.611886,1
78551,0,96,176,12,137,270,18,298,663,32,...,0.0,0.25,0.833333,0.0,3,0.1500,2021,-0.790946,0.611886,1
78552,0,43,140,0,43,180,0,67,410,0,...,0.0,1.00,1.000000,0.0,3,0.1500,2021,-0.790946,0.611886,0


In [80]:
final_df.to_csv('atp.csv', index=False)