### This is a program to identify a winner / loser in a tournament in Women's Tennis for all Grand Slams

#### Import all the libraries

In [6]:
import pandas as pd
import logging
import sys
from importlib import reload
import matplotlib as plt
import datetime

In [7]:
logger = logging.getLogger()

logging.basicConfig(format=' %(levelname)s : %(message)s',
                     level=logging.INFO, stream=sys.stdout)
reload(logging)

<module 'logging' from '//anaconda3/lib/python3.7/logging/__init__.py'>

In [8]:
cols_of_interest = [ 'year', 'best_of', 'loser_age', 'winner_age', 
                    'draw_size', 'loser_id','loser_ioc', 'loser_name',
                    'loser_rank', 'winner_id', 'winner_name',
                    'winner_ioc','winner_rank','tourney_id', 
                    'tourney_name','tourney_date', 'score']

#### Read data from WTA ( Women's tennis) for last 20 years

In [9]:
#Matches
matches = pd.read_csv("../wta/matches.csv", low_memory=False)

In [10]:
#Get all the players
players = pd.read_csv("../wta/players.csv", sep=',', encoding='latin-1')
players.head(n=10)

Unnamed: 0,player_id,first_name,last_name,hand,birth_date,country_code
0,200001,Martina,Hingis,R,19800930.0,SUI
1,200002,Mirjana,Lucic,R,19820309.0,CRO
2,200003,Justine,Henin,R,19820601.0,BEL
3,200004,Kerry Anne,Guse,R,19721204.0,AUS
4,200005,Jolene,Watanabe Giltz,R,19680831.0,USA
5,200006,Karina,Habsudova,R,19730802.0,SVK
6,200007,Silvija,Talaja,R,19780114.0,CRO
7,200008,Alicia,Molik,R,19810127.0,AUS
8,200009,Tamarine,Tanasugarn,R,19770524.0,THA
9,200010,Rita,Grande,R,19750323.0,ITA


In [11]:
#Get all the qualifying matches
qualifying_matches = pd.read_csv("../wta/qualifying_matches.csv", low_memory=False)

In [12]:
#Get all the Rankings
rankings = pd.read_csv("../wta/rankings.csv")

#### Enrich the matches data

In [13]:
#convert the player ID to an integer
matches['loser_id'] = matches['loser_id'].apply( lambda x: int(x))
matches['winner_id'] = matches['winner_id'].apply( lambda x: int(x))

#here, we are changing all of the N/A loser and winner ages to 0
matches['loser_age'] = matches['loser_age'].fillna(0)
matches['winner_age'] = matches['winner_age'].fillna(0)

#here, we are converting the loser and winner ages to integers
matches['loser_age'] = matches['loser_age'].apply( lambda x: int(x))
matches['winner_age'] = matches['winner_age'].apply( lambda x: int(x))
matches['year'] = matches['tourney_date'].apply( lambda x: str(x)[:4])



# Features

1. number of matches the player has won against the other mention

2. number of matches the player has LOST against the other player

3. The number of matches the player has won in the past year from the date of the search

4. The surface of the court
    - Can figure out which surface the player is best at by finding the number of matches 
    the player has won on each surface in the past year from the date of the search 
5. The age of the player - (older players are prone to have deteriorating play level, health, and stamina, which means
    they are more likely to lose)



In [34]:
def findNextWinner ( matches, player1, player2  ):
    player1_wins =  matches [( matches['winner_name'] == player1) & ( matches['loser_name'] == player2 )] 
    
    curr_year = datetime.datetime.today()
    prev_year = datetime.datetime.today() - datetime.timedelta(days= 365)
    
    player1_win_last_year =  player1_wins [player1_wins['year'] == prev_year] 
    player2_wins =  matches [( matches['winner_name'] == player2) & ( matches['loser_name'] == player1 )] 
    player2_wins_last_year = player2_wins [player2_wins['year']==  prev_year ] 

    player1_win_count = len(matches [( matches['winner_name'] == player1) & ( matches['loser_name'] == player2 )])
    player1_win_last_year_count =   len( player1_wins [player1_wins['year'] == prev_year ] )
    
    player2_win_count = len(matches [( matches['winner_name'] == player2) & ( matches['loser_name'] == player1 )])
    player2_win_last_year_count =  len(player2_wins [player2_wins['year']== prev_year])

    logger.info(" %s has won %s matches || %s has won %s matches" % ( player1, player1_win_count, player2, player2_win_count) ) 
    
    if (player1_win_count == 0 & player2_win_count == 0):
        msg = "Sorry, %s has not played with %s. So, we don't have enough information to predict winner" % (player1, player2) 
        return player1, 0
    
    player1chance = int( 100* (player1_win_count)/(player1_win_count + player2_win_count))
    
    scores = matches [( matches['winner_name'] == player1) & ( matches['loser_name'] == player2 )]['score']
#     logger.info("%s against %s: Scores: \n %s" %( player1, player2, scores ))
    
    logger.info(" In last year, %s has won %s matches || %s has won %s matches" % ( player1, player1_win_last_year_count, player2, player2_win_last_year_count) ) 

    last_year_wins = player1_win_last_year_count + player2_win_last_year_count
    
    if ( last_year_wins !=0 ):
        player1chanceyear = int( ( (100 * player1_win_last_year_count) / (last_year_wins )) ) 
    else:
        player1chanceyear = 0
        logger.info("Sorry, the two players haven't played in past one year.")
        
        if player1chanceyear == 0:
            player1totalchance = player1chance
            
        else:
            player1totalchance = (0.6 * player1chance) + (0.4 * player1chanceyear)
    
    logger.info("Based on this information, %s has a %s percent chance of winning the next match" % (player1, player1totalchance))
    return player1, player1totalchance


In [15]:
def applyScoreWeights():
    logger.info("apply score weights")

In [25]:
def playerOpps(matches, player):
    
    player_win_matches =   matches[ ( matches['winner_name'] == player ) ]
    player_lost_matches =  matches[(matches['loser_name'] == player) ]
    l = player_win_matches[['loser_name']]
    w = player_lost_matches[['winner_name']]
    
    player_opps = list(l.loser_name.unique()) + list(w.winner_name.unique() )
    logger.info("%s has played a total of %d other players" %( player, len(player_opps)))
    return player_opps
    

In [29]:
def commonPlayer(list1, list2):
    list3 = [value for value in list1 if value in list2] 
    logger.info(" There are %s common players." %( len(list3)))
    return list3 
    

In [59]:
player1 = "Serena Williams"
player2 = "Venus Williams"

In [61]:
player1_chance = findNextWinner(matches, player1, player2)
logger.info("player1_chance >.")
if (player1_chance[1] == 0 ):
#     lets a common player
    player1_opps = playerOpps(matches, player1)
    player2_opps = playerOpps(matches, player2)
    common_players = commonPlayer(player1_opps, player2_opps)
    for common_player in common_players:
        logger.info("****************************************\n")
        logger.info("Common Player is: %s" % ( common_player))
        player = findNextWinner(matches, player1, common_player)
        data = findNextWinner(matches, player2, common_player)
else:
    

 INFO :  Serena Williams has won 17 matches || Venus Williams has won 8 matches
 INFO :  In last year, Serena Williams has won 0 matches || Venus Williams has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Serena Williams has a 68 percent chance of winning the next match


('Serena Williams', 68)

In [46]:
player1_opps = playerOpps(matches, player1)
player2_opps = playerOpps(matches, player2)
logger.info("Player 1  opponents: %s" % ( len(player1_opps)) ) 
common_players = commonPlayer(player1_opps, player2_opps)


 INFO : Simona Halep has played a total of 222 other players
 INFO : Martina Hingis has played a total of 166 other players
 INFO : Player 1  opponents: 222
 INFO :  There are 54 common players.


['Olga Savchuk',
 'Patty Schnyder',
 'Iveta Benesova',
 'Nuria Llagostera Vives',
 'Urszula Radwanska',
 'Alla Kudryavtseva',
 'Greta Arn',
 'Svetlana Kuznetsova',
 'Na Li',
 'Daniela Hantuchova']

In [64]:
player2 = "Martina Hingis"
player1 = "Serena Williams"
for common_player in common_players:
    logger.info("****************************************\n")
    logger.info("Common Player is: %s" % ( common_player))
    player = findNextWinner(matches, player1, common_player)
    data = findNextWinner(matches, player2, common_player)




 INFO : ****************************************

 INFO : Common Player is: Olga Savchuk
 INFO :  Serena Williams has won 0 matches || Olga Savchuk has won 0 matches
 INFO :  Martina Hingis has won 1 matches || Olga Savchuk has won 0 matches
 INFO :  In last year, Martina Hingis has won 0 matches || Olga Savchuk has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Martina Hingis has a 100 percent chance of winning the next match
 INFO : ****************************************

 INFO : Common Player is: Patty Schnyder
 INFO :  Serena Williams has won 6 matches || Patty Schnyder has won 4 matches
 INFO :  In last year, Serena Williams has won 0 matches || Patty Schnyder has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Serena Williams has a 60 percent chance of winning the next match
 INFO :  Martina Hingis has won 0 matches || Patty Schnyder has won 2 match

 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Serena Williams has a 71 percent chance of winning the next match
 INFO :  Martina Hingis has won 0 matches || Jelena Jankovic has won 2 matches
 INFO : ****************************************

 INFO : Common Player is: Klara Koukalova
 INFO :  Serena Williams has won 4 matches || Klara Koukalova has won 2 matches
 INFO :  In last year, Serena Williams has won 0 matches || Klara Koukalova has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Serena Williams has a 66 percent chance of winning the next match
 INFO :  Martina Hingis has won 1 matches || Klara Koukalova has won 0 matches
 INFO :  In last year, Martina Hingis has won 0 matches || Klara Koukalova has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Martina Hingis has a 100 percent chance of winning the

 INFO : Based on this information, Serena Williams has a 68 percent chance of winning the next match
 INFO :  Martina Hingis has won 2 matches || Venus Williams has won 5 matches
 INFO :  In last year, Martina Hingis has won 0 matches || Venus Williams has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Martina Hingis has a 28 percent chance of winning the next match
 INFO : ****************************************

 INFO : Common Player is: Kateryna Bondarenko
 INFO :  Serena Williams has won 3 matches || Kateryna Bondarenko has won 0 matches
 INFO :  In last year, Serena Williams has won 0 matches || Kateryna Bondarenko has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Serena Williams has a 100 percent chance of winning the next match
 INFO :  Martina Hingis has won 1 matches || Kateryna Bondarenko has won 0 matches
 INFO :  In last year, Martina Hingis 

 INFO : Common Player is: Mathilde Johansson
 INFO :  Serena Williams has won 1 matches || Mathilde Johansson has won 0 matches
 INFO :  In last year, Serena Williams has won 0 matches || Mathilde Johansson has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Serena Williams has a 100 percent chance of winning the next match
 INFO :  Martina Hingis has won 1 matches || Mathilde Johansson has won 0 matches
 INFO :  In last year, Martina Hingis has won 0 matches || Mathilde Johansson has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Martina Hingis has a 100 percent chance of winning the next match
 INFO : ****************************************

 INFO : Common Player is: Svetlana Kuznetsova
 INFO :  Serena Williams has won 10 matches || Svetlana Kuznetsova has won 3 matches
 INFO :  In last year, Serena Williams has won 0 matches || Svetlana Kuznetsova has w

 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Martina Hingis has a 66 percent chance of winning the next match
 INFO : ****************************************

 INFO : Common Player is: Ana Ivanovic
 INFO :  Serena Williams has won 9 matches || Ana Ivanovic has won 1 matches
 INFO :  In last year, Serena Williams has won 0 matches || Ana Ivanovic has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Serena Williams has a 90 percent chance of winning the next match
 INFO :  Martina Hingis has won 1 matches || Ana Ivanovic has won 1 matches
 INFO :  In last year, Martina Hingis has won 0 matches || Ana Ivanovic has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Martina Hingis has a 50 percent chance of winning the next match
 INFO : ****************************************

 INFO : Common Player is: Sara Err

In [35]:
player2 = "Martina Hingis"
player1 = "Serena Williams"
data = findNextWinner(matches, player1, player2)
logger.info("data >>> %s", data )


 INFO :  Serena Williams has won 4 matches || Martina Hingis has won 3 matches
 INFO :  In last year, Serena Williams has won 0 matches || Martina Hingis has won 0 matches
 INFO : Sorry, the two players haven't played in past one year.
 INFO : Based on this information, Serena Williams has a 57 percent chance of winning the next match
 INFO : data >>> ('Serena Williams', 57)


In [10]:
cols_of_interest = [ 'year', 'best_of', 'loser_age', 'winner_age', 
                    'draw_size',  'loser_name',
                    'loser_rank', 'winner_id', 'winner_name',
                    'winner_ioc','winner_rank','tourney_id', 
                    'tourney_name','tourney_date', 'score']

player1 = 'Serena Williams'
player2 = 'Maria Sharapova'
match_stats = matches[ ( (( matches['winner_name'] == player1) | 
           ( matches['loser_name'] == player1) ) &
          ( ( matches['loser_name'] == player2)  |
          ( matches['winner_name'] == player2) ))][cols_of_interest]


In [11]:
#One Player perspective
features =  ['date', '# of Matches Won', '# of Matches Lost', "year", 'Surface Type', 'age', 'Scores' ]

#
featuresToWeights = {
            "WinCounts" : 60,
            "LastYear" : 40,
            }




### Data visualization

We will visualize some of the data - eg, who has won most of the grand slams, who is second etc 

In [89]:
matches.head()

Unnamed: 0,best_of,draw_size,loser_age,loser_entry,loser_hand,loser_ht,loser_id,loser_ioc,loser_name,loser_rank,...,winner_hand,winner_ht,winner_id,winner_ioc,winner_name,winner_rank,winner_rank_points,winner_seed,year,Unnamed: 32
0,3,128,17,,R,,200002,CRO,Mirjana Lucic,49.0,...,R,170.0,200001,SUI,Martina Hingis,1.0,6003.0,1.0,2000,
1,3,128,27,Q,R,,200004,AUS,Kerry Anne Guse,133.0,...,R,167.0,200003,BEL,Justine Henin,63.0,510.0,,2000,
2,3,128,31,,R,,200005,USA,Jolene Watanabe Giltz,118.0,...,R,,200006,SVK,Karina Habsudova,53.0,574.0,,2000,
3,3,128,22,,R,,200007,CRO,Silvija Talaja,23.0,...,R,182.0,200008,AUS,Alicia Molik,116.0,245.0,,2000,
4,3,128,24,,R,,200010,ITA,Rita Grande,60.0,...,R,165.0,200009,THA,Tamarine Tanasugarn,72.0,439.0,,2000,


In [36]:
curr_year = datetime.datetime.today()
prev_year = datetime.datetime.today() - datetime.timedelta(days= 365)

prev_year.year


2019