In [1]:
library(readr)

: package 'readr' was built under R version 3.1.3

In [2]:
#read the mens results from the files into data frames
aus_men=read_csv('AusOpen-men-2013.csv')
fr_men=read_csv('FrenchOpen-men-2013.csv')
wimb_men=read_csv('Wimbledon-men-2013.csv')
us_men=read_csv('USOpen-men-2013.csv')

In [3]:
#i'll be merging them all into a single dataframe later, 
#so adding a new column GS to indicate which grand slam it is.
aus_men$GS=rep(1,nrow(aus_men))
fr_men$GS=rep(2,nrow(fr_men))
wimb_men$GS=rep(3,nrow(wimb_men))
us_men$GS=rep(4,nrow(us_men))

In [4]:
#similarly read the women's results, & add the new GS column
aus_women=read_csv('AusOpen-women-2013.csv')
fr_women=read_csv('FrenchOpen-women-2013.csv')
wimb_women=read_csv('Wimbledon-women-2013.csv')
us_women=read_csv('USOpen-women-2013.csv')

aus_women$GS=rep(1,nrow(aus_women))
fr_women$GS=rep(2,nrow(fr_women))
wimb_women$GS=rep(3,nrow(wimb_women))
us_women$GS=rep(4,nrow(us_women))

In [5]:
##DATA CLEANUP

#column names are not uniform across the 4 files. 
#Renaming columns FNL1 & FNL2 to FNL.1 & FNL.2 for Aussie & US Open
colnames(aus_men)[colnames(aus_men) == c("FNL1","FNL2")] <- c("FNL.1","FNL.2")
colnames(us_men)[colnames(us_men) == c("FNL1","FNL2")] <- c("FNL.1","FNL.2")

colnames(aus_women)[colnames(aus_women) == c("FNL1","FNL2")] <- c("FNL.1","FNL.2")
colnames(us_women)[colnames(us_women) == c("FNL1","FNL2")] <- c("FNL.1","FNL.2")

#US Open women's file has spaces in between, & ROUND is all capital. Need to fix this 
colnames(us_women)[colnames(us_women) == c("Player 1","Player 2","ROUND")] <- c("Player1","Player2","Round")

#renaming column 38 from ST2.1 to ST1.2 in Women's Wimbledon & US Open files.
colnames(us_women)[38]="ST1.2"
colnames(wimb_women)[38]="ST1.2"

#merge all mens results into a new data frame men_combined
#similarly women's results into women_combined
men_combined=rbind(aus_men,fr_men,wimb_men,us_men)
women_combined=rbind(aus_women,fr_women,wimb_women,us_women)

In colnames(us_women) == c("Player 1", "Player 2", "ROUND"): longer object length is not a multiple of shorter object length

In [6]:
#Break Point chances should be >= Break Points won.
#Net Points attempted should be >= Net Points won.
#This isn't the case for men's Aussie, French & US open.
#& Women's Aussie & French Open.
#Must've been an honest mistake by the data collector.
#Gonna fix it by swapping the corresponding columns.
men_combined[men_combined$GS %in% c(1,2,4), 
             c("BPC.1", "BPW.1","NPA.1","NPW.1","BPC.2","BPW.2","NPA.2","NPW.2")] <- 
  men_combined[men_combined$GS %in% c(1,2,4) , 
               c("BPW.1", "BPC.1","NPW.1","NPA.1","BPW.2","BPC.2","NPW.2","NPA.2")]

women_combined[women_combined$GS %in% c(1,2), 
               c("BPC.1", "BPW.1","NPA.1","NPW.1","BPC.2","BPW.2","NPA.2","NPW.2")] <- 
  women_combined[women_combined$GS %in% c(1,2) , 
                 c("BPW.1", "BPC.1","NPW.1","NPA.1","BPW.2","BPC.2","NPW.2","NPA.2")]


In [7]:
#Non uniform player-name formats across the files.
#Wimbledon uses "N.Djokovic" format, whereas all others use "Novak Djokovic" format.
#For uniformity & simplicity, I shall convert all to only second name - i.e "Djokovic" format.

for (i in 1:nrow(men_combined))
{
  a=unlist(strsplit(men_combined$Player1[i],"[ ]"))     #split Player1 based on space
  if(length(a)==1)                                      
    a=unlist(strsplit(men_combined$Player1[i],"[.]"))   #split based on ".", if previous split had no effect
  
  if (a[length(a)] == "Mayer")                          #Take special care for surname "Mayer". Use first name (Leandro/Florian) to differentiate
  {
    if (a[length(a)-1]=="L")
      men_combined$Player1[i]="Leonardo"
    else if(a[length(a)-1]=="F")
      men_combined$Player1[i]="Florian"
    else
      men_combined$Player1[i]=a[length(a)-1]
  }
  else if (a[length(a)] %in% c("Jr","Jr."))             #Special care for Alex Bogomolov Jr.
    men_combined$Player1[i]="Bogomolov"
  else
    men_combined$Player1[i]=a[length(a)]
  
  b=unlist(strsplit(men_combined$Player2[i],"[ ]"))     #repeat similarly for Player2
  if(length(b)==1)
    b=unlist(strsplit(men_combined$Player2[i],"[.]"))
  
  if (b[length(b)] == "Mayer")
  {
    if (b[length(b)-1]=="L")
      men_combined$Player2[i]="Leonardo"
    else if(b[length(b)-1]=="F")
      men_combined$Player2[i]="Florian"
    else
      men_combined$Player2[i]=b[length(b)-1]
  }
  else if (b[length(b)] %in% c("Jr","Jr."))
    men_combined$Player2[i]="Bogomolov"
  else
    men_combined$Player2[i]=b[length(b)]
  
}

In [8]:
for (i in 1:nrow(women_combined))                         #Similar procedure for the Women.
{
  a=unlist(strsplit(women_combined$Player1[i],"[ ]"))
  if(length(a)==1)
    a=unlist(strsplit(women_combined$Player1[i],"[.]"))
  
  if (a[length(a)] == "Williams")                         #Special care for Williams - (Serena/Venus)
  {
    if (a[length(a)-1]=="S")
      women_combined$Player1[i]="Serena"
    else if(a[length(a)-1]=="V")
      women_combined$Player1[i]="Venus"
    else
      women_combined$Player1[i]=a[length(a)-1]
  }
  else if (a[length(a)] == "Pliskova")                   #Special care for Pliskova - (Kristina/Karolina)
  {
    if (a[length(a)-1]=="Kr")
      women_combined$Player1[i]="Kristina"
    else if(a[length(a)-1] %in% c("K","Ka"))
      women_combined$Player1[i]="Karolina"
    else
      women_combined$Player1[i]=a[length(a)-1]
  } 
  else
    women_combined$Player1[i]=a[length(a)]
  
  b=unlist(strsplit(women_combined$Player2[i],"[ ]"))
  if(length(b)==1)
    b=unlist(strsplit(women_combined$Player2[i],"[.]"))
  
  if (b[length(b)] == "Williams")
  {
    if (b[length(b)-1]=="S")
      women_combined$Player2[i]="Serena"
    else if(b[length(b)-1]=="V")
      women_combined$Player2[i]="Venus"
    else
      women_combined$Player2[i]=b[length(b)-1]
  }
  else if (b[length(b)] == "Pliskova")
  {
    if (b[length(b)-1]=="Kr")
      women_combined$Player2[i]="Kristina"
    else if(b[length(b)-1] %in% c("K","Ka"))
      women_combined$Player2[i]="Karolina"
    else
      women_combined$Player2[i]=b[length(b)-1]
  } 
  else
    women_combined$Player2[i]=b[length(b)]
  
}

In [9]:
#save the cleaned data into files, for ease in future
write_csv(men_combined,"men_combined.csv")
write_csv(women_combined,"women_combined.csv")

In [10]:
#print the final match of every grandslam
aus_men[aus_men$Round==7,c(1:4)]
fr_men[fr_men$Round==7,c(1:4)]
wimb_men[wimb_men$Round==7,c(1:4)]
us_men[us_men$Round==7,c(1:4)]

aus_women[aus_women$Round==7,c(1:4)]
fr_women[fr_women$Round==7,c(1:4)]
wimb_women[wimb_women$Round==7,c(1:4)]
us_women[us_women$Round==7,c(1:4)]
#Woah!!! Wawrinka beating Nadal & Li Na beating Cibulkova?!?
#This is actually Aussie Open 2014 data instead of 2013.
#everything else seems right.

Unnamed: 0,Player1,Player2,Round,Result
126,Rafael Nadal,Stanislas Wawrinka,7,0


Unnamed: 0,Player1,Player2,Round,Result
125,Rafael Nadal,David Ferrer,7,1


Unnamed: 0,Player1,Player2,Round,Result
114,N.Djokovic,A.Murray,7,0


Unnamed: 0,Player1,Player2,Round,Result
126,Novak Djokovic,Rafael Nadal,7,0


Unnamed: 0,Player1,Player2,Round,Result
127,Na Li,Dominika Cibulkova,7,1


Unnamed: 0,Player1,Player2,Round,Result
127,Serena Williams,Maria Sharapova,7,1


Unnamed: 0,Player1,Player2,Round,Result
122,S.Lisicki,M.Bartoli,7,0


Unnamed: 0,Player1,Player2,Round,Result
1,S Williams,V Azarenka,7,1


In [11]:
##HYPOTHESES & ANALYSIS

## Hypothesis 1
## Players who win an intense 5 set match, have a high probability of losing in their next round match.
## (fatigue from the previous match is a cause for defeat)
## Method - 
## Find all 5 set matches. Set the won_next counter to 0 
## Find the winner of each of these matches.
## Find the outcome of this player's next match, & increment the counter if he's won it.
## Finally output the ratio.
## if ratio is close to 0.5, our theory is false.
## if it is much less than 0.5, our theory holds true.


In [12]:
five_set=men_combined[is.na(men_combined$ST5.1)==F,]      #the ST5 column for player1 will be non-empty only in case of 5 set matches.
won_next=0                                                #counter

for (i in 1:nrow(five_set))
{
  #find winning player's name
  winner=ifelse (five_set$Result[i],five_set$Player1[i],five_set$Player2[i]) 
  #find the row corresponding to his next match
  next_match=men_combined[(men_combined$Player1==winner | men_combined$Player2==winner) & 
                            men_combined$GS==five_set$GS[i] & 
                            men_combined$Round==(five_set$Round[i]+1),]
  
  #if this match exists, then find its winner
  if(nrow(next_match)==1)
  {
    winner2=ifelse (next_match$Result,next_match$Player1,next_match$Player2)
    if (winner==winner2)
      won_next=won_next+1
  }
  #if the match doesn't exist, it either means--
  #1)the player forfeited his next round match. (treat this as a loss)
  #or 2)his opponent forfeited the match. (treat this as a win)
  else
  {
    next_match=men_combined[(men_combined$Player1==winner | men_combined$Player2==winner) & 
                              men_combined$GS==five_set$GS[i] & 
                              men_combined$Round==(five_set$Round[i]+2),]
    if (nrow(next_match)==1)
      won_next=won_next+1
  }
}

In [13]:
#output the results
cat ("won_next=")
won_next
cat ("ratio=")
won_next/nrow(five_set)
## The winner of a 5 set match wins his next match only 33.7% of the time.
## Our theory holds true.

won_next=

ratio=

In [14]:
## Hypothesis 2
## After the end of the 4 grand slams, the aggregate of the 'Breaks in Favour' for each player
## should roughly approximate to the player's rank.
## Definition of 'Breaks in Favour' (BIF) :-
## if a match of A vs B has 10 breaks of serve, with A breaking B 7 times & B breaking A 3 times,
## then BIF for A is 7/10, & for B is 3/10.
## Basically, a player with high BIF over the course of 4 grand slams should have a high rank
## & player with low BIF should have a low rank.

In [16]:
all_players_men=unique(c(men_combined$Player1,men_combined$Player2))         #list of players (men)
all_players_women=unique(c(women_combined$Player1,women_combined$Player2))   #list of players (women)

print (all_players_men)
print ("______________________________________________________")
print (all_players_women)

  [1] "Lacko"          "Leonardo"       "Baghdatis"      "Tursunov"      
  [5] "Monaco"         "Giraldo"        "Sela"           "Fognini"       
  [9] "Guez"           "Davydenko"      "Busta"          "Robredo"       
 [13] "Groth"          "Mahut"          "Falla"          "Wawrinka"      
 [17] "Gonzalez"       "Johnson"        "Berankis"       "Chardy"        
 [21] "Thompson"       "Ramos"          "Florian"        "Youzhny"       
 [25] "Garcia-Lopez"   "Berlocq"        "Thiem"          "Anderson"      
 [29] "Karlovic"       "Hajek"          "Wu"             "Berdych"       
 [33] "Duckworth"      "Stepanek"       "Gabashvili"     "Verdasco"      
 [37] "Brands"         "Cilic"          "Bellucci"       "Tsonga"        
 [41] "Klizan"         "Rola"           "Przysiezny"     "Robert"        
 [45] "Devvarman"      "Berrer"         "Millot"         "Murray"        
 [49] "Williams"       "Agut"           "Kyrgios"        "Paire"         
 [53] "Klahn"          "Wang"         

In [17]:
BIFfunction<-function(players,matches)
{
  df=data.frame(name=NA,BIF=NA)
  for (name in players)
  {
    lhs=matches[matches$Player1==name,]                       #subset of matches where our desired player was Player1
    bif_lhs=sum(lhs$BPW.1/(lhs$BPW.1+lhs$BPW.2))
    rhs=matches[matches$Player2==name,]                       #subset of matches where he/she was Player2
    bif_rhs=sum(rhs$BPW.2/(rhs$BPW.1+rhs$BPW.2))
    BIF=(bif_lhs+bif_rhs)/(nrow(lhs)+nrow(rhs))
    df=rbind(df,data.frame(name,BIF)) 
  }
  df=df[-1,]                                                  #remove dummy first row
  df[is.na(df)]<-0                                            #if there's any missing data or 0/0 NaNs created. replace them with 0
  df=df[order(df$BIF,decreasing = T),]                        #sort the players in decreasing order of BIF
  return (df)
}


In [18]:
##compute the BIF for all men & women, by applying the BIFfunction
men_bif=BIFfunction(all_players_men,men_combined)
women_bif=BIFfunction(all_players_women,women_combined)

In [20]:
##Print the output BIF dataframes
row.names(men_bif)<-1:length(all_players_men)
men_bif
## I wish I could find a source from which I could download the actual rankings after Aussie Open 2014,
## to do a correlation test between actual ranking & BIF ranking. 
## However by visual inspection, I felt that barring a few outliers, most good players are at the top of the list,
## & poor players at the bottom.
## Interestingly, Ferrer is 15th despite being runner up at French Open; Gasquet is 24th despite being semi-finalist at US Open.

Unnamed: 0,name,BIF
1,Nadal,0.78826
2,Federer,0.7543417
3,Potro,0.7530159
4,Djokovic,0.7486705
5,Murray,0.7363764
6,Tsonga,0.7200397
7,Wawrinka,0.6952601
8,Berdych,0.6920139
9,Cilic,0.689064
10,Rola,0.6875


In [21]:
row.names(women_bif)<-1:length(all_players_women)
women_bif
## Unfortunately, there are way too many outliers in the women's game, showing just how unpredictable it is.
## notable observation - Serena's lead on the top (not surprising)
## some causes for outliers at the top (Cetkovska/Rogers/Birnerova in women's; Rola/Berrer in men's):-
## these players played only 1 GS, won their 1st round match convincingly, & either lost narrowly or withdrew from the 2nd round match.


Unnamed: 0,name,BIF
1,Serena,0.8504844
2,Cetkovska,0.7595238
3,Rogers,0.75
4,Birnerova,0.75
5,Li,0.7081999
6,Safarova,0.7018519
7,Dellacqua,0.6785714
8,Azarenka,0.677884
9,Cibulkova,0.6774291
10,Stosur,0.6685354


In [22]:
## Hypothesis 3
## To build a ranking formula for all players based on their performance in the 1st 3 grand slams (FO13,Wimb13 & USO13) 
## & using this ranking to predict the outcome of every match in AO14. (predicted winner is the player ranked higher)
## compare our predictions with the actual results of AO14, & see if we're doing better than random guess benchmark.
##
## Proposed formula:
## total= FSP + ACE/2 + BPW/2 + BPS/2 + BIF + WINP + NP/2 + MPBonus
## FSP = First serve percentage 
## ACE = ACE percentage, i.e Aces/ (Aces+ double faults) * 100
## BPW = Break points won percentage. i.e BPwon/BPchances * 100
## BPS = Break points saved percentage. i.e, opponent's (BPChances-BPwon)/Opponent's BPchances * 100
## BIF = Breaks in Favour (as described previously) * 100
## WINP = Winners percentage. i.e, Winners/(Winners+ Unforced errors)
## NP = Net points percentage. i.e, Net points won/ net points attempted.
## MPBonus = Bonus awarded depending on the number of matches played. 
## (More matches played -> advanced further in the tournament -> Offering some credit for their consistency)
## (This reduces the impact of outliers who've played only 1 or 2 matches, & can have high values for all stats.)
## In the formula, ACE, BPW, BPS & NP are weighted to be only half as important as FSP, WINP & BIF
## This is achieved by intuition & trial-&-error.
## It reduces the benifit of big servers, & those who approach the net more often.
## Also, BIF captures some of the info from BPS & BPW already, hence lower weightage for those 2.


In [23]:
# First, replace all missing values with 0
men_combined[is.na(men_combined)]<-0
women_combined[is.na(women_combined)]<-0
women_combined[,25]<-as.integer(women_combined[,25])    #This column was of character type for some reason.


In [24]:
#get the train & test datasets for both men & women
#train :- Fr,wimb & Us open (i.e GS 2,3,4)
#test :- Aus open (i.e GS 1)
men_train=men_combined[men_combined$GS>1,]
men_test=men_combined[men_combined$GS==1,]
women_train=women_combined[women_combined$GS>1,]
women_test=women_combined[women_combined$GS==1,]

In [25]:
#find the list of players playing in the Aus Open. (We need to score & rank only these players by looking at their past stats)
men_playing=unique(c(men_test$Player1,men_test$Player2))
women_playing=unique(c(women_test$Player1,women_test$Player2))


In [26]:
#define scoring function
ScoringFunction<-function(players,matches)
{
  stats_df=data.frame(name=NA,FSP=0,ACE=0,BPW=0,BPS=0,BIF=0,WINP=0,NP=0,MPBonus=0,total=0)
  for(name in players)
  {
    lhs=matches[matches$Player1==name,]
    rhs=matches[matches$Player2==name,]
    denominator=nrow(lhs)+nrow(rhs)
    FSP=0
    BIF=0
    WINP=0
    ACE=0
    BPW=0
    BPS=0
    NP=0
    MPBonus=0
    total=0
    if(denominator>0)                     #data for this player exists in the training set 
    {                                     #if data doesn't exist in training set (wildcard player), initialize him with default values (0)
      FSP=(sum(lhs$FSP.1)+sum(rhs$FSP.2))/denominator
      BIF=(sum(lhs$BPW.1/(lhs$BPW.1+lhs$BPW.2))+sum(rhs$BPW.2/(rhs$BPW.1+rhs$BPW.2)))/denominator * 100
      MPBonus=denominator * 10                         #a score of 10 is awarded for every match he's played
      
      if (nrow(lhs)>0)                                 #enters this block only if he's been Player1 at least once
      {
        for(i in 1:nrow(lhs))
        {
          if((lhs$ACE.1[i]+ lhs$DBF.1[i])!=0 )         #Checking for non-zero denominator, so that we avoid NaNs
            ACE= ACE + (lhs$ACE.1[i]/(lhs$ACE.1[i]+lhs$DBF.1[i]))
          
          if(lhs$BPC.1[i]!=0)
            BPW=BPW+ (lhs$BPW.1[i]/lhs$BPC.1[i])
          
          if(lhs$BPC.2[i]!=0)
            BPS=BPS+ ((lhs$BPC.2[i]-lhs$BPW.2[i])/lhs$BPC.2[i])
          else                                        #in case of 0 denominator, It means opponent never had break point chance.
            BPS=BPS+ 1                                #Hence break points saved = 100%
          
          if(lhs$NPA.1[i]!=0)
            NP=NP+ (lhs$NPW.1[i]/lhs$NPA.1[i])
          
          if ((lhs$WNR.1[i]+lhs$UFE.1[i])>0)
            WINP=WINP+(lhs$WNR.1[i]/(lhs$WNR.1[i]+lhs$UFE.1[i]))
          else                                        #if there is no data about winners & unforced errors
            WINP=WINP+0.5                             #assume ratio to be 50%, i.e, equal winners & unforced errors
          
        }
      }
      if(nrow(rhs)>0)                                 #enters this lock only if he's been Player2 at least once. Same calculations as before.
      {
        for(i in 1:nrow(rhs))
        {
          if((rhs$ACE.2[i]+ rhs$DBF.2[i])!=0)
            ACE= ACE + (rhs$ACE.2[i]/(rhs$ACE.2[i]+rhs$DBF.2[i]))
           
          if(rhs$BPC.2[i]!=0)
            BPW=BPW+ (rhs$BPW.2[i]/rhs$BPC.2[i])
          
          if(rhs$BPC.1[i]!=0)
            BPS=BPS+ ((rhs$BPC.1[i]-rhs$BPW.1[i])/rhs$BPC.1[i])
          else
            BPS=BPS+ 1
          
          if(rhs$NPA.2[i]!=0)
            NP=NP+ (rhs$NPW.2[i]/rhs$NPA.2[i])
          
          if ((rhs$WNR.2[i]+rhs$UFE.2[i])>0)
            WINP=WINP+(rhs$WNR.2[i]/(rhs$WNR.2[i]+rhs$UFE.2[i]))
          else
            WINP=WINP+0.5
        }
      }
      ACE=ACE/denominator * 50
      BPW=BPW/denominator * 50
      BPS=BPS/denominator * 50
      NP=NP/denominator * 50                         
      WINP=WINP/denominator * 100
      total=FSP+ACE+BPW+BPS+BIF+WINP+MPBonus+NP
    }
    row=data.frame(cbind(name,FSP,ACE,BPW,BPS,BIF,WINP,NP,MPBonus,total))
    stats_df=rbind(stats_df,row)
    
  }
  
  stats_df=stats_df[-1,]                  
  return (stats_df)

}

In [27]:
#obtain stats data frames for training sets by calling the scoring function
men_stats=ScoringFunction(men_playing,men_train)
women_stats=ScoringFunction(women_playing,women_train)

In [29]:
#sort in descending order of total score, & print to console
men_sorted=men_stats[order(men_stats$total, decreasing=T),]
women_sorted=women_stats[order(women_stats$total, decreasing=T),]

row.names(men_sorted)<-1:nrow(men_sorted)
men_sorted[,c(1,10)]                    #print only Name & Total score column

row.names(women_sorted)<-1:nrow(women_sorted)
women_sorted[,c(1,10)]

Unnamed: 0,name,total
1,Djokovic,510.950578834209
2,Nadal,496.699122065527
3,Ferrer,454.74317635964
4,Murray,452.119574119122
5,Federer,444.990854510701
6,Robredo,431.199862552224
7,Gasquet,428.152612344034
8,Potro,425.88366961053
9,Haas,420.182475874461
10,Wawrinka,418.831484161428


Unnamed: 0,name,total
1,Serena,519.574036472059
2,Radwanska,464.475191835264
3,Li,434.412805195146
4,Azarenka,430.946122982757
5,Lisicki,426.766832790172
6,Stephens,412.833897181763
7,Navarro,398.948363764715
8,Sharapova,390.805329935089
9,Stosur,388.243271924466
10,Kerber,385.72064937946


In [30]:
#Predict function to get 0/1 output for each match in test set, i.e Aus Open
#1-> win for Player1, 0-> Win for Player2
PredictFunction<-function(players,stats,test)
{
  l=length(players)                              #number of players playing
  pred_matrix=matrix(rep(0,l*l),nrow=l,ncol=l)   #l*l matrix, for each possible permutation
  for (i in 1:l)
    for(j in 1:l)
      pred_matrix[i,j]=ifelse(stats[i,10]>=stats[j,10],1,0)
                                                 #output is 1 if i has a greater score than j, 0 otherwise
  pred_vector=rep(0,nrow(test))
  for (i in 1:nrow(test))
  {                                              #look at test set for matches that actually occured
    a=match(test$Player1[i],players)             #find the indexes for the 2 players
    b=match(test$Player2[i],players)             
    pred_vector[i]=pred_matrix[a,b]              #retrieve the corresponding value from the prediction matrix
  }
  
  return (pred_vector)
}

In [31]:
#get the predicted vectors
pred_men=PredictFunction(men_playing,men_stats,men_test)
pred_women=PredictFunction(women_playing,women_stats,women_test)


In [32]:
#print the confusion matrix
table(men_test$Result,pred_men)

   pred_men
     0  1
  0 46 13
  1 20 47

In [33]:
table(women_test$Result,pred_women)

   pred_women
     0  1
  0 41 20
  1 15 51

In [34]:
##turns out we get 92 matches right for both men & women (73% accuracy)
##much better than random coin toss, or just predicting 1 always.
##Our ranking algorithm ain't so bad after all.