In [17]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from scipy import stats
import matplotlib.pyplot as plt

import sklearn.preprocessing

from sklearn.linear_model import RidgeClassifier, LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
data = pd.read_csv("atp.csv")
Y = pd.DataFrame(data['target'])
X = data.drop(['target'], axis = 1)

In [3]:
Y_delete_nans = pd.DataFrame(data.dropna()['target'])

In [4]:
X

Unnamed: 0,age_0,ht_0,hand_0,rank_0,rank_points_0,wins_semester_level1_0,wins_semester_level2_0,wins_semester_level3_0,wins_year_level1_0,wins_year_level2_0,...,percent_insv_career,percent_svpt_semester.1,percent_svpt_year.1,percent_svpt_career.1,percent_rtpt_semester.1,percent_rtpt_year.1,percent_rtpt_career.1,percent_insv_semester.1,percent_insv_year.1,percent_insv_career.1
0,25.790554,190.0,1,4.0,2606.0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,31.099247,180.0,0,55.0,814.0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,20.377823,175.0,1,123.0,374.0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,18.384668,185.0,1,64.0,749.0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,23.852156,188.0,1,221.0,185.0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56697,31.463381,175.0,1,26.0,1917.0,0,1,10,2,5,...,,0.655679,0.644116,0.637888,0.359827,0.364207,0.353557,0.617700,0.605112,0.617767
56698,34.485969,190.0,1,143.0,476.0,0,0,14,0,0,...,0.607300,0.640039,0.637522,0.649474,0.378418,0.373239,0.416364,0.578665,0.579965,0.581257
56699,22.568104,185.0,0,18.0,2348.0,1,6,12,1,8,...,0.586860,0.673152,0.671088,0.658251,0.367237,0.369136,0.361787,0.629053,0.631745,0.637145
56700,23.800137,188.0,1,41.0,1422.0,0,5,16,0,6,...,0.617905,0.627477,0.623343,0.623300,0.378788,0.380130,0.370959,0.589640,0.605187,0.606492


## naive classifier using ranking points

In [5]:
def naive_classifier(column, data, Y):
    acc = 0
    for ind in data.index:
        r0 = data.at[ind, f'{column}_0']
        r1 = data.at[ind, f'{column}_1']
        if r0 > r1 and Y.at[ind, 'target'] == 0:
            acc += 1
        if r0 < r1 and Y.at[ind, 'target'] == 1:
            acc += 1
    return acc/len(data)

In [6]:
print("accuracy: ", naive_classifier('rank_points', X, Y))

accuracy:  0.652551938203238


## naive classifier using elo points

In [7]:
print("accuracy: ", naive_classifier('elo', X, Y))

accuracy:  0.6622341363620331


#### fairly good results, we will try to improve them

In [8]:
min_max_scaler = sklearn.preprocessing.MinMaxScaler()
standard_scaler = sklearn.preprocessing.StandardScaler().fit(data)
X_scaledmx = pd.DataFrame(min_max_scaler.fit_transform(X), columns = X.columns)
X_scaledstd = pd.DataFrame(standard_scaler.fit_transform(X), columns = X.columns)

In [9]:
print("accuracy min_max rank: ", naive_classifier('rank_points', X_scaledmx, Y))
print("accuracy standard rank: ", naive_classifier('rank_points', X_scaledstd, Y))
print("accuracy min_max elo: ", naive_classifier('elo', X_scaledmx, Y))
print("accuracy standard elo: ", naive_classifier('elo', X_scaledstd, Y))

accuracy min_max rank:  0.652551938203238
accuracy standard rank:  0.6531339282564989
accuracy min_max elo:  0.6627984903530739
accuracy standard elo:  0.6632746640330147


## dealing with nulls

In [10]:
# we replace Nans with random gaussian data
def generate_data_on_Nan(column, data):
    m = data[column].mean()
    std = data[column].std()
    #print(m, std)
    for ind in data.index:
        if np.isnan(data.at[ind, column]):
            data.at[ind, column] = np.random.normal(m, std)

In [11]:
# check where are Nans, and how many
for c in X.columns:
    if X[c].isna().sum() > 0:
        print(c, X[c].isna().sum())

ht_0 1503
rank_0 76
rank_points_0 76
minutes_played_last_match_0 701
ht_1 1572
rank_1 92
rank_points_1 92
minutes_played_last_match_1 709
percent_svpt_semester 3714
percent_svpt_year 6453
percent_svpt_career 23281
percent_rtpt_semester 3714
percent_rtpt_year 6453
percent_rtpt_career 23281
percent_insv_semester 3714
percent_insv_year 6453
percent_insv_career 23281
percent_svpt_semester.1 3605
percent_svpt_year.1 6457
percent_svpt_career.1 23308
percent_rtpt_semester.1 3605
percent_rtpt_year.1 6457
percent_rtpt_career.1 23308
percent_insv_semester.1 3605
percent_insv_year.1 6457
percent_insv_career.1 23308


In [12]:
for i,X_scaled in enumerate([X_scaledmx, X_scaledstd]):
    if i == 0:
        print("\nMIN MAX")
    else:
        print("\nSTANDARD")
    X_scaled_2 = X_scaled.copy()
    X_scaled_2 = X_scaled_2.dropna()

    X_scaled_1 = X_scaled.copy()
    # check where are Nans and replace them with gaussian 
    for c in X_scaled.columns:
        if X_scaled[c].isna().sum() > 0:
            #print(c, X_scaled[c].isna().sum())
            generate_data_on_Nan(c, X_scaled_1)
    print("accuracy generate rank: ", naive_classifier('rank_points', X_scaled_1, Y))
    print("accuracy delete rank: ", naive_classifier('rank_points', X_scaled_2, Y_delete_nans))
    print("accuracy generate elo: ", naive_classifier('elo', X_scaled_1, Y))
    print("accuracy delete elo: ", naive_classifier('elo', X_scaled_2, Y_delete_nans))


MIN MAX
accuracy generate rank:  0.6536982822475398
accuracy delete rank:  0.6281562026242877
accuracy generate elo:  0.6627984903530739
accuracy delete elo:  0.6330702075382927

STANDARD
accuracy generate rank:  0.6545095411096611
accuracy delete rank:  0.6287312457525224
accuracy generate elo:  0.6632746640330147
accuracy delete elo:  0.6330179308902714


## so far our best results are: (nulls does not affect elo score)
## elo: 0.6633 standard normalization
## rank: 0.65436 standard normalization, generating rank

# Results on ML classifiers

In [13]:
def get_test_train(X,Y):
    X_train = X.iloc[:-1000,:]
    X_train = X_train[3000:len(X_train)]
    Y_train = Y.iloc[:-1000,:]
    Y_train = Y_train[3000:len(Y_train)]
    X_test = X.iloc[-1000:,:]
    Y_test = Y.iloc[-1000:,:]
    return X_train,Y_train,X_test,Y_test
print(get_test_train(X,Y))

(           age_0   ht_0  hand_0  rank_0  rank_points_0  \
3000   27.835729  185.0       1    38.0          945.0   
3001   26.294319  180.0       1   100.0          410.0   
3002   28.878850  175.0       1    32.0         1040.0   
3003   27.444216  183.0       1   102.0          404.0   
3004   19.871321  180.0       1     7.0         2560.0   
...          ...    ...     ...     ...            ...   
55697  23.570157  180.0       1    71.0          986.0   
55698  31.331964  183.0       0    92.0          828.0   
55699  32.038330  183.0       1    43.0         1521.0   
55700  25.210130  196.0       1     9.0         4468.0   
55701  27.362081  183.0       1   131.0          607.0   

       wins_semester_level1_0  wins_semester_level2_0  wins_semester_level3_0  \
3000                        0                       2                       4   
3001                        0                       0                       1   
3002                        0                       3      

In [18]:
for i,X_scaled in enumerate([X_scaledmx, X_scaledstd]):
    if i == 0:
        print("\nMIN MAX")
    else:
        print("\nSTANDARD")
    X_scaled_2 = X_scaled.copy()
    X_scaled_2 = X_scaled_2.dropna()

    X_scaled_1 = X_scaled.copy()
    # check where are Nans and replace them with gaussian 
    for c in X_scaled.columns:
        if X_scaled[c].isna().sum() > 0:
            #print(c, X_scaled[c].isna().sum())
            generate_data_on_Nan(c, X_scaled_1)
    
    pca = PCA(n_components=80, svd_solver='full')
    X_scaled_2 = pd.DataFrame(pca.fit_transform(X_scaled_2))
    X_scaled_1 = pd.DataFrame(pca.fit_transform(X_scaled_1))
    X_train1,Y_train1,X_test1,Y_test1 = get_test_train(X_scaled_1, Y)
    X_train2,Y_train2,X_test2,Y_test2 = get_test_train(X_scaled_2, Y_delete_nans)
    
    
    clf = GradientBoostingClassifier().fit(X_train1,Y_train1)
    predictions = clf.predict(X_test1)
    acc = accuracy_score(Y_test1, predictions)
    print("accuracy generate Gradient Boosting Classifier: ", acc)
    
    
    clf = GradientBoostingClassifier().fit(X_train2,Y_train2)
    predictions = clf.predict(X_test2)
    acc = accuracy_score(Y_test2, predictions)
    print("accuracy delete Gradient Boosting Classifier: ", acc)
    
    clf = RandomForestClassifier().fit(X_train1,Y_train1)
    predictions = clf.predict(X_test1)
    acc = accuracy_score(Y_test1, predictions)
    print("accuracy generate Random Forest: ", acc)
    
    
    clf = RandomForestClassifier().fit(X_train2,Y_train2)
    predictions = clf.predict(X_test2)
    acc = accuracy_score(Y_test2, predictions)
    print("accuracy delete Random Forest: ", acc)
    
    
    
    clf = LogisticRegression().fit(X_train1,Y_train1)
    predictions = clf.predict(X_test1)
    acc = accuracy_score(Y_test1, predictions)
    print("accuracy generate logistic reg: ", acc)
    
    clf = LogisticRegression().fit(X_train2,Y_train2)
    predictions = clf.predict(X_test2)
    acc = accuracy_score(Y_test2, predictions)
    print("accuracy delete logistic reg: ", acc)
    
    if i != 1: # cant perform xgboost on standard normalization
        clf = XGBClassifier(use_label_encoder=False).fit(X_train1,Y_train1, eval_metric='rmse')
        predictions = clf.predict(X_test1)
        acc = accuracy_score(Y_test1, predictions)
        print("accuracy generate xgboost: ", acc)
        clf = XGBClassifier(use_label_encoder=False).fit(X_train2,Y_train2, eval_metric='rmse')
        predictions = clf.predict(X_test2)
        acc = accuracy_score(Y_test2, predictions)
        print("accuracy delete xgboost: ", acc)
    


MIN MAX


  y = column_or_1d(y, warn=True)


accuracy generate Gradient Boosting Classifier:  0.652


  y = column_or_1d(y, warn=True)


accuracy delete Gradient Boosting Classifier:  0.613


  clf = RandomForestClassifier().fit(X_train1,Y_train1)


accuracy generate Random Forest:  0.641


  clf = RandomForestClassifier().fit(X_train2,Y_train2)


accuracy delete Random Forest:  0.635


  y = column_or_1d(y, warn=True)


accuracy generate logistic reg:  0.667


  y = column_or_1d(y, warn=True)


accuracy delete logistic reg:  0.634
accuracy generate xgboost:  0.639
accuracy delete xgboost:  0.606

STANDARD


  y = column_or_1d(y, warn=True)


accuracy generate Gradient Boosting Classifier:  0.665


  y = column_or_1d(y, warn=True)


accuracy delete Gradient Boosting Classifier:  0.639


  clf = RandomForestClassifier().fit(X_train1,Y_train1)


accuracy generate Random Forest:  0.637


  clf = RandomForestClassifier().fit(X_train2,Y_train2)


accuracy delete Random Forest:  0.61


  y = column_or_1d(y, warn=True)


accuracy generate logistic reg:  0.673
accuracy delete logistic reg:  0.633


  y = column_or_1d(y, warn=True)


In [15]:
df_correlation = pd.DataFrame(columns = ['feature','correlation'])
for columns in final_df:
    if type(final_df[columns])== pd.core.series.Series:
        if type(final_df[columns][0])!=str:
            final_df[columns] = np.where(np.isnan(np.float64(final_df[columns])), max(final_df[columns]),final_df[columns])
            if -1.0 < stats.pointbiserialr(final_df['target'],final_df[columns])[0] and stats.pointbiserialr(final_df['target'],final_df[columns])[0] < 1.0:
                df_correlation = df_correlation.append({'feature':columns,'correlation':stats.pointbiserialr(final_df['target'],final_df[columns])[0]},ignore_index=True)
df_correlation=df_correlation.sort_values(by = 'correlation', key = lambda col: -abs(col))         
print(df_correlation)


NameError: name 'final_df' is not defined

In [None]:
maximum_ranking_points = max(final_df['rank_points_0'])
minimum_ranking_points = min(final_df['rank_points_0'])
rank_0_points_groups = np.linspace(minimum_ranking_points,maximum_ranking_points,num=20)
win_ratio_rank_0_points = [ final_df.loc[(final_df['rank_points_0']>=rank_0_points_groups[i]) & (final_df['target']==0)
                           & (final_df['rank_points_0']<rank_0_points_groups[i+1]) ].shape[0] /
                           final_df.loc[ (final_df['rank_points_0']>= rank_0_points_groups[i])
                                       & (final_df['rank_points_0']< rank_0_points_groups[i+1])].shape[0] for i in range(0,19)]
plt.plot(rank_0_points_groups[0:19],win_ratio_rank_0_points)
plt.ylabel('win_ratio')
plt.xlabel('ranking_points')
plt.show()

In [None]:
maximum_age = max(final_df['age_0'])
minimum_age = min(final_df['age_0'])
age_0_groups = np.linspace(minimum_age,maximum_age,num=20)
win_ratio_age = [ final_df.loc[(final_df['age_0']>=age_0_groups[i]) & (final_df['target']==0) &
                               (final_df['age_0']<age_0_groups[i+1]) ].shape[0] /
                           final_df.loc[(final_df['age_0']>= age_0_groups[i]) & (final_df['age_0']<age_0_groups[i+1]) ].shape[0] for i in range(0,19) ]
plt.plot(age_0_groups[0:19],win_ratio_age)
plt.ylabel('win_ratio')
plt.xlabel('normalize_age')
plt.show()