In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix,log_loss, r2_score
import datetime
import dateutil
from dateutil.relativedelta import relativedelta
from datetime import date
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import History 
from keras.utils import plot_model
from keras.optimizers import SGD
from keras import regularizers


Using TensorFlow backend.


In [None]:
# Read the match table, parse the Date column and use it as a sorted index.
data = pd.read_csv('data.csv', index_col='index')
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date').sort_index()

# Row count of the loaded table.
len(data)

In [None]:
def one_hot(df, col):
    """Append one-hot indicator columns for ``col`` to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the indicator columns are appended to (not modified in place).
    col : pandas.Series
        Categorical values to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the indicator columns concatenated on the right. The last
        indicator column is dropped so the remaining ones are not collinear.

    Notes
    -----
    Indicator columns are prefixed with the series name (``<name>_<value>``)
    so that two categorical features sharing a category label do not produce
    duplicate column labels. An unnamed series keeps bare category labels.
    """
    prefix = getattr(col, 'name', None)
    dums = pd.get_dummies(col, prefix=prefix).iloc[:, :-1]
    return pd.concat([df, dums], axis=1)

In [None]:
# Display the column labels of the loaded table.
data.columns

In [None]:
# Columns that are one-hot encoded before modelling.
cat_cols = [
    'Location',
    'Tournament',
    'Series',
    'Court',
    'Surface',
    'Round',
    'Best of',
]
# Columns copied into the model table as-is; 'P1Result' is the target.
val_cols = [
    'PtsDelta',
    'IntervalDelta',
    'P1Age',
    'P2Age',
    'P1Result',
]

In [None]:
# Start an empty feature table that shares the index of `data`.
model_data = pd.DataFrame(index=data.index)

In [None]:
# Copy the numeric and categorical source columns into the feature table.
model_data = model_data.assign(
    **{name: data[name] for name in val_cols + cat_cols}
)

In [None]:
# Append indicator columns for each categorical feature in turn.
for cat_name in cat_cols:
    cat_values = model_data[cat_name]
    model_data = one_hot(model_data, cat_values)

In [None]:
# The raw categorical columns are no longer needed once encoded.
model_data = model_data.drop(labels=cat_cols, axis='columns')

In [None]:
# Display the column labels of the feature table after encoding.
model_data.columns

In [None]:
# Names of the two odds columns, player 1 first.
odds = [
    'B365P1',
    'B365P2',
]

In [None]:
# Turn the odds into implied probabilities. The excess of the two inverse
# odds over 1 ('total_over') is split equally between both players, so the
# two implied probabilities sum to 1.
inv_p1 = 1/data[odds[0]]
inv_p2 = 1/data[odds[1]]
data['total_over'] = inv_p1 + inv_p2 - 1
half_over = data['total_over']/2
data['P1_implied'] = inv_p1 - half_over
data['P2_implied'] = inv_p2 - half_over
data['P1_implied_log'] = np.log(data['P1_implied'])
data['P2_implied_log'] = np.log(data['P2_implied'])

implied_probs = ['P1_implied', 'P2_implied']


In [None]:
# Add the implied probabilities and raw odds to the feature table.
market_cols = implied_probs + odds
model_data = model_data.assign(**{name: data[name] for name in market_cols})

# Row count before and after discarding rows with any missing value.
print(len(model_data))

model_data = model_data.dropna()
print(len(model_data))

In [None]:
# The first `start` rows (by position) form the training window; the raw
# odds columns are excluded from the features.
start = 10000
train_rows = model_data.iloc[:start]
train_model = train_rows.drop(columns=odds)
print(len(train_model))

In [None]:
# Split first, then standardise using statistics from the training rows
# only, so the held-out 10% does not leak into the scaler. The fixed
# random_state makes the split repeatable across kernel restarts.
X = train_model.drop(columns=['P1Result']).values
y = train_model['P1Result'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep X as the scaled full matrix, as before.
X = scaler.transform(X)

In [None]:
# Baseline classifier: logistic regression fitted on the training split,
# then used to label the held-out split.
lmodel = LogisticRegression(solver='liblinear', fit_intercept=True).fit(X_train, y_train)
predictions = lmodel.predict(X_test)

In [None]:
# Classification report on the held-out split first, then on the training split.
train_predictions = lmodel.predict(X_train)
for y_true, y_pred in ((y_test, predictions), (y_train, train_predictions)):
    print(classification_report(y_true, y_pred))

In [None]:
# Feature count (used as the network input width) and number of training rows.
n_rows, n_cols = X_train.shape
n_rows

In [None]:
# Split first, then standardise using statistics from the training rows
# only, so the held-out 10% does not leak into the scaler. The fixed
# random_state makes the split repeatable across kernel restarts.
X = train_model.drop(columns=['P1Result']).values
y = train_model['P1Result'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep X as the scaled full matrix, as before.
X = scaler.transform(X)

In [None]:
# Feed-forward binary classifier: input dropout, four tanh hidden layers
# (the two widest with L2 weight penalties) and a sigmoid output unit.
hist = History()
reg = regularizers.l2(0.01)

model = Sequential()
# The input shape is declared on the first layer; a Sequential model
# cannot infer it from a later layer in every Keras version.
model.add(Dropout(rate=0.4, input_shape=(n_cols,)))
model.add(Dense(100, activation='tanh', kernel_regularizer=reg))
model.add(Dense(50, activation='tanh', kernel_regularizer=reg))
model.add(Dense(10, activation='tanh'))
model.add(Dense(5, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])




In [None]:
# Train the network; 10% of the training split is held back by Keras for
# validation and the per-epoch losses are recorded in `hist`.
model.fit(
    X_train,
    y_train,
    batch_size=1000,
    epochs=100,
    validation_split=.1,
    callbacks=[hist],
)

In [None]:
# Threshold the sigmoid output at 0.5 to obtain class labels.
# `Sequential.predict_classes` has been removed from Keras; this is the
# equivalent for a single sigmoid output unit.
test_classes = (model.predict(X_test) > 0.5).astype('int32')
train_classes = (model.predict(X_train) > 0.5).astype('int32')

# Held-out split first, then the training split.
print(classification_report(y_test, test_classes))
print(classification_report(y_train, train_classes))

In [None]:
# Training loss (red) against validation loss (blue) per epoch.
for curve, colour in (('loss', 'red'), ('val_loss', 'blue')):
    plt.plot(hist.history[curve], color=colour)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()

In [None]:
# Every row from position `start` onward is kept for validation.
start = 10000
val_data = model_data.iloc[start:]
val_odds = val_data['P1_implied']

In [None]:
# Target and features for validation; the raw odds columns are not features.
val_y = val_data['P1Result']
val_x = val_data.drop(columns=['P1Result', *odds])

In [None]:
# Display (rows, columns) of the validation feature table.
val_x.shape

In [None]:
# Alternative report kept for reference: label by implied probability > 0.5.
# print(classification_report(val_y,  val_x.P1_implied > 0.5))

# Threshold the sigmoid output at 0.5 to obtain class labels.
# `Sequential.predict_classes` has been removed from Keras; this is the
# equivalent for a single sigmoid output unit.
val_classes = (model.predict(scaler.transform(val_x.values)) > 0.5).astype('int32')
print(classification_report(val_y, val_classes))

In [None]:
# Scale the validation features with the fitted scaler, then predict.
val_features = scaler.transform(val_x.values)
pred_probs = model.predict(val_features)

pred_probs.shape

In [None]:

# Build the betting table for the validation window: odds, model and
# implied probabilities, expected returns, Kelly fractions and outcomes.
holdout = model_data[start:]
# model.predict output is flattened so each row receives one scalar.
p1_model_prob = np.ravel(pred_probs)

sim = holdout[odds].copy()
sim['P1Prob'] = p1_model_prob
sim['P2Prob'] = 1 - p1_model_prob
sim['P1_implied'] = holdout.P1_implied
sim['P2_implied'] = holdout.P2_implied
# Expected net return per unit staked. The Kelly fraction below,
# (p*o - 1)/(o - 1), treats the odds `o` as decimal (payout includes the
# stake), so the matching expectation is p*(o - 1) - (1 - p) = p*o - 1.
sim['P1ER'] = sim['P1Prob']*sim['B365P1'] - 1
sim['P2ER'] = sim['P2Prob']*sim['B365P2'] - 1
sim['P1Kelly'] = (sim['P1Prob']*sim['B365P1'] -1)/(sim['B365P1']-1)
sim['P2Kelly'] = (sim['P2Prob']*sim['B365P2'] -1)/(sim['B365P2']-1)
sim['P1Result'] = holdout.P1Result
sim['P2Result'] = 1- sim['P1Result']

In [None]:
# Distribution of (implied probability - model probability) for player 1.
prob_gap = sim['P1_implied'] - sim['P1Prob']
sns.distplot(prob_gap, color='red')


In [None]:
# Sequential bankroll simulation over the rows of `sim`, in order.
# Only player-1 bets are placed: whenever the player-1 Kelly fraction is
# positive, `scale_kelly` times that fraction of the current balance is
# staked at the player-1 odds.
scale_kelly = 0.1
balance = 1000

# Snapshot of the betting table (was `sim.copy`, which bound the method
# instead of calling it).
results = sim.copy()

track_balance = []
stake = 0.0

# Pull the needed columns out once instead of doing a row lookup per field
# on every iteration.
bet_rows = zip(sim['P1Kelly'].values, sim['B365P1'].values, sim['P1Result'].values)

for p1_kelly, p1_odds, p1_win in bet_rows:
    # Stop as soon as the bankroll is no longer positive.
    if not balance > 0:
        break

    if p1_kelly > 0:
        stake = balance*scale_kelly*p1_kelly
        # A win pays stake * odds; the stake itself is always deducted.
        balance += stake*p1_odds*p1_win - stake

    # One balance entry per processed row, whether or not a bet was placed.
    track_balance.append(balance)



In [None]:
# Balance after each simulated row, labelled with the matching index entries.
processed_index = sim.index[:len(track_balance)]
balance_s = pd.Series(track_balance, index=processed_index, name='Balance')
plt.figure(figsize=(30, 10))
sns.lineplot(data=balance_s)

In [None]:
# Betting table plus running balance and cumulative return in percent of
# the 1000 starting balance.
results = sim.assign(balance=balance_s)
balance_change = results['balance'] - 1000
results['total_return'] = balance_change/ 1000*100

In [None]:
# Cumulative percentage return over the simulated rows.
total_return_curve = results['total_return']
plt.figure(figsize=(30, 10))
sns.lineplot(data=total_return_curve)

In [None]:
# Span between the first and last index entries (previously measured from
# the second entry). The result is the difference of two index labels,
# not a number of years, despite the variable name.
years = results.index[-1] - results.index[0]

# Series.min()/max() skip missing values; the builtin min()/max() give
# order-dependent results when NaN is present.
print("max_loss " + str(results.total_return.min()) )
print("max_gain " + str(results.total_return.max()) )
print("end_return " + str(results.total_return.iloc[-1]) )

In [None]:
# Most frequent value(s) of the cumulative return.
results.total_return.mode()

In [None]:
# Pairwise correlation between the columns of `results`.
results.corr()

In [None]:
# Print each column of the betting table in turn.
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (label, Series) pairs.
for _label, column in sim.items():
    print (column)