In [1]:
import pandas as pd

In [2]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

In [3]:
matrix = pd.read_csv('minute-matrix.csv')

In [4]:
matrix.head()

Unnamed: 0,season,round,tie,made_minute,away,t1goal,t2goal,t1awaygoal,t2awaygoal,t1goals,t2goals,t1goal_diff,t1awaygoals,t2awaygoals,t1awaygoal_diff,t1home,t1,t2,winner,t1win
0,2008,first,arsenal-milan,1.0,,False,False,False,False,0,0,0,0,0,0,1,arsenal,milan,arsenal,1
1,2008,first,arsenal-milan,2.0,,False,False,False,False,0,0,0,0,0,0,1,arsenal,milan,arsenal,1
2,2008,first,arsenal-milan,3.0,,False,False,False,False,0,0,0,0,0,0,1,arsenal,milan,arsenal,1
3,2008,first,arsenal-milan,4.0,,False,False,False,False,0,0,0,0,0,0,1,arsenal,milan,arsenal,1
4,2008,first,arsenal-milan,5.0,,False,False,False,False,0,0,0,0,0,0,1,arsenal,milan,arsenal,1


In [5]:
matrix.tail()

Unnamed: 0,season,round,tie,made_minute,away,t1goal,t2goal,t1awaygoal,t2awaygoal,t1goals,t2goals,t1goal_diff,t1awaygoals,t2awaygoals,t1awaygoal_diff,t1home,t1,t2,winner,t1win
30236,2019,first,united-psg,180.0,,False,False,False,False,2,3,-1,2,2,0,0,united,psg,united,1
30237,2019,first,united-psg,181.0,,False,False,False,False,2,3,-1,2,2,0,0,united,psg,united,1
30238,2019,first,united-psg,182.0,,False,False,False,False,2,3,-1,2,2,0,0,united,psg,united,1
30239,2019,first,united-psg,183.0,,False,False,False,False,2,3,-1,2,2,0,0,united,psg,united,1
30240,2019,first,united-psg,184.0,a,True,False,True,False,3,3,0,3,2,1,0,united,psg,united,1


In [6]:
X = matrix[[
    'made_minute',
    't1goal_diff',
    't1awaygoal_diff',
]].values

In [7]:
y = matrix['t1win'].values

In [8]:
lrmodel = LogisticRegression(random_state=41, solver='liblinear')

In [9]:
scores = model_selection.cross_val_score(lrmodel, X, y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.76 (+/- 0.09)


In [10]:
lrmodel.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=41, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
matrix['probt1win'] = [l[1] for l in lrmodel.predict_proba(X)]

In [12]:
matrix.head()

Unnamed: 0,season,round,tie,made_minute,away,t1goal,t2goal,t1awaygoal,t2awaygoal,t1goals,...,t1goal_diff,t1awaygoals,t2awaygoals,t1awaygoal_diff,t1home,t1,t2,winner,t1win,probt1win
0,2008,first,arsenal-milan,1.0,,False,False,False,False,0,...,0,0,0,0,1,arsenal,milan,arsenal,1,0.383038
1,2008,first,arsenal-milan,2.0,,False,False,False,False,0,...,0,0,0,0,1,arsenal,milan,arsenal,1,0.383491
2,2008,first,arsenal-milan,3.0,,False,False,False,False,0,...,0,0,0,0,1,arsenal,milan,arsenal,1,0.383944
3,2008,first,arsenal-milan,4.0,,False,False,False,False,0,...,0,0,0,0,1,arsenal,milan,arsenal,1,0.384397
4,2008,first,arsenal-milan,5.0,,False,False,False,False,0,...,0,0,0,0,1,arsenal,milan,arsenal,1,0.384851


In [13]:
matrix.tail()

Unnamed: 0,season,round,tie,made_minute,away,t1goal,t2goal,t1awaygoal,t2awaygoal,t1goals,...,t1goal_diff,t1awaygoals,t2awaygoals,t1awaygoal_diff,t1home,t1,t2,winner,t1win,probt1win
30236,2019,first,united-psg,180.0,,False,False,False,False,2,...,-1,2,2,0,0,united,psg,united,1,0.260202
30237,2019,first,united-psg,181.0,,False,False,False,False,2,...,-1,2,2,0,0,united,psg,united,1,0.260571
30238,2019,first,united-psg,182.0,,False,False,False,False,2,...,-1,2,2,0,0,united,psg,united,1,0.260941
30239,2019,first,united-psg,183.0,,False,False,False,False,2,...,-1,2,2,0,0,united,psg,united,1,0.261311
30240,2019,first,united-psg,184.0,a,True,False,True,False,3,...,0,3,2,1,0,united,psg,united,1,0.647226


In [14]:
matrix.to_csv('sklearn-pred-prob.csv', index=False)

In [15]:
start = 0
end = 1
step = 0.05

print('start|end|n|prob')
while start < end:
    subset = matrix[
        (matrix.probt1win >= start) & (matrix.probt1win < start + step)
    ]
    correct = subset.t1win.sum() / float(len(subset))
    print(f'{start}|{start + step}|{len(subset)}|{correct}')
    start += step

start|end|n|prob
0|0.05|4234|0.007794048181388758
0.05|0.1|1550|0.027741935483870966
0.1|0.15000000000000002|3104|0.14207474226804123
0.15000000000000002|0.2|288|0.0
0.2|0.25|1286|0.36625194401244165
0.25|0.3|2103|0.2943414170233
0.3|0.35|65|0.015384615384615385
0.35|0.39999999999999997|4011|0.3570181999501371
0.39999999999999997|0.44999999999999996|3088|0.4170984455958549
0.44999999999999996|0.49999999999999994|2109|0.4774774774774775
0.49999999999999994|0.5499999999999999|139|0.6546762589928058
0.5499999999999999|0.6|0|nan
0.6|0.65|2703|0.6437291897891232
0.65|0.7000000000000001|1381|0.6719768283852281
0.7000000000000001|0.7500000000000001|133|0.7969924812030075
0.7500000000000001|0.8000000000000002|46|0.391304347826087
0.8000000000000002|0.8500000000000002|1534|0.7855280312907431
0.8500000000000002|0.9000000000000002|110|1.0
0.9000000000000002|0.9500000000000003|1097|0.8997265268915223
0.9500000000000003|1.0000000000000002|1260|0.9825396825396825


  # Remove the CWD from sys.path while we load stuff.
