In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

%matplotlib inline

# Леса, градиентбустинг

## Первая модель, на статистических данных 

In [2]:
path = "C:/Users/RomanSivolobtsev/Documents/Статистика АПЛ/"
start_year = 2005
start_seasonses = pd.DataFrame() # 2 сезона для начальной выборки генерирования фич
train = pd.DataFrame() # 7 сезонов по которым будет обучаться модель 
test = pd.DataFrame() # 3 сезона для прогнозов

for i in range(2):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    E0 = pd.read_csv(folder + 'E0.csv', index_col=False)
    E1 = pd.read_csv(folder + 'E1.csv', index_col=False)
    E2 = pd.read_csv(folder + 'E2.csv', index_col=False)
    E3 = pd.read_csv(folder + 'E3.csv', index_col=False)
    start_seasonses = start_seasonses.append([E0, E1, E2, E3], ignore_index=True)

for i in range(2, 9):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    E0 = pd.read_csv(folder + 'E0.csv', index_col=False)
    E1 = pd.read_csv(folder + 'E1.csv', index_col=False)
    E2 = pd.read_csv(folder + 'E2.csv', index_col=False)
    E3 = pd.read_csv(folder + 'E3.csv', index_col=False)
    train = train.append([E0, E1, E2, E3], ignore_index=True)
    
for i in range(9, 12):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    E0 = pd.read_csv(folder + 'E0.csv', index_col=False) # прогноз ведём только по командам из АПЛ
    test = test.append(E0, ignore_index=True)
    
train = train[pd.notnull(train['Div'])]
test = test[pd.notnull(test['Div'])] # некоторые файлы в конце импортируют пустую строчку, эта для их отлова.

attribute_common = ["Div", "Date", "AwayTeam", "HomeTeam", "FTHG", "FTAG", "FTR", "Referee"] #общие признаки матча
attribute_statistic = ["HS", "AS", "HST", "AST", "HC", "AC", "HF", "AF", "HY", "AY", "HR", "AR"] #статистические признаки матча
attribute_bk = ['B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'WHH', 'WHD', 'WHA',
                'VCH', 'VCD', 'VCA'] # букмекерские ставки на события

In [3]:
test.index = [x for x in range(len(test))]
train.index = [x for x in range(len(train))] # перебивка индексов, чтобы не было проблем

In [4]:
test[attribute_common][:5]

Unnamed: 0,Div,Date,AwayTeam,HomeTeam,FTHG,FTAG,FTR,Referee
0,E0,16/08/14,Crystal Palace,Arsenal,2.0,1.0,H,J Moss
1,E0,16/08/14,Everton,Leicester,2.0,2.0,D,M Jones
2,E0,16/08/14,Swansea,Man United,1.0,2.0,A,M Dean
3,E0,16/08/14,Hull,QPR,0.0,1.0,A,C Pawson
4,E0,16/08/14,Aston Villa,Stoke,0.0,1.0,A,A Taylor


Можно предсказывать исход после первого тайма, или точный счёт. Но для начала попробуем предсказывать признак FTR. Для этого нужно нагенерировать фич для каждой из участвующих команд, относительно статистических данных прошедших матчей. Букмекерские признаки пока не рассматриваем.

In [5]:
test[attribute_statistic + attribute_common][:5]

Unnamed: 0,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR,Div,Date,AwayTeam,HomeTeam,FTHG,FTAG,FTR,Referee
0,14.0,4.0,6.0,2.0,9.0,3.0,13.0,19.0,2.0,2.0,0.0,1.0,E0,16/08/14,Crystal Palace,Arsenal,2.0,1.0,H,J Moss
1,11.0,13.0,3.0,3.0,3.0,6.0,16.0,10.0,1.0,1.0,0.0,0.0,E0,16/08/14,Everton,Leicester,2.0,2.0,D,M Jones
2,14.0,5.0,5.0,4.0,4.0,0.0,14.0,20.0,2.0,4.0,0.0,0.0,E0,16/08/14,Swansea,Man United,1.0,2.0,A,M Dean
3,19.0,11.0,6.0,4.0,8.0,9.0,10.0,10.0,1.0,2.0,0.0,0.0,E0,16/08/14,Hull,QPR,0.0,1.0,A,C Pawson
4,12.0,7.0,2.0,2.0,2.0,8.0,14.0,9.0,0.0,3.0,0.0,0.0,E0,16/08/14,Aston Villa,Stoke,0.0,1.0,A,A Taylor


In [6]:
# Team1 - Home Team, Team2 - Away Team

def generate_statistic_features(Team1_HomeMatch, Team1_AwayMatch, Team2_HomeMatch, Team2_AwayMatch, Referee_Match):
    match_significance = np.array([1, 0.75, 0.5, 0.4, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05])
    match_significance = match_significance/sum(match_significance) # нормировка коэффициентов
    
    Team1_FTHG_average = Team1_HomeMatch['FTHG'].mean() # среднее количество забитых голов домашней командны в её домашних матчах
    Team2_FTAG_average = Team2_AwayMatch['FTAG'].mean() # среднее количество забитых голов гостевой команды в её гостевых матчах 
    Team1_FTHG_nearest = sum(Team1_HomeMatch['FTHG'][-10:]*match_significance) # количество забитых голов домашней командны в её последних 10 домашних матчах с коэфициентами значимости
    Team2_FTAG_nearest = sum(Team2_AwayMatch['FTAG'][-10:]*match_significance) # количество забитых голов гостевой команды в её последних 10 гостевых матчах с коэфициентами значимости
    
    Team1_FTHG_average = Team1_AwayMatch['FTHG'].mean() # среднее количество пропущенных голов домашней командны в её домашних матчах
    Team2_FTAG_average = Team2_HomeMatch['FTAG'].mean() # среднее количество пропущенных голов гостевой команды в её гостевых матчах 
    Team1_FTHG_nearest = sum(Team1_AwayMatch['FTHG'][-10:]*match_significance) # количество пропущенных голов домашней командны в её последних 10 домашних матчах с коэфициентами значимости
    Team2_FTAG_nearest = sum(Team2_HomeMatch['FTAG'][-10:]*match_significance) # количество пропущенных голов гостевой команды в её последних 10 гостевых матчах с коэфициентами значимости
    
    Team1_HS_nearest = sum(Team1_HomeMatch['HS'][-10:]*match_significance) # количество ударов по воротам домашней командны в её последних 10 домашних матчах с коэфициентами значимости
    Team2_AS_nearest = sum(Team2_AwayMatch['AS'][-10:]*match_significance) # количество ударов по воротам гостевой команды в её последних 10 гостевых матчах с коэфициентами значимости
    
    Team1_HST_nearest = sum(Team1_HomeMatch['HST'][-10:]*match_significance) # количество ударов в створ ворот домашней командны в её последних 10 домашних матчах с коэфициентами значимости
    Team2_AST_nearest = sum(Team2_AwayMatch['AST'][-10:]*match_significance) # количество ударов в створ ворот гостевой команды в её последних 10 гостевых матчах с коэфициентами значимости

    Team1_HC_nearest = sum(Team1_HomeMatch['HC'][-10:]*match_significance) # количество угловых домашней командны в её последних 10 домашних матчах с коэфициентами значимости
    Team2_AC_nearest = sum(Team2_AwayMatch['AC'][-10:]*match_significance) # количество угловых гостевой команды в её последних 10 гостевых матчах с коэфициентами значимости   

    Team1_HF_nearest = sum(Team1_HomeMatch['HF'][-10:]*match_significance) # количество фолов домашней командны в её последних 10 домашних матчах с коэфициентами значимости
    Team2_AF_nearest = sum(Team2_AwayMatch['AF'][-10:]*match_significance) # количество фолов гостевой команды в её последних 10 гостевых матчах с коэфициентами значимости

    Team1_HY_nearest = sum(Team1_HomeMatch['HY'][-10:]*match_significance) # количество жёлтых карточек домашней командны в её последних 10 домашних матчах с коэфициентами значимости
    Team2_AY_nearest = sum(Team2_AwayMatch['AY'][-10:]*match_significance) # количество жёлтых карточек гостевой команды в её последних 10 гостевых матчах с коэфициентами значимости

    Team1_HR_nearest = sum(Team1_HomeMatch['HR'][-10:]*match_significance) # количество красных карточек домашней командны в её последних 10 домашних матчах с коэфициентами значимости
    Team2_AR_nearest = sum(Team2_AwayMatch['AR'][-10:]*match_significance) # количество красных карточек гостевой команды в её последних 10 гостевых матчах с коэфициентами значимости
    
    Team1_HFTR_nearest = sum(Team1_HomeMatch['FTR'][-20:].replace(to_replace=["H", "D", "A"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 матчей 
    Team2_AFTR_nearest = sum(Team2_AwayMatch['FTR'][-20:].replace(to_replace=["A", "D", "H"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 матчей
                            
    Referee_FTG_average = Referee_Match['FTHG'].mean() + Referee_Match['FTAG'].mean() # cреднее количество забитых голов в матчах, которые обслуживал этот судья
    Referee_S_average = Referee_Match['HS'].mean() + Referee_Match['AS'].mean() # cреднее количество ударов по воротам в матчах, которые обслуживал этот судья
    Referee_ST_average = Referee_Match['HST'].mean() + Referee_Match['AST'].mean() # cреднее количество ударов в створ ворот в матчах, которые обслуживал этот судья
    Referee_C_average = Referee_Match['HC'].mean() + Referee_Match['AC'].mean() # cреднее количество угловых в матчах, которые обслуживал этот судья
    Referee_F_average = Referee_Match['HF'].mean() + Referee_Match['AF'].mean() # cреднее количество фолов в матчах, которые обслуживал этот судья
    Referee_Y_average = Referee_Match['HY'].mean() + Referee_Match['AY'].mean() # cреднее количество желтых карточек в матчах, которые обслуживал этот судья
    Referee_R_average = Referee_Match['HR'].mean() + Referee_Match['AR'].mean() # cреднее количество красных карточек в матчах, которые обслуживал этот судья

    return [Team1_FTHG_average, Team2_FTAG_average, Team1_FTHG_nearest, Team2_FTAG_nearest, Team1_FTHG_average, 
            Team2_FTAG_average,Team1_FTHG_nearest, Team2_FTAG_nearest, Team1_HS_nearest, Team2_AS_nearest, 
            Team1_HST_nearest, Team2_AST_nearest, Team1_HC_nearest, Team2_AC_nearest, Team1_HF_nearest, Team2_AF_nearest,
            Team1_HY_nearest, Team2_AY_nearest, Team1_HR_nearest, Team2_AR_nearest, Team1_HFTR_nearest, Team2_AFTR_nearest,
            Referee_FTG_average, Referee_S_average, Referee_ST_average, Referee_C_average, Referee_F_average,
            Referee_Y_average, Referee_R_average]

In [7]:
Teams = train['HomeTeam'].append(test['HomeTeam']).unique() # Список всех команд за 10 лет в 4 английских дивизионах
Teams_HomeData = {Teams[i]: start_seasonses[start_seasonses["HomeTeam"] == Teams[i]] for i in range(len(Teams))} # домашние матчи каждой прогназируемой команды
Teams_AwayData = {Teams[i]: start_seasonses[start_seasonses["AwayTeam"] == Teams[i]] for i in range(len(Teams))} # гостевые матчи каждой прогназируемой команды

Referees = train['Referee'].append(test['Referee']).unique()
Referees_Data = {Referees[i]: start_seasonses[start_seasonses["Referee"] == Referees[i]] for i in range(len(Referees))} # матчи каждого рефери


In [8]:
# время на подсчёт вычисления фич для одного матча

start = time.time()
generate_statistic_features(Teams_HomeData["Arsenal"], Teams_AwayData["Arsenal"], Teams_HomeData["Liverpool"], 
                            Teams_AwayData["Liverpool"], Referees_Data["A Taylor"])
end = time.time()
print(end - start)

0.015625953674316406


In [9]:
Teams_HomeData["Arsenal"][-10:]

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbMx>2.5,BbAv>2.5,BbMx<2.5,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA
2215,E0,23/12/06,Arsenal,Blackburn,6,2,H,3,1,H,...,2.05,1.91,1.91,1.82,26.0,-1.25,1.95,1.9,2.03,1.97
2252,E0,02/01/07,Arsenal,Charlton,4,0,H,2,0,H,...,1.7,1.62,2.35,2.13,22.0,-1.75,2.12,2.03,1.87,1.83
2272,E0,21/01/07,Arsenal,Man United,2,1,H,0,0,D,...,2.25,2.1,1.74,1.65,27.0,0.0,1.85,1.79,2.15,2.01
2301,E0,11/02/07,Arsenal,Wigan,2,1,H,0,1,A,...,1.8,1.72,2.17,2.03,27.0,-1.5,1.9,1.84,2.12,2.0
2313,E0,03/03/07,Arsenal,Reading,2,1,H,0,0,D,...,1.9,1.78,2.14,1.95,27.0,-1.0,1.96,1.87,2.04,1.97
2347,E0,07/04/07,Arsenal,West Ham,0,1,A,0,1,A,...,1.91,1.82,2.01,1.91,26.0,-1.25,2.04,2.02,1.89,1.85
2361,E0,14/04/07,Arsenal,Bolton,2,1,H,1,1,D,...,2.25,2.0,1.8,1.72,22.0,-0.75,1.95,1.93,2.0,1.95
2369,E0,17/04/07,Arsenal,Man City,3,1,H,1,1,D,...,2.04,1.88,1.94,1.86,31.0,-1.25,2.1,2.07,1.88,1.83
2392,E0,29/04/07,Arsenal,Fulham,3,1,H,1,0,H,...,1.91,1.76,2.1,1.95,30.0,-1.25,1.96,1.87,2.08,1.99
2402,E0,06/05/07,Arsenal,Chelsea,1,1,D,1,0,H,...,2.2,2.05,1.75,1.69,28.0,0.0,2.05,1.99,1.89,1.8


In [10]:
generate_statistic_features(Teams_HomeData["Arsenal"], Teams_AwayData["Arsenal"], Teams_HomeData["Liverpool"], Teams_AwayData["Liverpool"], Referees_Data["A Taylor"])

[0.9736842105263158,
 0.39473684210526316,
 0.91891891891891897,
 0.14864864864864866,
 0.9736842105263158,
 0.39473684210526316,
 0.91891891891891897,
 0.14864864864864866,
 16.756756756756758,
 14.094594594594597,
 9.9054054054054053,
 8.378378378378379,
 8.5405405405405421,
 6.1216216216216228,
 11.594594594594595,
 11.905405405405407,
 1.8513513513513518,
 1.1216216216216217,
 0.0,
 0.0,
 45,
 25,
 1.6956521739130435,
 20.608695652173914,
 10.695652173913043,
 10.043478260869566,
 24.826086956521742,
 3.0869565217391304,
 0.17391304347826086]

In [11]:
features = ["Team1_FTHG_average", "Team2_FTAG_average", "Team1_FTHG_nearest", "Team2_FTAG_nearest", "Team1_FTHG_average", 
           "Team2_FTAG_average","Team1_FTHG_nearest", "Team2_FTAG_nearest", "Team1_HS_nearest", "Team2_AS_nearest", 
           "Team1_HST_nearest", "Team2_AST_nearest", "Team1_HC_nearest", "Team2_AC_nearest", "Team1_HF_nearest", 
           "Team2_AF_nearest", "Team1_HY_nearest", "Team2_AY_nearest", "Team1_HR_nearest", "Team2_AR_nearest",
           "Team1_HFTR_nearest", "Team2_AFTR_nearest", "Referee_FTG_average", "Referee_S_average", "Referee_ST_average",
           "Referee_C_average", "Referee_F_average", "Referee_Y_average", "Referee_R_average"]

In [12]:
train_X = pd.DataFrame(columns = features)
train_Y = []
test_X = pd.DataFrame(columns = features)
test_Y = test["FTR"]

# выборка на 8 сезонах для обучения

for i in range(len(train)):
    row = train.ix[i]
    Team1 = row["HomeTeam"]
    Team2 = row["AwayTeam"]
    Referee = row["Referee"]
    
    if (len(Teams_HomeData[Team1]) >= 20) & (len(Teams_AwayData[Team2]) >= 20) & (len(Referees_Data[Referee]) >= 5):
    # если команда только влетела в каком-то сезоне, то на неё нет никакой информации
        feature = generate_statistic_features(Teams_HomeData[Team1], Teams_AwayData[Team1], Teams_HomeData[Team2], Teams_AwayData[Team2], Referees_Data[Referee])
        train_X.loc[i] = feature
        train_Y.append(train["FTR"].loc[i])
        
    Teams_HomeData[Team1] = Teams_HomeData[Team1].append(row, ignore_index=True) #после прогноза матча, его исход нужно добавить в данные по прошедшим матчам
    Teams_AwayData[Team2] = Teams_AwayData[Team2].append(row, ignore_index=True)
    Referees_Data[Referee] = Referees_Data[Referee].append(row, ignore_index=True)

# выборка на 2 неполных сезонах для построения прогнозов

for i in range(len(test)):
    row = test.ix[i]
    Team1 = row["HomeTeam"]
    Team2 = row["AwayTeam"]
    Referee = row["Referee"]
    
    feature = generate_statistic_features(Teams_HomeData[Team1], Teams_AwayData[Team1], Teams_HomeData[Team2], Teams_AwayData[Team2], Referees_Data[Referee])
    test_X.loc[i] = feature
    Teams_HomeData[Team1] = Teams_HomeData[Team1].append(row, ignore_index=True) #после прогноза матча, его исход нужно добавить в данные по прошедшим матчам
    Teams_AwayData[Team2] = Teams_AwayData[Team2].append(row, ignore_index=True)
    Referees_Data[Referee] = Referees_Data[Referee].append(row, ignore_index=True)

In [13]:
test_X = test_X.replace(to_replace="NaN", value=0) # это не повлияет на результаты, так как в фичах берётся максимум из всех БК

In [14]:
tuned_parameters = [{'min_samples_split': [1, 2], 'max_depth': [2, 3, 4], 'learning_rate': [0.02, 0.03, 0.04],
                     'min_samples_leaf':[30, 50, 70], 'n_estimators': [30, 50, 70]}]

GB_classifier = GridSearchCV(GradientBoostingClassifier(), tuned_parameters)
GB_classifier.fit(train_X, train_Y)
print(GB_classifier.best_estimator_)

GradientBoostingClassifier(init=None, learning_rate=0.04, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=50, min_samples_split=1,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


In [155]:
GB_classifier = GradientBoostingClassifier(min_samples_split = 2, max_depth = 3, learning_rate = 0.03,
                                           min_samples_leaf = 50, n_estimators = 50, subsample=0.9)
GB_classifier.fit(train_X, train_Y)

GradientBoostingClassifier(init=None, learning_rate=0.03, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=50, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=0.9, verbose=0,
              warm_start=False)

In [146]:
pd.DataFrame([GB_classifier.feature_importances_], columns = train_X.columns)

Unnamed: 0,Team1_FTHG_average,Team2_FTAG_average,Team1_FTHG_nearest,Team2_FTAG_nearest,Team1_FTHG_average.1,Team2_FTAG_average.1,Team1_FTHG_nearest.1,Team2_FTAG_nearest.1,Team1_HS_nearest,Team2_AS_nearest,...,Team2_AR_nearest,Team1_HFTR_nearest,Team2_AFTR_nearest,Referee_FTG_average,Referee_S_average,Referee_ST_average,Referee_C_average,Referee_F_average,Referee_Y_average,Referee_R_average
0,0.030005,0.054644,0.021821,0.018047,0.027482,0.038357,0.021185,0.017611,0.057483,0.064695,...,0.006756,0.158866,0.106836,0.019682,0.052134,0.01588,0.013332,0.018585,0.015943,0.029466


Главные фичи - колличество набранных очков за последние 20 матчей домашней и гостевой командами.

In [156]:
print("На тренировочной выборке: " + str(accuracy_score(train_Y, GB_classifier.predict(train_X))))

proba_GB = GB_classifier.predict_proba(test_X)
prediction_GB = GB_classifier.predict(test_X)

result_GB = pd.DataFrame()
result_GB[0] = proba_GB[:,0]
result_GB[1] = proba_GB[:,1]
result_GB[2] = proba_GB[:,2]
result_GB[3] = prediction_GB
result_GB[4] = test_Y

result_GB.columns = ["A_prob", "D_prob", "H_prob", "prediction", "result"]

score_GB = accuracy_score(test_Y, result_GB["prediction"])
print("На тестовой выборке: " + str(score_GB))

result_GB[:15]

На тренировочной выборке: 0.462789150839
На тестовой выборке: 0.517543859649


Unnamed: 0,A_prob,D_prob,H_prob,prediction,result
0,0.213273,0.24535,0.541377,H,H
1,0.262561,0.261708,0.475731,H,D
2,0.247637,0.295854,0.456509,H,A
3,0.237212,0.28703,0.475758,H,A
4,0.261695,0.28703,0.451275,H,A
5,0.339337,0.296012,0.364652,H,D
6,0.375117,0.286748,0.338135,A,A
7,0.182276,0.217258,0.600466,H,H
8,0.41962,0.294669,0.285712,A,A
9,0.298086,0.250005,0.451909,H,A


In [18]:
result_GB["prediction"].value_counts()

H    842
A    296
D      2
Name: prediction, dtype: int64

In [19]:
test["FTR"].value_counts()

H    516
A    340
D    284
Name: FTR, dtype: int64

Из-за объёмов выборки(несколько лиг) и большого количества шума, модель плохо обучается. Попробуем ужать выборку и выкинуть ненужные фичи типо красных карточек заработанных командой.

In [22]:
len(test_X)

1140

## Вторая модель, на модернизированных статистических данных 

In [23]:
path = "C:/Users/RomanSivolobtsev/Documents/Статистика АПЛ/"
start_year = 2005
start_seasonses2 = pd.DataFrame() # 2 сезона для начальной выборки генерирования фич
train2 = pd.DataFrame() # 7 сезонов по которым будет обучаться модель 
test2 = pd.DataFrame() # 3 сезона для прогнозов

for i in range(2):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    E0 = pd.read_csv(folder + 'E0.csv', index_col=False)
    E1 = pd.read_csv(folder + 'E1.csv', index_col=False)
    E2 = pd.read_csv(folder + 'E2.csv', index_col=False)
    E3 = pd.read_csv(folder + 'E3.csv', index_col=False)
    start_seasonses2 = start_seasonses2.append([E0, E1, E2, E3], ignore_index=True)

for i in range(2, 9):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    E0 = pd.read_csv(folder + 'E0.csv', index_col=False)
    train2 = train2.append([E0], ignore_index=True)
    
for i in range(9, 12):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    E0 = pd.read_csv(folder + 'E0.csv', index_col=False) # прогноз ведём только по командам из АПЛ
    test2 = test2.append(E0, ignore_index=True)
    
train2 = train2[pd.notnull(train2['Div'])]
test2 = test2[pd.notnull(test2['Div'])] # некоторые файлы в конце импортируют пустую строчку, эта для их отлова.

In [24]:
test2.index = [x for x in range(len(test2))]
train2.index = [x for x in range(len(train2))] # перебивка индексов, чтобы не было проблем

In [25]:
# Team1 - Home Team, Team2 - Away Team

def generate_statistics_features2(Team1_HomeMatch, Team1_AwayMatch, Team2_HomeMatch, Team2_AwayMatch, Referee_Match, BK):
    match_significance = np.array([1, 0.7, 0.5, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05])
    match_significance = match_significance/sum(match_significance) # нормировка коэффициентов
    
    Team1_FTHG_average = Team1_HomeMatch['FTHG'].mean() # среднее количество забитых голов домашней командны в её домашних матчах
    Team2_FTAG_average = Team2_AwayMatch['FTAG'].mean() # среднее количество забитых голов гостевой команды в её гостевых матчах 
    FTG_nearest_diff = sum(Team1_HomeMatch['FTHG'][-10:]*match_significance) - sum(Team2_AwayMatch['FTAG'][-10:]*match_significance)
   
    FTLG_nearest_diff = sum(Team1_AwayMatch['FTHG'][-10:]*match_significance)-sum(Team2_HomeMatch['FTAG'][-10:]*match_significance) 
    # разница пропущенных голов домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости
    
    S_nearest_diff = sum(Team1_HomeMatch['HS'][-10:]*match_significance)-sum(Team2_AwayMatch['AS'][-10:]*match_significance) 
    # разница ударов по воротам домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости
    
    ST_nearest_diff = sum(Team1_HomeMatch['HST'][-10:]*match_significance) - sum(Team2_AwayMatch['AST'][-10:]*match_significance) 
    # разница ударов в створ ворот домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости

    C_nearest_diff = sum(Team1_HomeMatch['HC'][-10:]*match_significance) -sum(Team2_AwayMatch['AC'][-10:]*match_significance) 
    # разница угловых домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости   

    Team1_HFTR_nearest = sum(Team1_HomeMatch['FTR'][-5:].replace(to_replace=["H", "D", "A"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 домашних матчей 
    Team2_AFTR_nearest = sum(Team2_AwayMatch['FTR'][-5:].replace(to_replace=["A", "D", "H"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 гостевых матчей
     
    Team1_HFTR_nearest2 = sum(Team1_HomeMatch['FTR'][-15:].replace(to_replace=["H", "D", "A"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 домашних матчей 
    Team2_AFTR_nearest2 = sum(Team2_AwayMatch['FTR'][-15:].replace(to_replace=["A", "D", "H"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 гостевых матчей
                     
    Team1_HFTR_nearest3 = sum(Team1_HomeMatch['FTR'][-30:].replace(to_replace=["H", "D", "A"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 домашних матчей 
    Team2_AFTR_nearest3 = sum(Team2_AwayMatch['FTR'][-30:].replace(to_replace=["A", "D", "H"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 гостевых матчей
                     
    Referee_HomeTeam_points = Referee_Match['FTR'][-10:].replace(to_replace=["H", "D", "A"], value=[1, 0, 0]).mean() # процент побед домашних команд в последних 10 матчах судьи 
    Referee_AwayTeam_points = Referee_Match['FTR'][-10:].replace(to_replace=["H", "D", "A"], value=[0, 0, 1]).mean() # процент ничейных результатов в последних 10 матчах судьи 
    Referee_Draw_points = Referee_Match['FTR'][-10:].replace(to_replace=["H", "D", "A"], value=[0, 1, 0]).mean()     # процент побед гостевых команд в последних 10 матчах судьи 

    return [Team1_FTHG_average, Team2_FTAG_average, FTG_nearest_diff, FTLG_nearest_diff, S_nearest_diff, ST_nearest_diff, 
            C_nearest_diff, Team1_HFTR_nearest, Team2_AFTR_nearest, Team1_HFTR_nearest2, Team2_AFTR_nearest2,
            Team1_HFTR_nearest3, Team2_AFTR_nearest3, Referee_HomeTeam_points, Referee_AwayTeam_points, Referee_Draw_points]

In [26]:
features2 = ["Team1_FTHG_average", "Team2_FTAG_average", "FTG_nearest_diff", "FTLG_nearest_diff", "S_nearest_diff", "ST_nearest_diff", 
             "C_nearest_diff", "Team1_HFTR_nearest", "Team2_AFTR_nearest", "Team1_HFTR_nearest2", "Team2_AFTR_nearest2",
             "Team1_HFTR_nearest3", "Team2_AFTR_nearest3", "Referee_HomeTeam_points", "Referee_AwayTeam_points",
             "Referee_Draw_points"]

In [27]:
Teams2 = train2['HomeTeam'].append(test2['HomeTeam']).unique() # Список всех команд за 8 лет в 2 английских дивизионах
Teams_HomeData2 = {Teams2[i]: start_seasonses2[start_seasonses2["HomeTeam"] == Teams2[i]] for i in range(len(Teams2))} # домашние матчи каждой прогназируемой команды
Teams_AwayData2 = {Teams2[i]: start_seasonses2[start_seasonses2["AwayTeam"] == Teams2[i]] for i in range(len(Teams2))} # гостевые матчи каждой прогназируемой команды

Referees2 = train2['Referee'].append(test2['Referee']).unique()
Referees_Data2 = {Referees2[i]: start_seasonses2[start_seasonses2["Referee"] == Referees2[i]] for i in range(len(Referees2))} # матчи каждого рефери


In [28]:
train_X2 = pd.DataFrame(columns = features2)
train_Y2 = []
test_X2 = pd.DataFrame(columns = features2)
test_Y2 = test2["FTR"]

# выборка на 7 сезонах для обучения

for i in range(len(train2)):
    row = train2.ix[i]
    Team1 = row["HomeTeam"]
    Team2 = row["AwayTeam"]
    Referee = row["Referee"]
    
    if (len(Teams_HomeData2[Team1]) >= 10) & (len(Teams_AwayData2[Team2]) >= 10) & (len(Referees_Data2[Referee]) >= 10):
    # если команда только влетела в каком-то сезоне, то на неё нет достаточно информации
        BK =  row[attribute_bk]
        feature = generate_statistics_features2(Teams_HomeData2[Team1], Teams_AwayData2[Team1], Teams_HomeData2[Team2], 
                                                Teams_AwayData2[Team2], Referees_Data2[Referee], BK)
        train_X2.loc[i] = feature
        train_Y2.append(train2["FTR"].loc[i])
        
    Teams_HomeData2[Team1] = Teams_HomeData2[Team1].append(row, ignore_index=True) #после прогноза матча, его исход нужно добавить в данные по прошедшим матчам
    Teams_AwayData2[Team2] = Teams_AwayData2[Team2].append(row, ignore_index=True)
    Referees_Data2[Referee] = Referees_Data2[Referee].append(row, ignore_index=True)

# выборка на 3 неполных сезонах для построения прогнозов

for i in range(len(test2)):
    row = test2.ix[i]
    Team1 = row["HomeTeam"]
    Team2 = row["AwayTeam"]
    Referee = row["Referee"]
    BK =  row[attribute_bk]
    
    feature = generate_statistics_features2(Teams_HomeData2[Team1], Teams_AwayData2[Team1], Teams_HomeData2[Team2], 
                                            Teams_AwayData2[Team2], Referees_Data2[Referee], BK)
    test_X2.loc[i] = feature
    Teams_HomeData2[Team1] = Teams_HomeData2[Team1].append(row, ignore_index=True) #после прогноза матча, его исход нужно добавить в данные по прошедшим матчам
    Teams_AwayData2[Team2] = Teams_AwayData2[Team2].append(row, ignore_index=True)
    Referees_Data2[Referee] = Referees_Data2[Referee].append(row, ignore_index=True)

In [29]:
test_X2 = test_X2.replace(to_replace="NaN", value=0) # это не повлияет на результаты, так как в фичах берётся максимум из всех БК

In [30]:
tuned_parameters = [{'min_samples_split': [1, 2], 'max_depth': [2, 3, 4], 'learning_rate': [0.02, 0.03, 0.04],
                     'min_samples_leaf':[25, 50, 75], 'n_estimators': [30, 50, 70]}]

GB_classifier2 = GridSearchCV(GradientBoostingClassifier(), tuned_parameters)
GB_classifier2.fit(train_X2, train_Y2)
print(GB_classifier2.best_estimator_)

GradientBoostingClassifier(init=None, learning_rate=0.03, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=25, min_samples_split=1,
              min_weight_fraction_leaf=0.0, n_estimators=30,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


In [31]:
GB_classifier2 = GradientBoostingClassifier(min_samples_split = 1, max_depth = 3, learning_rate = 0.03, 
                                            random_state = 42, min_samples_leaf = 25, n_estimators = 30)
GB_classifier2.fit(train_X2, train_Y2)

print("На тренировочной выборке: " + str(accuracy_score(train_Y2, GB_classifier2.predict(train_X2))))

proba_GB2 = GB_classifier2.predict_proba(test_X2)
prediction_GB2 = GB_classifier2.predict(test_X2)

result_GB2 = pd.DataFrame()
result_GB2[0] = proba_GB2[:,0]
result_GB2[1] = proba_GB2[:,1]
result_GB2[2] = proba_GB2[:,2]
result_GB2[3] = prediction_GB2
result_GB2[4] = test_Y2

result_GB2.columns = ["A", "D", "H", "prediction", "result"]

score_GB2 = accuracy_score(test_Y2, result_GB2["prediction"])
print("На тестовой выборке: " + str(score_GB2))

result_GB2[100:115]

На тренировочной выборке: 0.540964777948
На тестовой выборке: 0.524561403509


Unnamed: 0,A,D,H,prediction,result
100,0.268742,0.276071,0.455186,H,H
101,0.306154,0.348428,0.345419,D,A
102,0.246777,0.312169,0.441055,H,H
103,0.503614,0.256338,0.240049,A,D
104,0.227697,0.288311,0.483992,H,H
105,0.308813,0.295967,0.39522,H,D
106,0.326087,0.273301,0.400612,H,D
107,0.502237,0.262989,0.234773,A,H
108,0.262757,0.275821,0.461421,H,A
109,0.264117,0.297812,0.438071,H,A


In [113]:
pd.DataFrame([GB_classifier2.feature_importances_], columns = train_X2.columns)

Unnamed: 0,Team1_FTHG_average,Team2_FTAG_average,FTG_nearest_diff,FTLG_nearest_diff,S_nearest_diff,ST_nearest_diff,C_nearest_diff,Team1_HFTR_nearest,Team2_AFTR_nearest,Team1_HFTR_nearest2,Team2_AFTR_nearest2,Team1_HFTR_nearest3,Team2_AFTR_nearest3,Referee_HomeTeam_points,Referee_AwayTeam_points,Referee_Draw_points
0,0.053299,0.056274,0.025265,0.095068,0.080266,0.149117,0.028472,0.025056,0.008444,0.034373,0.002671,0.192861,0.208487,0.018238,0.013597,0.008513


Для проверки, в этой выборке параметр количества набранных очков за последние 10 игр был разбит на три: за 5, 15 и 30 игр. Оказалось, что параметр за 30 игр имеет большой вес, а предыдущие не имеют осбого смысла.

In [33]:
result_GB2["prediction"].value_counts()

H    838
A    294
D      8
Name: prediction, dtype: int64

In [111]:
confusion_matrix(test_Y2, result_GB2["prediction"])

array([[152,   1, 187],
       [ 71,   4, 209],
       [ 71,   3, 442]])

Классификатор в отрез отказывается предсказывать ничьи. Попробуем урезать выборку, заменить парные параметры на их разность(например смотреть не количество очков команд за последние 20 туров, а разницу между двумя командами), а также добавим параметры отвечающие за ставки букмекеров на исходы HomeTeamWin, Draw, AwayTeamWin

## Третья модель, на модернизированных статистических данных + коэффициентов БК

In [119]:
path = "C:/Users/RomanSivolobtsev/Documents/Статистика АПЛ/"
start_year = 2005
start_seasonses3 = pd.DataFrame() # 2 сезона для начальной выборки генерирования фич
train3 = pd.DataFrame() # 7 сезонов по которым будет обучаться модель 
test3 = pd.DataFrame() # 3 сезона для прогнозов

for i in range(2):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    E0 = pd.read_csv(folder + 'E0.csv', index_col=False)
    E1 = pd.read_csv(folder + 'E1.csv', index_col=False)
    E2 = pd.read_csv(folder + 'E2.csv', index_col=False)
    E3 = pd.read_csv(folder + 'E3.csv', index_col=False)
    start_seasonses3 = start_seasonses3.append([E0, E1, E2, E3], ignore_index=True)

for i in range(2, 9):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    E0 = pd.read_csv(folder + 'E0.csv', index_col=False)
    train3 = train3.append([E0], ignore_index=True)
    
for i in range(9, 12):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    E0 = pd.read_csv(folder + 'E0.csv', index_col=False) # прогноз ведём только по командам из АПЛ
    test3 = test3.append(E0, ignore_index=True)
    
train3 = train3[pd.notnull(train3['Div'])]
test3 = test3[pd.notnull(test3['Div'])] # некоторые файлы в конце импортируют пустую строчку, эта для их отлова.

In [120]:
test3.index = [x for x in range(len(test3))]
train3.index = [x for x in range(len(train3))] # перебивка индексов, чтобы не было проблем

In [121]:
# Team1 - Home Team, Team2 - Away Team

def generate_mix_features(Team1_HomeMatch, Team1_AwayMatch, Team2_HomeMatch, Team2_AwayMatch, Referee_Match, BK):
    match_significance = np.array([1, 0.7, 0.5, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05])
    match_significance = match_significance/sum(match_significance) # нормировка коэффициентов
    
    Team1_FTHG_average = Team1_HomeMatch['FTHG'].mean() # среднее количество забитых голов домашней командны в её домашних матчах
    Team2_FTAG_average = Team2_AwayMatch['FTAG'].mean() # среднее количество забитых голов гостевой команды в её гостевых матчах 
    FTG_nearest_diff = sum(Team1_HomeMatch['FTHG'][-10:]*match_significance) - sum(Team2_AwayMatch['FTAG'][-10:]*match_significance)
   
    FTLG_nearest_diff = sum(Team1_AwayMatch['FTHG'][-10:]*match_significance)-sum(Team2_HomeMatch['FTAG'][-10:]*match_significance) 
    # разница пропущенных голов домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости
    
    S_nearest_diff = sum(Team1_HomeMatch['HS'][-10:]*match_significance)-sum(Team2_AwayMatch['AS'][-10:]*match_significance) 
    # разница ударов по воротам домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости
    
    ST_nearest_diff = sum(Team1_HomeMatch['HST'][-10:]*match_significance) - sum(Team2_AwayMatch['AST'][-10:]*match_significance) 
    # разница ударов в створ ворот домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости

    C_nearest_diff = sum(Team1_HomeMatch['HC'][-10:]*match_significance) -sum(Team2_AwayMatch['AC'][-10:]*match_significance) 
    # разница угловых домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости   

    Team1_HFTR_nearest = sum(Team1_HomeMatch['FTR'][-30:].replace(to_replace=["H", "D", "A"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 домашних матчей 
    Team2_AFTR_nearest = sum(Team2_AwayMatch['FTR'][-30:].replace(to_replace=["A", "D", "H"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 гостевых матчей
                    
    Referee_HomeTeam_points = Referee_Match['FTR'][-10:].replace(to_replace=["H", "D", "A"], value=[1, 0, 0]).mean() # процент побед домашних команд в последних 10 матчах судьи 
    Referee_AwayTeam_points = Referee_Match['FTR'][-10:].replace(to_replace=["H", "D", "A"], value=[0, 0, 1]).mean() # процент ничейных результатов в последних 10 матчах судьи 
    Referee_Draw_points = Referee_Match['FTR'][-10:].replace(to_replace=["H", "D", "A"], value=[0, 1, 0]).mean()     # процент побед гостевых команд в последних 10 матчах судьи 
    
    BK_Teams_diff = max(BK[["B365H", "BWH", "IWH", "LBH", "WHH", "VCH"]]) - max(BK[["B365A", "BWA", "IWA", "LBA", "WHA", "VCA"]])
    # разница коэффициентов на победу домашней и гостевой команд
    BK_Draw = max(BK[["B365D", "BWD", "IWD", "LBD", "WHD", "VCD"]]) # лучший коэффициент на ничью

    return [Team1_FTHG_average, Team2_FTAG_average, FTG_nearest_diff, FTLG_nearest_diff, S_nearest_diff, ST_nearest_diff, 
            C_nearest_diff, Team1_HFTR_nearest, Team2_AFTR_nearest, Referee_HomeTeam_points, Referee_AwayTeam_points,
            Referee_Draw_points, BK_Teams_diff, BK_Draw]

In [122]:
features3 = ["Team1_FTHG_average", "Team2_FTAG_average", "FTG_nearest_diff", "FTLG_nearest_diff", "S_nearest_diff", "ST_nearest_diff", 
             "C_nearest_diff", "Team1_HFTR_nearest", "Team2_AFTR_nearest", "Referee_HomeTeam_points",
             "Referee_AwayTeam_points", "Referee_Draw_points", "BK_Teams_diff", "BK_Draw"]

In [123]:
Teams3 = train3['HomeTeam'].append(test3['HomeTeam']).unique() # Список всех команд за 8 лет в 2 английских дивизионах
Teams_HomeData3 = {Teams3[i]: start_seasonses3[start_seasonses3["HomeTeam"] == Teams3[i]] for i in range(len(Teams3))} # домашние матчи каждой прогназируемой команды
Teams_AwayData3 = {Teams3[i]: start_seasonses3[start_seasonses3["AwayTeam"] == Teams3[i]] for i in range(len(Teams3))} # гостевые матчи каждой прогназируемой команды

Referees3 = train3['Referee'].append(test3['Referee']).unique()
Referees_Data3 = {Referees3[i]: start_seasonses3[start_seasonses3["Referee"] == Referees3[i]] for i in range(len(Referees3))} # матчи каждого рефери


In [124]:
train_X3 = pd.DataFrame(columns = features3)
train_Y3 = []
test_X3 = pd.DataFrame(columns = features3)
test_Y3 = test3["FTR"]

# выборка на 7 сезонах для обучения

for i in range(len(train3)):
    row = train3.ix[i]
    Team1 = row["HomeTeam"]
    Team2 = row["AwayTeam"]
    Referee = row["Referee"]
    
    if (len(Teams_HomeData3[Team1]) >= 10) & (len(Teams_AwayData3[Team2]) >= 10) & (len(Referees_Data3[Referee]) >= 10):
    # если команда только влетела в каком-то сезоне, то на неё нет достаточно информации
        BK =  row[attribute_bk]
        feature = generate_mix_features(Teams_HomeData3[Team1], Teams_AwayData3[Team1], Teams_HomeData3[Team2], 
                                        Teams_AwayData3[Team2], Referees_Data3[Referee], BK)
        train_X3.loc[i] = feature
        train_Y3.append(train3["FTR"].loc[i])
        
    Teams_HomeData3[Team1] = Teams_HomeData3[Team1].append(row, ignore_index=True) #после прогноза матча, его исход нужно добавить в данные по прошедшим матчам
    Teams_AwayData3[Team2] = Teams_AwayData3[Team2].append(row, ignore_index=True)
    Referees_Data3[Referee] = Referees_Data3[Referee].append(row, ignore_index=True)

# выборка на 3 неполных сезонах для построения прогнозов

for i in range(len(test3)):
    row = test3.ix[i]
    Team1 = row["HomeTeam"]
    Team2 = row["AwayTeam"]
    Referee = row["Referee"]
    BK =  row[attribute_bk]
    
    feature = generate_mix_features(Teams_HomeData3[Team1], Teams_AwayData3[Team1], Teams_HomeData3[Team2], 
                                    Teams_AwayData3[Team2], Referees_Data3[Referee], BK)
    test_X3.loc[i] = feature
    Teams_HomeData3[Team1] = Teams_HomeData3[Team1].append(row, ignore_index=True) #после прогноза матча, его исход нужно добавить в данные по прошедшим матчам
    Teams_AwayData3[Team2] = Teams_AwayData3[Team2].append(row, ignore_index=True)
    Referees_Data3[Referee] = Referees_Data3[Referee].append(row, ignore_index=True)

In [125]:
tuned_parameters = [{'min_samples_split': [1, 2], 'max_depth': [2, 3, 4], 'learning_rate': [0.02, 0.03, 0.04],
                     'min_samples_leaf':[25, 50, 75], 'n_estimators': [30, 50, 70], "subsample": [1, 0.9]}]

GB_classifier3 = GridSearchCV(GradientBoostingClassifier(), tuned_parameters)
GB_classifier3.fit(train_X3, train_Y3)
print(GB_classifier3.best_estimator_)

GradientBoostingClassifier(init=None, learning_rate=0.02, loss='deviance',
              max_depth=2, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=75, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=0.9, verbose=0,
              warm_start=False)


In [126]:
test_X3 = test_X3.replace(to_replace="NaN", value=0) # это не повлияет на результаты, так как в фичах берётся максимум из всех БК

In [142]:
GB_classifier3 = GradientBoostingClassifier(min_samples_split = 2, max_depth = 4, learning_rate = 0.03, min_samples_leaf = 50,
                                            subsample = 0.9, n_estimators = 50, random_state = 42)
GB_classifier3.fit(train_X3, train_Y3)

print("На тренировочной выборке: " + str(accuracy_score(train_Y3, GB_classifier3.predict(train_X3))))

proba_GB3 = GB_classifier3.predict_proba(test_X3)
prediction_GB3 = GB_classifier3.predict(test_X3)

result_GB3 = pd.DataFrame()
result_GB3[0] = proba_GB3[:,0]
result_GB3[1] = proba_GB3[:,1]
result_GB3[2] = proba_GB3[:,2]
result_GB3[3] = prediction_GB3
result_GB3[4] = test_Y3

result_GB3.columns = ["A", "D", "H", "prediction", "result"]

score_GB3 = accuracy_score(test_Y3, result_GB3["prediction"])
print("На тестовой выборке: " + str(score_GB3))

result_GB3[100:115]

На тренировочной выборке: 0.575038284839
На тестовой выборке: 0.538596491228


Unnamed: 0,A,D,H,prediction,result
100,0.318807,0.299124,0.382069,H,H
101,0.363918,0.303153,0.332928,A,A
102,0.153891,0.225776,0.620333,H,H
103,0.682404,0.144765,0.172832,A,D
104,0.124126,0.239297,0.636577,H,H
105,0.199224,0.25712,0.543656,H,D
106,0.507147,0.25941,0.233442,A,D
107,0.570107,0.226971,0.202921,A,H
108,0.204749,0.255847,0.539405,H,A
109,0.256274,0.301876,0.44185,H,A


In [143]:
pd.DataFrame([GB_classifier3.feature_importances_], columns = train_X3.columns)

Unnamed: 0,Team1_FTHG_average,Team2_FTAG_average,FTG_nearest_diff,FTLG_nearest_diff,S_nearest_diff,ST_nearest_diff,C_nearest_diff,Team1_HFTR_nearest,Team2_AFTR_nearest,Referee_HomeTeam_points,Referee_AwayTeam_points,Referee_Draw_points,BK_Teams_diff,BK_Draw
0,0.07411,0.061163,0.027631,0.051685,0.063498,0.101564,0.046001,0.034113,0.014436,0.015487,0.020264,0.01207,0.406308,0.07167


40% зависит от катировок букмекеров.

In [45]:
result_GB3["prediction"].value_counts()

H    796
A    335
D      9
Name: prediction, dtype: int64

In [46]:
confusion_matrix(test_Y3, result_GB3["prediction"])

array([[175,   5, 160],
       [ 77,   1, 206],
       [ 83,   3, 430]])

Чуть больше половины мы угадываем, примерно в 24% случаях наши результаты уходят на каждый из оставшихся исходов. Ничьи никак не получается прогнозировать. Поэтому попробуем сделать допущение - "Если модель выдаёт близкие вероятности на победы одной и второй команды, то говорим что они сыграют в ничью".

In [47]:
print("До: " + str(accuracy_score(train_Y3, GB_classifier3.predict(train_X3))))

proba_GB3_fix = GB_classifier3.predict_proba(train_X3)
prediction_GB3_fix = GB_classifier3.predict(train_X3)

result_GB3_fix = pd.DataFrame()
result_GB3_fix[0] = proba_GB3_fix[:,0]
result_GB3_fix[1] = proba_GB3_fix[:,1]
result_GB3_fix[2] = proba_GB3_fix[:,2]

# Если команды почти равны по силе, то ставим на ничью
for i in range(len(result_GB3_fix)):
    if abs(result_GB3_fix.ix[i][0]-result_GB3_fix.ix[i][2]) < 0.03:
        prediction_GB3_fix[i] = "D"

result_GB3_fix[3] = prediction_GB3_fix
result_GB3_fix[4] = train_Y3

result_GB3_fix.columns = ["A", "D", "H", "prediction", "result"]

print("После: " + str(accuracy_score(result_GB3_fix["prediction"], result_GB3_fix["result"])))
confusion_matrix(train_Y3, result_GB3_fix["prediction"])

До: 0.56010719755
После: 0.560872894334


array([[ 355,   38,  342],
       [ 155,   52,  458],
       [ 113,   41, 1058]])

С одной стороны процент правильно спрогнозированных матчей вырос. Но с другой стороны [32, 38, 37] говорит о том что ставя в таких случаях на ничью процент правильных прогнозов становититься равным 52/(38+52+41) или примерно 40%. Но, увеличивая порог для ничейных результатов, получаем результаты ничьих сходящиеся к 33.3%

# Логистическая регрессия

## На первом наборе данных

In [48]:
tuned_parameters = [{'C': [0.4, 0.6, 0.8, 1, 1.5], 'tol': [0.0001, 0.0002, 0.0003], 'max_iter': [50, 100, 150],
                     'solver': ['newton-cg', 'lbfgs', 'liblinear']}]

GB_classifier = GridSearchCV(LogisticRegression(), tuned_parameters)
GB_classifier.fit(train_X, train_Y)
print(GB_classifier.best_estimator_)

LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)


In [247]:
# Логистическая регрессия на статистических данных

LR_classifier = LogisticRegression(random_state = 42, C=0.4, tol=0.0001, max_iter=50, solver='lbfgs')
LR_classifier.fit(train_X, train_Y)

print("На тренировочной выборке со статистическими данными: " + str(accuracy_score(train_Y, LR_classifier.predict(train_X))))

proba_LR = LR_classifier.predict_proba(test_X)
prediction_LR = LR_classifier.predict(test_X)

result_LR = pd.DataFrame()
result_LR[0] = proba_LR[:,0]
result_LR[1] = proba_LR[:,1]
result_LR[2] = proba_LR[:,2]
result_LR[3] = prediction_LR
result_LR[4] = test_Y

result_LR.columns = ["A", "D", "H", "prediction", "result"]


score_LR = accuracy_score(test_Y, result_LR["prediction"])
print("На тестовой выборке со статистическими данными: " + str(score_LR))

confusion_matrix(test_Y, result_LR["prediction"])

На тренировочной выборке со статистическими данными: 0.456581183948
На тестовой выборке со статистическими данными: 0.514912280702


array([[189,   1, 150],
       [ 98,   0, 186],
       [118,   0, 398]])

In [250]:
print("До: " + str(accuracy_score(test_Y, result_LR["prediction"])))
proba_LR_fix = LR_classifier.predict_proba(test_X)
prediction_LR_fix = LR_classifier.predict(test_X)

result_LR_fix = pd.DataFrame()
result_LR_fix[0] = proba_LR_fix[:,0]
result_LR_fix[1] = proba_LR_fix[:,1]
result_LR_fix[2] = proba_LR_fix[:,2]

# Если команды почти равны по силе, то ставим на ничью
for i in range(len(result_LR_fix)):
    if abs(result_LR_fix.ix[i][0]-result_LR_fix.ix[i][2]) < 0.03:
        prediction_LR_fix[i] = "D"
        
result_LR_fix[3] = prediction_LR_fix
result_LR_fix[4] = test_Y

result_LR_fix.columns = ["A", "D", "H", "prediction", "result"]

score_LR_fix = accuracy_score(test_Y, result_LR_fix["prediction"])
print("После: " + str(score_LR_fix))

confusion_matrix(test_Y, result_LR_fix["prediction"])

До: 0.514912280702
После: 0.514912280702


array([[173,  32, 135],
       [ 79,  37, 168],
       [ 98,  41, 377]])

## На втором наборе данных

In [51]:
tuned_parameters = [{'C': [0.4, 0.6, 0.8, 1, 1.5], 'tol': [0.0001, 0.0002, 0.0003], 'max_iter': [50, 100, 150],
                     'solver': ['newton-cg', 'lbfgs', 'liblinear']}]

LR_classifier2 = GridSearchCV(LogisticRegression(), tuned_parameters)
LR_classifier2.fit(train_X2, train_Y2)
print(LR_classifier2.best_estimator_)

LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)


In [228]:
print("До: " + str(accuracy_score(test_Y2, result_LR2["prediction"])))
proba_LR2_fix = LR_classifier2.predict_proba(test_X2)
prediction_LR2_fix = LR_classifier2.predict(test_X2)

result_LR2_fix = pd.DataFrame()
result_LR2_fix[0] = proba_LR2_fix[:,0]
result_LR2_fix[1] = proba_LR2_fix[:,1]
result_LR2_fix[2] = proba_LR2_fix[:,2]

# Если команды почти равны по силе, то ставим на ничью
for i in range(len(result_LR2_fix)):
    if abs(result_LR2_fix.ix[i][0]-result_LR2_fix.ix[i][2]) < 0.03:
        prediction_LR2_fix[i] = "D"
        
result_LR2_fix[3] = prediction_LR2_fix
result_LR2_fix[4] = test_Y2

result_LR2_fix.columns = ["A", "D", "H", "prediction", "result"]

score_LR2_fix = accuracy_score(test_Y2, result_LR2_fix["prediction"])
print("После: " + str(score_LR2_fix))

confusion_matrix(test_Y2, result_LR2_fix["prediction"])

До: 0.521929824561
После: 0.531578947368


array([[180,  18, 142],
       [ 89,  28, 167],
       [105,  13, 398]])

In [227]:
LR_classifier2 = LogisticRegression(random_state = 42, C=1.5, tol=0.0001, max_iter=50, solver='liblinear')
LR_classifier2.fit(train_X2, train_Y2)

print("На тренировочной выборке со статистическими данными: " + str(accuracy_score(train_Y2, LR_classifier2.predict(train_X2))))

proba_LR2 = LR_classifier2.predict_proba(test_X2)
prediction_LR2 = LR_classifier2.predict(test_X2)

result_LR2 = pd.DataFrame()
result_LR2[0] = proba_LR2[:,0]
result_LR2[1] = proba_LR2[:,1]
result_LR2[2] = proba_LR2[:,2]
result_LR2[3] = prediction_LR2
result_LR2[4] = test_Y2

result_LR2.columns = ["A", "D", "H", "prediction", "result"]

score_LR2 = accuracy_score(test_Y2, result_LR2["prediction"])
print("На тестовой выборке со статистическими данными: " + str(score_LR2))

confusion_matrix(test_Y2, result_LR2["prediction"])

На тренировочной выборке со статистическими данными: 0.528713629403
На тестовой выборке со статистическими данными: 0.521929824561


array([[189,   1, 150],
       [100,   2, 182],
       [110,   2, 404]])

## На третьем наборе данных

In [54]:
tuned_parameters = [{'C': [0.4, 0.6, 0.8, 1, 1.5], 'tol': [0.0001, 0.0002, 0.0003], 'max_iter': [50, 100, 150],
                     'solver': ['newton-cg', 'lbfgs', 'liblinear']}]

LR_classifier3 = GridSearchCV(LogisticRegression(), tuned_parameters)
LR_classifier3.fit(train_X3, train_Y3)
print(LR_classifier3.best_estimator_)

LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [235]:
LR_classifier3 = LogisticRegression(random_state = 42, C=0.4, tol=0.0001, max_iter=50, solver='liblinear')
LR_classifier3.fit(train_X3, train_Y3)

print("На тренировочной выборке со статистическими данными: " + str(accuracy_score(train_Y3, LR_classifier3.predict(train_X3))))

proba_LR3 = LR_classifier3.predict_proba(test_X3)
prediction_LR3 = LR_classifier3.predict(test_X3)

result_LR3 = pd.DataFrame()
result_LR3[0] = proba_LR3[:,0]
result_LR3[1] = proba_LR3[:,1]
result_LR3[2] = proba_LR3[:,2]
result_LR3[3] = prediction_LR3
result_LR3[4] = test_Y3

result_LR3.columns = ["A", "D", "H", "prediction", "result"]

score_LR3 = accuracy_score(test_Y3, result_LR3["prediction"])
print("На тестовой выборке со статистическими данными: " + str(score_LR3))

confusion_matrix(test_Y3, result_LR3["prediction"])

На тренировочной выборке со статистическими данными: 0.545941807044
На тестовой выборке со статистическими данными: 0.535087719298


array([[175,   4, 161],
       [ 79,   4, 201],
       [ 82,   3, 431]])

In [243]:
print("До: " + str(accuracy_score(test_Y3, result_LR3["prediction"])))
proba_LR3_fix = LR_classifier3.predict_proba(test_X3)
prediction_LR3_fix = LR_classifier3.predict(test_X3)

result_LR3_fix = pd.DataFrame()
result_LR3_fix[0] = proba_LR3_fix[:,0]
result_LR3_fix[1] = proba_LR3_fix[:,1]
result_LR3_fix[2] = proba_LR3_fix[:,2]

# Если команды почти равны по силе, то ставим на ничью
for i in range(len(result_LR3_fix)):
    if abs(result_LR3_fix.ix[i][0]-result_LR3_fix.ix[i][2]) < 0.02:
        prediction_LR3_fix[i] = "D"
        
result_LR3_fix[3] = prediction_LR3_fix
result_LR3_fix[4] = test_Y3

result_LR3_fix.columns = ["A", "D", "H", "prediction", "result"]

score_LR3_fix = accuracy_score(test_Y3, result_LR3_fix["prediction"])
print("После: " + str(score_LR3_fix))

confusion_matrix(test_Y3, result_LR3_fix["prediction"])

До: 0.535087719298
После: 0.535964912281


array([[171,  18, 151],
       [ 74,  18, 192],
       [ 77,  17, 422]])

# Промежуточные результаты

Как понять, насколько хороши эти прогнозы? Для этого посмотрим чтобы было, если бы мы всегда ставили на фаворитов по букмекерским коэффициентам.

In [254]:
favorite=[]
BK_Team1 = test["IWH"]
BK_Team2 = test["IWA"]

for i in range(len(BK_Team1)):
    if BK_Team1.loc[i] - BK_Team2.loc[i] < 0:
        favorite.append("H")
    elif BK_Team1.loc[i] - BK_Team2.loc[i] > 0:
        favorite.append("A")
    else:
        favorite.append("D")

score_favorite = accuracy_score(favorite, test_Y)
print(score_favorite)

confusion_matrix(test_Y, favorite)

0.534210526316


array([[175,  18, 147],
       [ 76,  14, 194],
       [ 82,  14, 420]])

In [272]:
favorite2=[]
BK_Team1 = test2_HA["B365H"]
BK_Team2 = test2_HA["B365A"]

for i in range(len(BK_Team1)):
    if BK_Team1.loc[i] - BK_Team2.loc[i] < 0:
        favorite2.append("H")
    else:
        favorite2.append("A")

        
print(accuracy_score(favorite2, test_Y2_HA))

confusion_matrix(favorite2, test_Y2_HA)

0.716121495327


array([[189,  92],
       [151, 424]])

In [267]:
favorite2

['H',
 'H',
 'H',
 'H',
 'A',
 'H',
 'A',
 'A',
 'H',
 'H',
 'H',
 'H',
 'H',
 'A',
 'H',
 'H',
 'H',
 'A',
 'H',
 'A',
 'H',
 'H',
 'H',
 'H',
 'A',
 'H',
 'A',
 'H',
 'A',
 'H',
 'A',
 'H',
 'H',
 'H',
 'A',
 'H',
 'H',
 'H',
 'H',
 'A',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'A',
 'A',
 'H',
 'H',
 'H',
 'H',
 'A',
 'H',
 'H',
 'A',
 'H',
 'A',
 'A',
 'H',
 'H',
 'H',
 'H',
 'A',
 'H',
 'A',
 'A',
 'H',
 'H',
 'H',
 'A',
 'H',
 'H',
 'A',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'A',
 'A',
 'H',
 'H',
 'H',
 'A',
 'H',
 'A',
 'H',
 'H',
 'A',
 'H',
 'H',
 'H',
 'H',
 'H',
 'A',
 'H',
 'A',
 'H',
 'A',
 'H',
 'H',
 'A',
 'H',
 'A',
 'H',
 'A',
 'H',
 'H',
 'H',
 'H',
 'A',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'A',
 'H',
 'A',
 'H',
 'A',
 'H',
 'A',
 'H',
 'H',
 'H',
 'A',
 'H',
 'A',
 'H',
 'A',
 'H',
 'A',
 'H',
 'A',
 'A',
 'H',
 'H',
 'A',
 'H',
 'A',
 'H',
 'H',
 'H',
 'A',
 'H',
 'H',
 'A',
 'A',
 'A',
 'H',
 'H',
 'H',
 'A',
 'H',
 'H',
 'H',
 'H',
 'H',
 'A',
 'H'

Модели построеные только на статистических данных, имеет чуть меньший результат угадываний, чем способ ставить всё время на фаворитов. Модели построенная на БК коэффициентах с добавление шума, в виде статистических данных и поднятием ничейных исходов, имеет схожий процент верно угаданных исходов. Однако, этот шум может отклонить результаты и в любую сторону. Раз такая беда с ничьими, порбоуем немного другую модель.

# Победа гостевой или домашней команды. HA

Так как модель ни в какую не прогнозирует ничьи, то поступим так. Попробуем прогнозировать только победу домашней и гостевой команд. В таких случаях в БК существует отдельное событие "Результат, не включая ничью". Раньше было 3 исхода и 3 коэффициента на них, теперь остаётся теже 3 исхода, но 2 коэффициента (на победу гостевой и домашней команд). В случаи ничьи деньги возвращаются обратно. Классификаторы будут теже, изменяться только вероятности на исходы. Их изначально нету, но если ввести предположение, что маржа букмекера на эти события одинакова, то из первых вероятностей можно получить вторые.

In [326]:
def reformate_coef(H_coef, D_coef, A_coef):
    H_prob = 1/H_coef
    D_prob = 1/D_coef
    A_prob = 1/A_coef
    margin = (H_prob + D_prob + A_prob)
    H_prob_new = margin * H_prob/(H_prob + A_prob)
    A_prob_new = margin * A_prob/(H_prob + A_prob)
    
    return [1/H_prob_new, 1, 1/A_prob_new]

In [258]:
T1 = test.ix[0]["B365H"]
T2 = test.ix[0]["B365A"]
TX = test.ix[0]["B365D"]
M = round(((1/T1 + 1/T2 + 1/TX)-1)*100, 4)

print("Обычные коэффициенты: " + "[" + str(T1) + ", " + str(TX) + ", " + str(T2) + "];")
print("Маржа букмекера: " + str(M) + "%;")
[T1_new, TX_new, T2_new] = reformate_coef(T1, T2, TX)
M_new = round(((1/T1_new + 1/T2_new)-1)*100,4) 
      
print("Новые коэффициенты: " + "[" + str(round(T1_new, 4)) + ", " + str(round(T2_new, 4)) + "];")
print("Новая маржа: " + str(M) + "%;")

Обычные коэффициенты: [1.25, 6.5, 15.0];
Маржа букмекера: 2.0513%;
0.8 0.0666666666667 0.153846153846
0.855913978495 0.164598842018
Новые коэффициенты: [1.1683, 6.0754];
Новая маржа: 2.0513%;


In [60]:
# переформировываем обучающую выборку, убирая от туда ничейные результаты.

def reformate_data(X, Y):
    data = X
    data["result"] = Y
    data = data[(data["result"]=="A") | (data["result"]=="H")]
    data.index = [x for x in range(len(data))] # перебивка индексов

    Y_new = data["result"]
    X_new = data
    del X_new["result"]
    
    return [X_new, Y_new]

[train_X2_HA, train_Y2_HA] = reformate_data(train_X2, train_Y2)
[train_X3_HA, train_Y3_HA] = reformate_data(train_X3, train_Y3)

# Так как в тестовой выборке при ничейном исходе нам просто вернут деньги, то эти случаи можно не учитывать в прогнозе
[test_X2_HA, test_Y2_HA] = reformate_data(test_X2, test_Y2)
[test_X3_HA, test_Y3_HA] = reformate_data(test_X3, test_Y3)

## Прогноз HA, на данных со вторым набором фич

Первый набор данных показал плохие прогнозы, как в логистической регрессии, так и в градиентном бустинге. Поэтому Прогнозы на новое событие строим по 2 наборам данных.

In [61]:
tuned_parameters = [{'min_samples_split': [1, 2], 'max_depth': [3, 4], 'learning_rate': [0.03, 0.04],
                     'min_samples_leaf':[30, 50, 70], 'n_estimators': [30, 50, 70], "subsample": [1, 0.9]}]

GB_classifier2_HA = GridSearchCV(GradientBoostingClassifier(), tuned_parameters)
GB_classifier2_HA.fit(train_X2_HA, train_Y2_HA)
print(GB_classifier2_HA.best_estimator_)

GradientBoostingClassifier(init=None, learning_rate=0.04, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=30, min_samples_split=1,
              min_weight_fraction_leaf=0.0, n_estimators=30,
              presort='auto', random_state=None, subsample=1, verbose=0,
              warm_start=False)


In [282]:
GB_classifier2_HA = GradientBoostingClassifier(min_samples_split = 2, max_depth = 4, learning_rate = 0.03, 
                                               min_samples_leaf = 50, n_estimators = 70, subsample=0.9, random_state=42)
GB_classifier2_HA.fit(train_X2_HA, train_Y2_HA)

print("На тренировочной выборке: " + str(accuracy_score(train_Y2_HA, GB_classifier2_HA.predict(train_X2_HA))))

proba_GB2_HA = GB_classifier2_HA.predict_proba(test_X2_HA)
prediction_GB2_HA = GB_classifier2_HA.predict(test_X2_HA)

result_GB2_HA = pd.DataFrame()
result_GB2_HA[0] = proba_GB2_HA[:,0]
result_GB2_HA[1] = proba_GB2_HA[:,1]
result_GB2_HA[2] = prediction_GB2_HA
result_GB2_HA[3] = test_Y2_HA

result_GB2_HA.columns = ["A", "H", "prediction", "result"]

score_GB2_HA = accuracy_score(test_Y2_HA, result_GB2_HA["prediction"])
print("На тестовой выборке: " + str(score_GB2_HA))

print(confusion_matrix(result_GB2_HA["result"], result_GB2_HA["prediction"]))

На тренировочной выборке: 0.744221879815
На тестовой выборке: 0.704439252336
[[170 170]
 [ 83 433]]


In [63]:
tuned_parameters = [{'C': [0.4, 0.6, 0.8, 1, 1.5], 'tol': [0.0001, 0.0002, 0.0003], 'max_iter': [50, 100, 150],
                     'solver': ['newton-cg', 'lbfgs', 'liblinear']}]

LR_classifier2_HA = GridSearchCV(LogisticRegression(), tuned_parameters)
LR_classifier2_HA.fit(train_X2_HA, train_Y2_HA)
print(LR_classifier2_HA.best_estimator_)

LogisticRegression(C=0.6, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)


In [287]:
LR_classifier2_HA = LogisticRegression(random_state = 42, C=0.8, tol=0.0001, max_iter=50, solver='lbfgs')
LR_classifier2_HA.fit(train_X2_HA, train_Y2_HA)

print("На тренировочной выборкe: " + str(accuracy_score(train_Y2_HA, LR_classifier2_HA.predict(train_X2_HA))))

proba_LR2_HA = LR_classifier2_HA.predict_proba(test_X2_HA)
prediction_LR2_HA = LR_classifier2_HA.predict(test_X2_HA)

result_LR2_HA = pd.DataFrame()
result_LR2_HA[0] = proba_LR2_HA[:,0]
result_LR2_HA[1] = proba_LR2_HA[:,1]
result_LR2_HA[2] = prediction_LR2_HA
result_LR2_HA[3] = test_Y2_HA

result_LR2_HA.columns = ["A", "H", "prediction", "result"]

score_LR2_HA = accuracy_score(test_Y2_HA, result_LR2_HA["prediction"])
print("На тестовой выборке: " + str(score_LR2_HA))

confusion_matrix(test_Y2_HA, result_LR2_HA["prediction"])

На тренировочной выборкe: 0.713405238829
На тестовой выборке: 0.69976635514


array([[191, 149],
       [108, 408]])

По двум исходам прогноз, даже на статистических данных, заметно вырос. Попробуем построить на третьем наборе фич.

## Прогноз HA, на данных с третьим набором фич

In [65]:
tuned_parameters = [{'min_samples_split': [1, 2], 'max_depth': [2, 3, 4], 'learning_rate': [0.03, 0.04],
                     'min_samples_leaf':[30, 50, 70], 'n_estimators': [30, 50, 70, 90], "subsample": [1, 0.9]}]

GB_classifier3_HA = GridSearchCV(GradientBoostingClassifier(), tuned_parameters)
GB_classifier3_HA.fit(train_X3_HA, train_Y3_HA)
print(GB_classifier3_HA.best_estimator_)

GradientBoostingClassifier(init=None, learning_rate=0.04, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=70, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=30,
              presort='auto', random_state=None, subsample=0.9, verbose=0,
              warm_start=False)


In [446]:
GB_classifier3_HA = GradientBoostingClassifier(min_samples_split = 1, max_depth = 3, learning_rate = 0.04, 
                                               min_samples_leaf = 30, n_estimators = 50, subsample=0.9, random_state=42)
GB_classifier3_HA.fit(train_X3_HA, train_Y3_HA)

print("На тренировочной выборке: " + str(accuracy_score(train_Y3_HA, GB_classifier3_HA.predict(train_X3_HA))))

proba_GB3_HA = GB_classifier3_HA.predict_proba(test_X3_HA)
prediction_GB3_HA = GB_classifier3_HA.predict(test_X3_HA)

result_GB3_HA = pd.DataFrame()
result_GB3_HA[0] = proba_GB3_HA[:,0]
result_GB3_HA[1] = proba_GB3_HA[:,1]
result_GB3_HA[2] = prediction_GB3_HA
result_GB3_HA[3] = test_Y3_HA

result_GB3_HA.columns = ["A", "H", "prediction", "result"]

score_GB3_HA = accuracy_score(test_Y3_HA, result_GB3_HA["prediction"])
print("На тестовой выборке: " + str(score_GB3_HA))

print(confusion_matrix(result_GB3_HA["result"], result_GB3_HA["prediction"]))

На тренировочной выборке: 0.752439650745
На тестовой выборке: 0.716121495327
[[185 155]
 [ 88 428]]


In [67]:
tuned_parameters = [{'C': [0.4, 0.6, 0.8, 1, 1.5], 'tol': [0.0001, 0.0002, 0.0003], 'max_iter': [50, 100, 150],
                     'solver': ['newton-cg', 'lbfgs', 'liblinear']}]

LR_classifier3_HA = GridSearchCV(LogisticRegression(), tuned_parameters)
LR_classifier3_HA.fit(train_X3_HA, train_Y3_HA)
print(LR_classifier3_HA.best_estimator_)

LogisticRegression(C=0.8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)


In [321]:
LR_classifier3_HA = LogisticRegression(random_state = 42, C=0.6, tol=0.0001, max_iter=50, solver='lbfgs')
LR_classifier3_HA.fit(train_X3_HA, train_Y3_HA)

print("На тренировочной выборке: " + str(accuracy_score(train_Y3_HA, LR_classifier3_HA.predict(train_X3_HA))))

proba_LR3_HA = LR_classifier3_HA.predict_proba(test_X3_HA)
prediction_LR3_HA = LR_classifier3_HA.predict(test_X3_HA)

result_LR3_HA = pd.DataFrame()
result_LR3_HA[0] = proba_LR3_HA[:,0]
result_LR3_HA[1] = proba_LR3_HA[:,1]
result_LR3_HA[2] = prediction_LR3_HA
result_LR3_HA[3] = test_Y3_HA

result_LR3_HA.columns = ["A", "H", "prediction", "result"]

score_LR3_HA = accuracy_score(test_Y3_HA, result_LR3_HA["prediction"])
print("На тестовой выборке: " + str(score_LR3_HA))

confusion_matrix(test_Y3_HA, result_LR3_HA["prediction"])

На тренировочной выборке: 0.735490498202
На тестовой выборке: 0.709112149533


array([[168, 172],
       [ 77, 439]])

# А что если делать ставки на эти прогнозы?

Теперь спрогнозируем ситуацию - имеется 100 у.е. и мы ставим на матчи, учитывая результаты нашей модели. Посмотрим к чему это приведёт. Начнём с элементарного, на каждый матч ставим 1 у.е. 

In [69]:
[len(test)==len(test_X), len(test2)==len(test_X2), len(test3)==len(test_X3)]

[True, True, True]

Для начала предположим что у нас есть аккаунт только в одной БК.

In [70]:
def test_prediction(data, prediction, result, rate):
    BK_coef = data[["B365A", "B365D", "B365H"]]
    BK_coef.columns = ["A", "D", "H"]
    cash = 100 
    cash_history = []
    
    for i in range(len(prediction)):
        cash = cash - rate
        if result[i]==prediction[i]:
            cash = cash + rate*BK_coef[result[i]].loc[i]
        cash_history.append(cash)
            
    return cash_history

def test_prediction_HA(data, prediction, result, rate):
    BK_coef = data[["B365A", "B365H"]]
    BK_coef.columns = ["A", "H"]
    cash = 100 
    cash_history = []
    
    for i in range(len(prediction)):
        cash = cash - rate
        if result[i]==prediction[i]:
            cash = cash + rate*BK_coef[result[i]].loc[i]
        cash_history.append(cash)
            
    return cash_history

In [327]:
Result = pd.DataFrame()
Result["favorite"] = test_prediction(test, favorite, test_Y, 1)
Result["Cash_GB"] = test_prediction(test, prediction_GB, test_Y, 1)
Result["Cash_GB2"] = test_prediction(test2, prediction_GB2, test_Y2, 1)
Result["Cash_GB3"] = test_prediction(test3, prediction_GB3, test_Y3, 1)
Result["Cash_LR"] = test_prediction(test, prediction_LR, test_Y, 1)
Result["Cash_LR_fix"] = test_prediction(test, prediction_LR_fix, test_Y, 1)
Result["Cash_LR2"] = test_prediction(test2, prediction_LR2, test_Y2, 1)
Result["Cash_LR2_fix"] = test_prediction(test2, prediction_LR2_fix, test_Y2, 1)
Result["Cash_LR3"] = test_prediction(test3, prediction_LR3, test_Y3, 1)
Result["Cash_LR3_fix"] = test_prediction(test3, prediction_LR3_fix, test_Y3, 1)

# события с двумя исходами
test2_HA = test2[(test2["FTR"]=="A") | (test2["FTR"]=="H")][["B365H", "B365D", "B365A"]]
test2_HA.index = [x for x in range(len(test2_HA))]
for i in range(len(test2_HA)):
    row = test2_HA.loc[i]
    test2_HA.loc[i] = reformate_coef(row["B365H"], row["B365D"], row["B365A"]) 
test3_HA = test3[(test3["FTR"]=="A") | (test3["FTR"]=="H")][["B365H", "B365D", "B365A"]]
test3_HA.index = [x for x in range(len(test3_HA))]
for i in range(len(test3_HA)):
    row = test3_HA.loc[i]
    test3_HA.loc[i] = reformate_coef(row["B365H"], row["B365D"], row["B365A"]) 

Result_HA = pd.DataFrame()
Result_HA["Cash_GB2_HA"] = test_prediction_HA(test2_HA, prediction_GB2_HA, test_Y2_HA, 1)
Result_HA["Cash_GB3_HA"] = test_prediction_HA(test3_HA, prediction_GB3_HA, test_Y3_HA, 1)
Result_HA["Cash_LR2_HA"] = test_prediction_HA(test2_HA, prediction_LR2_HA, test_Y2_HA, 1)
Result_HA["Cash_LR3_HA"] = test_prediction_HA(test3_HA, prediction_LR3_HA, test_Y3_HA, 1)

In [325]:
Result_HA[-10:]

Unnamed: 0,Cash_GB2_HA,Cash_GB3_HA,Cash_LR2_HA,Cash_LR3_HA
846,89.962486,79.558514,76.084703,70.041103
847,90.129003,79.725031,76.25122,70.20762
848,89.129003,78.725031,75.25122,69.20762
849,89.146238,78.742266,75.268455,69.224855
850,89.336656,78.932683,75.458873,69.415272
851,89.353891,78.949918,75.476108,69.432507
852,90.115233,79.71126,76.23745,70.193849
853,89.115233,78.71126,75.23745,69.193849
854,89.631899,79.227927,75.754116,69.710516
855,89.720889,79.316917,75.843106,69.799506


In [73]:
Result[-10:]

Unnamed: 0,favorite,Cash_GB,Cash_GB2,Cash_GB3,Cash_LR,Cash_LR_fix,Cash_LR2,Cash_LR2_fix,Cash_LR3,Cash_LR3_fix
1130,73.74,89.32,91.64,57.09,39.41,88.77,74.38,75.83,57.13,79.6
1131,72.74,88.32,90.64,56.09,38.41,87.77,73.38,74.83,56.13,78.6
1132,72.88,88.46,90.78,56.23,38.55,87.91,73.52,74.97,56.27,78.74
1133,73.38,88.96,91.28,56.73,39.05,88.41,74.02,75.47,56.77,79.24
1134,72.38,87.96,90.28,55.73,38.05,87.41,73.02,74.47,55.77,78.24
1135,72.52,88.1,90.42,55.87,38.19,87.55,73.16,74.61,55.91,78.38
1136,74.02,89.6,91.92,57.37,39.69,89.05,74.66,76.11,57.41,79.88
1137,73.02,88.6,90.92,56.37,38.69,88.05,73.66,75.11,56.41,78.88
1138,74.12,89.7,92.02,57.47,39.79,89.15,74.76,76.21,57.51,79.98
1139,74.42,90.0,92.32,57.77,40.09,89.45,75.06,76.51,57.81,80.28


Очевидно, что не обязательно ставить на все события одинаковую сумму. Оказывается, существует такой критерий Келли, который говорит сколько лучше поставить, в %, чтобы на дистанции быть в выйгрыше. cash_% = 100 ***** nu (BK_coef ***** prob-1)/(BK_coef-1)

Так же введём предположение, что БК позволяют ставить сколь угодно большие и сколь угодно маленькие по размеру ставки, а так же ставки со сколь угодно большим количество дробных знаков(например 0.01403892551725 у.е.)

In [74]:
# nu - параметр доверия, от 0 до 1, чем он выше, тем больше можно потерять. На выходе число от 0 до 1, которое говорит, какую часть своих средств нужно ставить
def Kelli(BK_coef, prob, nu):
    return nu*(BK_coef*prob - 1)/(BK_coef - 1)

In [338]:
def test_prediction_Kelli(data, prediction, prob, result, nu):
    BK_coef = data[["B365A", "B365D", "B365H"]]
    BK_coef.columns = ["A", "D", "H"]
    prob = pd.DataFrame(prob, columns = ["A", "D", "H"])
    cash = 100 
    cash_history = []
    
    for i in range(len(prediction)):
        for adh in ("A", "D", "H"):
            rate = Kelli(BK_coef.loc[i][adh], prob.loc[i][adh], nu)
            if rate>0:
                if result[i]==adh:
                    cash = cash*(1 + rate*(BK_coef.loc[i][adh]-1))
                else:
                    cash = cash*(1-rate)
        cash_history.append(cash)
            
    return cash_history

def test_prediction_HA_Kelli(data, prediction, prob, result, nu):
    BK_coef = data[["B365A", "B365H"]]
    BK_coef.columns = ["A", "H"]
    prob = pd.DataFrame(prob, columns = ["A", "H"])
    cash = 100 
    cash_history = []
    
    for i in range(len(prediction)):
        for ah in ("A", "H"):
            rate = Kelli(BK_coef.loc[i][ah], prob.loc[i][ah], nu)
            if rate>0:
                if result[i]==ah:
                    cash = cash + rate*cash*(BK_coef.loc[i][ah]-1)
                else:
                    cash = cash*(1-rate)
        cash_history.append(cash)
            
    return cash_history

In [339]:
nu = 0.1
def make_prediction(nu):
    Result_Kelli = pd.DataFrame()
    Result_Kelli["Cash_GB"] = test_prediction_Kelli(test, prediction_GB, proba_GB, test_Y, nu)
    Result_Kelli["Cash_GB2"] = test_prediction_Kelli(test2, prediction_GB2, proba_GB2, test_Y2, nu)
    Result_Kelli["Cash_GB3"] = test_prediction_Kelli(test3, prediction_GB3, proba_GB3, test_Y3, nu)
    Result_Kelli["Cash_LR"] = test_prediction_Kelli(test, prediction_LR, proba_LR, test_Y, nu)
    Result_Kelli["Cash_LR_fix"] = test_prediction_Kelli(test, prediction_LR_fix, proba_LR_fix, test_Y, nu)
    Result_Kelli["Cash_LR2"] = test_prediction_Kelli(test2, prediction_LR2, proba_LR2, test_Y2, nu)
    Result_Kelli["Cash_LR2_fix"] = test_prediction_Kelli(test2, prediction_LR2_fix, proba_LR2_fix, test_Y2, nu)
    Result_Kelli["Cash_LR3"] = test_prediction_Kelli(test3, prediction_LR3, proba_LR3, test_Y3, nu)
    Result_Kelli["Cash_LR3_fix"] = test_prediction_Kelli(test3, prediction_LR3_fix, proba_LR_fix, test_Y3, nu)
    return Result_Kelli

def make_prediction_HA(nu):
    Result_HA_Kelli = pd.DataFrame()
    Result_HA_Kelli["Cash_GB2_HA"] = test_prediction_HA_Kelli(test2_HA, prediction_GB2_HA, proba_GB2_HA, test_Y2_HA, nu)
    Result_HA_Kelli["Cash_GB3_HA"] = test_prediction_HA_Kelli(test3_HA, prediction_GB3_HA, proba_GB3_HA, test_Y3_HA, nu)
    Result_HA_Kelli["Cash_LR2_HA"] = test_prediction_HA_Kelli(test2_HA, prediction_LR2_HA, proba_LR2_HA, test_Y2_HA, nu)
    Result_HA_Kelli["Cash_LR3_HA"] = test_prediction_HA_Kelli(test3_HA, prediction_LR3_HA, proba_LR3_HA, test_Y3_HA, nu)
    return Result_HA_Kelli

In [340]:
for i in range(1, 10):
    print(make_prediction(i/10)[-1:])
    print("\n")

        Cash_GB   Cash_GB2  Cash_GB3    Cash_LR  Cash_LR_fix  Cash_LR2  \
1139  27.767301  26.346442  63.56728  29.155743    29.155743  45.59167   

      Cash_LR2_fix   Cash_LR3  Cash_LR3_fix  
1139      45.59167  88.309358     29.155743  


       Cash_GB  Cash_GB2   Cash_GB3   Cash_LR  Cash_LR_fix   Cash_LR2  \
1139  3.274868  3.595261  34.932227  4.530052     4.530052  14.982678   

      Cash_LR2_fix   Cash_LR3  Cash_LR3_fix  
1139     14.982678  73.495542      4.530052  


       Cash_GB  Cash_GB2   Cash_GB3   Cash_LR  Cash_LR_fix  Cash_LR2  \
1139  0.182376  0.273478  16.738492  0.399999     0.399999  3.616156   

      Cash_LR2_fix   Cash_LR3  Cash_LR3_fix  
1139      3.616156  57.640721      0.399999  


      Cash_GB  Cash_GB2  Cash_GB3  Cash_LR  Cash_LR_fix  Cash_LR2  \
1139  0.00518  0.012245  7.043898  0.02107      0.02107  0.650797   

      Cash_LR2_fix   Cash_LR3  Cash_LR3_fix  
1139      0.650797  42.588316       0.02107  


       Cash_GB  Cash_GB2  Cash_GB3   Cash_LR

In [341]:
for i in range(1,10):
    print(make_prediction_HA(i/10)[-1:])

     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855    41.254839    91.725419    67.219556   107.364131
     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855    11.016621    77.385102    32.753765   111.691555
     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855     1.960649    60.308489    11.720449   112.526762
     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855     0.236988    43.566532     3.105557   109.707681
     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855     0.019673    29.253948     0.611906   103.395744
     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855     0.001127    18.299383     0.089704    94.061193
     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855     0.000045    10.682521     0.009754    82.425408
     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855     0.000001     5.827649     0.000781    69.370043
      Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855  2.216237e-08     2.974044     0.000046    

# Италия, Серия А

Проверим насколько это будет правдой в другом дивизионе. Для этого построим лучшие модели и посмотрим к чему это всё приведет.

In [79]:
path = "C:/Users/RomanSivolobtsev/Documents/Статистика Серии А/"
start_year = 2005
start_seasonses4 = pd.DataFrame() # 2 сезона для начальной выборки генерирования фич
train4 = pd.DataFrame() # 7 сезонов по которым будет обучаться модель 
test4 = pd.DataFrame() # 3 сезона для прогнозов

for i in range(2):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    I1 = pd.read_csv(folder + 'I1.csv', index_col=False)
    I2 = pd.read_csv(folder + 'I2.csv', index_col=False)
    start_seasonses4 = start_seasonses4.append([I1, I2], ignore_index=True)

for i in range(2, 9):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    I1 = pd.read_csv(folder + 'I1.csv', index_col=False)
    train4 = train4.append([I1], ignore_index=True)
    
for i in range(9, 12):
    folder = path + str(start_year + i) + "-" + str(start_year + i + 1) + "/"
    I1 = pd.read_csv(folder + 'I1.csv', index_col=False)
    test4 = test4.append(I1, ignore_index=True)
    
train4 = train4[pd.notnull(train4['Div'])]
test4 = test4[pd.notnull(test4['Div'])] # некоторые файлы в конце импортируют пустую строчку, эта для их отлова.

In [80]:
test4.index = [x for x in range(len(test4))]
train4.index = [x for x in range(len(train4))] # перебивка индексов, чтобы не было проблем

В данных по Итальянской лиге нету имён рефери. Чуть чуть переформируем выборку 

In [411]:
# Team1 - Home Team, Team2 - Away Team

def generate_mix_features2(Team1_HomeMatch, Team1_AwayMatch, Team2_HomeMatch, Team2_AwayMatch, BK):
    match_significance = np.array([1, 0.7, 0.5, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05])
    match_significance = match_significance/sum(match_significance) # нормировка коэффициентов
    
    Team1_FTHG_average = Team1_HomeMatch['FTHG'].mean() # среднее количество забитых голов домашней командны в её домашних матчах
    Team2_FTAG_average = Team2_AwayMatch['FTAG'].mean() # среднее количество забитых голов гостевой команды в её гостевых матчах 
    FTG_nearest_diff = sum(Team1_HomeMatch['FTHG'][-10:]*match_significance) - sum(Team2_AwayMatch['FTAG'][-10:]*match_significance)
   
    FTLG_nearest_diff = sum(Team1_AwayMatch['FTHG'][-10:]*match_significance)-sum(Team2_HomeMatch['FTAG'][-10:]*match_significance) 
    # разница пропущенных голов домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости
    
    S_nearest_diff = sum(Team1_HomeMatch['HS'][-10:]*match_significance)-sum(Team2_AwayMatch['AS'][-10:]*match_significance) 
    # разница ударов по воротам домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости
    
    ST_nearest_diff = sum(Team1_HomeMatch['HST'][-10:]*match_significance) - sum(Team2_AwayMatch['AST'][-10:]*match_significance) 
    # разница ударов в створ ворот домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости

    C_nearest_diff = sum(Team1_HomeMatch['HC'][-10:]*match_significance) -sum(Team2_AwayMatch['AC'][-10:]*match_significance) 
    # разница угловых домашней и гостевой команд в их последних 10 матчах с коэфициентами значимости   

    Team1_HFTR_nearest = sum(Team1_HomeMatch['FTR'][-30:].replace(to_replace=["H", "D", "A"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 домашних матчей 
    Team2_AFTR_nearest = sum(Team2_AwayMatch['FTR'][-30:].replace(to_replace=["A", "D", "H"], value=[3, 1, 0])) #количество набранных очков командой за последних 20 гостевых матчей
                    
    BK_Teams_diff = max(BK[["B365H", "BWH", "IWH", "LBH", "WHH", "VCH"]]) - max(BK[["B365A", "BWA", "IWA", "LBA", "WHA", "VCA"]])
    # разница коэффициентов на победу домашней и гостевой команд
    BK_Draw = max(BK[["B365D", "BWD", "IWD", "LBD", "WHD", "VCD"]]) # лучший коэффициент на ничью

    return [Team1_FTHG_average, Team2_FTAG_average, FTG_nearest_diff, FTLG_nearest_diff, S_nearest_diff, ST_nearest_diff, 
            C_nearest_diff, Team1_HFTR_nearest, Team2_AFTR_nearest, BK_Teams_diff, BK_Draw]

In [412]:
features4 = ["Team1_FTHG_average", "Team2_FTAG_average", "FTG_nearest_diff", "FTLG_nearest_diff", "S_nearest_diff", "ST_nearest_diff", 
             "C_nearest_diff", "Team1_HFTR_nearest", "Team2_AFTR_nearest", "BK_Teams_diff", "BK_Draw"]

In [413]:
Teams4 = train4['HomeTeam'].append(test4['HomeTeam']).unique() # Список всех команд за 8 лет в 2 английских дивизионах
Teams_HomeData4 = {Teams4[i]: start_seasonses4[start_seasonses4["HomeTeam"] == Teams4[i]] for i in range(len(Teams4))} # домашние матчи каждой прогназируемой команды
Teams_AwayData4 = {Teams4[i]: start_seasonses4[start_seasonses4["AwayTeam"] == Teams4[i]] for i in range(len(Teams4))} # гостевые матчи каждой прогназируемой команды

In [414]:
train_X4 = pd.DataFrame(columns = features4)
train_Y4 = []
test_X4 = pd.DataFrame(columns = features4)
test_Y4 = []

# выборка на 7 сезонах для обучения

for i in range(len(train4)):
    row = train4.ix[i]
    Team1 = row["HomeTeam"]
    Team2 = row["AwayTeam"]
    
    if (len(Teams_HomeData4[Team1]) >= 10) & (len(Teams_AwayData4[Team2]) >= 10):
    # если команда только влетела в каком-то сезоне, то на неё нет достаточно информации
        BK =  row[attribute_bk]
        feature = generate_mix_features2(Teams_HomeData4[Team1], Teams_AwayData4[Team1], Teams_HomeData4[Team2], 
                                        Teams_AwayData4[Team2], BK)
        train_X4.loc[i] = feature
        train_Y4.append(train4["FTR"].loc[i])
        
    Teams_HomeData4[Team1] = Teams_HomeData4[Team1].append(row, ignore_index=True) #после прогноза матча, его исход нужно добавить в данные по прошедшим матчам
    Teams_AwayData4[Team2] = Teams_AwayData4[Team2].append(row, ignore_index=True)

# выборка на 3 неполных сезонах для построения прогнозов

for i in range(len(test4)):
    row = test4.ix[i]
    Team1 = row["HomeTeam"]
    Team2 = row["AwayTeam"]
    BK =  row[attribute_bk]
    
    if (len(Teams_HomeData4[Team1]) >= 10) & (len(Teams_AwayData4[Team2]) >= 10) & (len(Teams_HomeData4[Team2]) >= 10) & (len(Teams_AwayData4[Team1]) >= 10):
    # если команда только влетела в каком-то сезоне, то на неё нет достаточно информации
        BK =  row[attribute_bk]
        feature = generate_mix_features2(Teams_HomeData4[Team1], Teams_AwayData4[Team1], Teams_HomeData4[Team2], 
                                         Teams_AwayData4[Team2], BK)
        test_X4.loc[i] = feature
        test_Y4.append(test4["FTR"].loc[i])
    else:
        test4 = test4.drop(i)
        
    
    Teams_HomeData4[Team1] = Teams_HomeData4[Team1].append(row, ignore_index=True) #после прогноза матча, его исход нужно добавить в данные по прошедшим матчам
    Teams_AwayData4[Team2] = Teams_AwayData4[Team2].append(row, ignore_index=True)

In [415]:
#реформируем данные

train_X4 = train_X4.replace(to_replace="NaN", value=0) 
test_X4 = test_X4.replace(to_replace="NaN", value=0) 

test4.index = [x for x in range(len(test4))]
train4.index = [x for x in range(len(train4))]

[train_X4_HA, train_Y4_HA] = reformate_data(train_X4, train_Y4)
[test_X4_HA, test_Y4_HA] = reformate_data(test_X4, test_Y4)

\item GB = GradientBoostingClassifier(min_samples_split = 2, max_depth = 3, learning_rate = 0.03, random_state = 42, min_samples_leaf = 50, n_estimators = 50, subsample=0.9)

\item GB2 = GradientBoostingClassifier(min_samples_split = 1, max_depth = 3, learning_rate = 0.03, random_state = 42, min_samples_leaf = 25, n_estimators = 30)

\item GB3 = GradientBoostingClassifier(min_samples_split = 2, max_depth = 4, learning_rate = 0.03, min_samples_leaf = 50, subsample = 0.9, n_estimators = 50, random_state = 42)

\item LR = LogisticRegression(random_state = 42, C=0.4, tol=0.0001, max_iter=50, solver='lbfgs')

\item LR2 = LogisticRegression(random_state = 42, C=0.6, tol=0.0001, max_iter=50, solver='lbfgs')

\item LR3 = LogisticRegression(random_state = 42, C=0.4, tol=0.0001, max_iter=50, solver='lbfgs')

\item GB2_HA = GradientBoostingClassifier(min_samples_split = 2, max_depth = 4, learning_rate = 0.03, min_samples_leaf = 50, n_estimators = 70, subsample=0.9, random_state=42)

\item LR2_HA = LogisticRegression(random_state = 42, C=0.6, tol=0.0001, max_iter=50, solver='lbfgs')

\item GB3_HA = GradientBoostingClassifier(min_samples_split = 1, max_depth = 3, learning_rate = 0.04, min_samples_leaf = 30, n_estimators = 50, subsample=0.9, random_state=42)

\item LR3_HA = LogisticRegression(random_state = 42, C=0.6, tol=0.0001, max_iter=50, solver='lbfgs')

\item GB4_HA = GradientBoostingClassifier(min_samples_split = 2, max_depth = 3, learning_rate = 0.03, min_samples_leaf = 70, n_estimators = 50, subsample=0.9, random_state=42)

\item LR4_HA = LogisticRegression(random_state = 42, C=0.5, tol=0.0001, max_iter=50, solver='lbfgs')

In [371]:
tuned_parameters = [{'min_samples_split': [1, 2], 'max_depth': [2, 3, 4], 'learning_rate': [0.03, 0.04],
                     'min_samples_leaf':[30, 50, 70], 'n_estimators': [30, 50, 70, 90], "subsample": [1, 0.9]}]

GB_classifier4_HA = GridSearchCV(GradientBoostingClassifier(), tuned_parameters)
GB_classifier4_HA.fit(train_X4_HA, train_Y4_HA)
print(GB_classifier4_HA.best_estimator_)

GradientBoostingClassifier(init=None, learning_rate=0.03, loss='deviance',
              max_depth=2, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=30, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=0.9, verbose=0,
              warm_start=False)


In [430]:
GB_classifier4_HA = GradientBoostingClassifier(min_samples_split = 2, max_depth = 3, learning_rate = 0.05, 
                                               min_samples_leaf = 100, n_estimators = 50, subsample=0.9, random_state=42)
GB_classifier4_HA.fit(train_X4_HA, train_Y4_HA)

print("На тренировочной выборке: " + str(accuracy_score(train_Y4_HA, GB_classifier4_HA.predict(train_X4_HA))))

proba_GB4_HA = GB_classifier4_HA.predict_proba(test_X4_HA)
prediction_GB4_HA = GB_classifier4_HA.predict(test_X4_HA)

result_GB4_HA = pd.DataFrame()
result_GB4_HA[0] = proba_GB4_HA[:,0]
result_GB4_HA[1] = proba_GB4_HA[:,1]
result_GB4_HA[2] = prediction_GB4_HA
result_GB4_HA[3] = test_Y4_HA

result_GB4_HA.columns = ["A", "H", "prediction", "result"]

score_GB4_HA = accuracy_score(test_Y4_HA, result_GB4_HA["prediction"])
print("На тестовой выборке: " + str(score_GB4_HA))

print(confusion_matrix(result_GB4_HA["result"], result_GB4_HA["prediction"]))

На тренировочной выборке: 0.748443983402
На тестовой выборке: 0.739184177998
[[178 143]
 [ 68 420]]


In [431]:
tuned_parameters = [{'C': [0.4, 0.6, 0.8, 1, 1.5], 'tol': [0.0001, 0.0002, 0.0003], 'max_iter': [50, 100, 150],
                     'solver': ['newton-cg', 'lbfgs', 'liblinear']}]

LR_classifier4_HA = GridSearchCV(LogisticRegression(), tuned_parameters)
LR_classifier4_HA.fit(train_X4_HA, train_Y4_HA)
print(LR_classifier4_HA.best_estimator_)

LogisticRegression(C=0.6, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)


In [436]:
LR_classifier4_HA = LogisticRegression(random_state = 42, C=0.5, tol=0.0001, max_iter=50, solver='lbfgs')
LR_classifier4_HA.fit(train_X4_HA, train_Y4_HA)

print("На тренировочной выборке: " + str(accuracy_score(train_Y4_HA, LR_classifier4_HA.predict(train_X4_HA))))

proba_LR4_HA = LR_classifier4_HA.predict_proba(test_X4_HA)
prediction_LR4_HA = LR_classifier4_HA.predict(test_X4_HA)

result_LR4_HA = pd.DataFrame()
result_LR4_HA[0] = proba_LR4_HA[:,0]
result_LR4_HA[1] = proba_LR4_HA[:,1]
result_LR4_HA[2] = prediction_LR4_HA
result_LR4_HA[3] = test_Y4_HA

result_LR4_HA.columns = ["A", "H", "prediction", "result"]

score_LR4_HA = accuracy_score(test_Y4_HA, result_LR4_HA["prediction"])
print("На тестовой выборке: " + str(score_LR4_HA))

confusion_matrix(test_Y4_HA, result_LR4_HA["prediction"])

На тренировочной выборке: 0.726659751037
На тестовой выборке: 0.737948084054


array([[180, 141],
       [ 71, 417]])

In [437]:
test4_HA = test4[(test4["FTR"]=="A") | (test4["FTR"]=="H")][["B365H", "B365D", "B365A"]]
test4_HA.index = [x for x in range(len(test4_HA))]
for i in range(len(test4_HA)):
    row = test4_HA.loc[i]
    test4_HA.loc[i] = reformate_coef(row["B365H"], row["B365D"], row["B365A"]) 

In [438]:
def make_prediction_Italy_HA(nu):
    Result2_HA_Kelli = pd.DataFrame()
    Result2_HA_Kelli["Cash_GB4_HA"] = test_prediction_HA_Kelli(test4_HA, prediction_GB4_HA, proba_GB4_HA, test_Y4_HA, nu)
    Result2_HA_Kelli["Cash_LR4_HA"] = test_prediction_HA_Kelli(test4_HA, prediction_LR4_HA, proba_LR4_HA, test_Y4_HA, nu)
    return Result2_HA_Kelli

for i in range(1,10):
    print(make_prediction_Italy_HA(i/10)[-1:])

     Cash_GB4_HA  Cash_LR4_HA
808     74.95377    91.280557
     Cash_GB4_HA  Cash_LR4_HA
808    53.418087    80.866389
     Cash_GB4_HA  Cash_LR4_HA
808     36.18414    69.397936
     Cash_GB4_HA  Cash_LR4_HA
808    23.283975    57.556588
     Cash_GB4_HA  Cash_LR4_HA
808     14.22385    45.997429
     Cash_GB4_HA  Cash_LR4_HA
808     8.242255    35.287714
     Cash_GB4_HA  Cash_LR4_HA
808     4.526085    25.859276
     Cash_GB4_HA  Cash_LR4_HA
808      2.35262    17.980726
     Cash_GB4_HA  Cash_LR4_HA
808     1.155995    11.751952


# Порог уверенности. 

В редких случаях получаем + значения. А теперь попробуем ставить не на каждое событие, а на те, на которые наша вероятность больше заданного значения.

In [343]:
def test_prediction_Kelli2(data, prediction, prob, result, nu, step):
    BK_coef = data[["B365A", "B365D", "B365H"]]
    BK_coef.columns = ["A", "D", "H"]
    prob = pd.DataFrame(prob, columns = ["A", "D", "H"])
    cash = 100 
    cash_history = []
    
    for i in range(len(prediction)):
        if prob.loc[i][prediction[i]]>step:
            rate = Kelli(BK_coef.loc[i][prediction[i]], prob.loc[i][prediction[i]], nu)
            if rate>0:
                if result[i]==prediction[i]:
                    cash = cash*(1 + rate*(BK_coef[result[i]].loc[i]-1))
                else:
                    cash = cash*(1-rate)
            cash_history.append(cash)
        else:
            cash_history.append(cash)
            
    return cash_history

def test_prediction_HA_Kelli2(data, prediction, prob, result, nu, step):
    BK_coef = data[["B365A", "B365H"]]
    BK_coef.columns = ["A", "H"]
    prob = pd.DataFrame(prob, columns = ["A", "H"])
    cash = 100 
    cash_history = []
    
    for i in range(len(prediction)):
        if prob.loc[i][prediction[i]]>step:
            rate = Kelli(BK_coef.loc[i][prediction[i]], prob.loc[i][prediction[i]], nu)
            if rate>0:
                if result[i]==prediction[i]:
                    cash = cash + rate*cash*(BK_coef[result[i]].loc[i]-1)
                else:
                    cash = cash*(1-rate)
            cash_history.append(cash)
        else:
            cash_history.append(cash)
            
            
    return cash_history

In [344]:
nu = 0.1
def make_prediction2(nu, step):
    Result_Kelli = pd.DataFrame()
    Result_Kelli["Cash_GB"] = test_prediction_Kelli2(test, prediction_GB, proba_GB, test_Y, nu, step)
    Result_Kelli["Cash_GB2"] = test_prediction_Kelli2(test2, prediction_GB2, proba_GB2, test_Y2, nu, step)
    Result_Kelli["Cash_GB3"] = test_prediction_Kelli2(test3, prediction_GB3, proba_GB3, test_Y3, nu, step)
    Result_Kelli["Cash_LR"] = test_prediction_Kelli2(test, prediction_LR, proba_LR, test_Y, nu, step)
    Result_Kelli["Cash_LR_fix"] = test_prediction_Kelli2(test, prediction_LR_fix, proba_LR_fix, test_Y, nu, step)
    Result_Kelli["Cash_LR2"] = test_prediction_Kelli2(test2, prediction_LR2, proba_LR2, test_Y2, nu, step)
    Result_Kelli["Cash_LR2_fix"] = test_prediction_Kelli2(test2, prediction_LR2_fix, proba_LR2_fix, test_Y2, nu, step)
    Result_Kelli["Cash_LR3"] = test_prediction_Kelli2(test3, prediction_LR3, proba_LR3, test_Y3, nu, step)
    Result_Kelli["Cash_LR3_fix"] = test_prediction_Kelli2(test3, prediction_LR3_fix, proba_LR_fix, test_Y3, nu, step)
    return Result_Kelli

def make_prediction_HA2(nu, step):
    Result_HA_Kelli = pd.DataFrame()
    Result_HA_Kelli["Cash_GB2_HA"] = test_prediction_HA_Kelli2(test2_HA, prediction_GB2_HA, proba_GB2_HA, test_Y2_HA, nu, step)
    Result_HA_Kelli["Cash_GB3_HA"] = test_prediction_HA_Kelli2(test3_HA, prediction_GB3_HA, proba_GB3_HA, test_Y3_HA, nu, step)
    Result_HA_Kelli["Cash_LR2_HA"] = test_prediction_HA_Kelli2(test2_HA, prediction_LR2_HA, proba_LR2_HA, test_Y2_HA, nu, step)
    Result_HA_Kelli["Cash_LR3_HA"] = test_prediction_HA_Kelli2(test3_HA, prediction_LR3_HA, proba_LR3_HA, test_Y3_HA, nu, step)
    return Result_HA_Kelli

In [345]:
for i in range(7, 14):
    print(make_prediction2(0.1, i/20)[-1:])
    print("\n")

         Cash_GB   Cash_GB2   Cash_GB3    Cash_LR  Cash_LR_fix   Cash_LR2  \
1139  102.774183  88.611057  77.997054  49.712072    57.574778  50.632208   

      Cash_LR2_fix   Cash_LR3  Cash_LR3_fix  
1139     54.640977  92.050305     89.324732  


        Cash_GB   Cash_GB2   Cash_GB3    Cash_LR  Cash_LR_fix   Cash_LR2  \
1139  68.513891  83.805021  83.552552  59.213223    60.796344  61.189715   

      Cash_LR2_fix   Cash_LR3  Cash_LR3_fix  
1139     60.796315  98.995471     90.174557  


        Cash_GB   Cash_GB2   Cash_GB3    Cash_LR  Cash_LR_fix   Cash_LR2  \
1139  99.820824  85.766284  85.713698  88.002924    88.002924  77.286992   

      Cash_LR2_fix   Cash_LR3  Cash_LR3_fix  
1139     77.286992  98.475865     89.344727  


       Cash_GB  Cash_GB2   Cash_GB3     Cash_LR  Cash_LR_fix   Cash_LR2  \
1139  104.1669  97.55724  84.463258  104.190714   104.190714  86.892166   

      Cash_LR2_fix   Cash_LR3  Cash_LR3_fix  
1139     86.892166  100.03454    107.282354  


        Cash

In [346]:
for i in range(11, 18):
    print(make_prediction_HA2(0.1, i/20)[-1:])
    print("\n")


     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855    55.105978    96.033972      77.1504    91.176184


     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855    53.027305   100.799345    84.460969    95.893741


     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855    59.391714   109.363463    98.158238    99.288761


     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855    75.497042   107.698116   117.083408    96.338124


     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855    88.160944   103.964701    98.373995    96.817576


     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855    92.868065   101.864303    81.681863    97.394352


     Cash_GB2_HA  Cash_GB3_HA  Cash_LR2_HA  Cash_LR3_HA
855     98.87444   100.719312   101.120247    97.416719




## Тоже самое для итальянской лиги

In [442]:
def make_prediction_Italy_HA2(nu, step):
    Result2_HA_Kelli = pd.DataFrame()
    Result2_HA_Kelli["Cash_GB4_HA"] = test_prediction_HA_Kelli2(test4_HA, prediction_GB4_HA, proba_GB4_HA, test_Y4_HA, nu, step)
    Result2_HA_Kelli["Cash_LR4_HA"] = test_prediction_HA_Kelli2(test4_HA, prediction_LR4_HA, proba_LR4_HA, test_Y4_HA, nu, step)
    return Result2_HA_Kelli

for i in range(11,18):
    print(make_prediction_Italy_HA2(0.2, 0.68)[-1:])

     Cash_GB4_HA  Cash_LR4_HA
808    120.15281   110.822315
     Cash_GB4_HA  Cash_LR4_HA
808   108.380281    91.115006
     Cash_GB4_HA  Cash_LR4_HA
808   100.808836    94.250619
     Cash_GB4_HA  Cash_LR4_HA
808    109.26776    84.688091
     Cash_GB4_HA  Cash_LR4_HA
808   115.981203     81.77196
     Cash_GB4_HA  Cash_LR4_HA
808    111.26536     80.43428
     Cash_GB4_HA  Cash_LR4_HA
808    96.323382    77.171063


In [None]:
print(make_prediction_HA2(0.2, i/20)[-1:])

# Запись данных, для графиков

In [113]:
#     Result_Kelli["Cash_GB"] = test_prediction_Kelli2(test, prediction_GB, proba_GB, test_Y, nu, step)
#     Result_Kelli["Cash_GB2"] = test_prediction_Kelli2(test2, prediction_GB2, proba_GB2, test_Y2, nu, step)
#     Result_Kelli["Cash_GB3"] = test_prediction_Kelli2(test3, prediction_GB3, proba_GB3, test_Y3, nu, step)
#     Result_Kelli["Cash_LR"] = test_prediction_Kelli2(test, prediction_LR, proba_LR, test_Y, nu, step)
#     Result_Kelli["Cash_LR_fix"] = test_prediction_Kelli2(test, prediction_LR_fix, proba_LR_fix, test_Y, nu, step)
#     Result_Kelli["Cash_LR2"] = test_prediction_Kelli2(test2, prediction_LR2, proba_LR2, test_Y2, nu, step)
#     Result_Kelli["Cash_LR2_fix"] = test_prediction_Kelli2(test2, prediction_LR2_fix, proba_LR2_fix, test_Y2, nu, step)
#     Result_Kelli["Cash_LR3"] = test_prediction_Kelli2(test3, prediction_LR3, proba_LR3, test_Y3, nu, step)
#     Result_Kelli["Cash_LR3_fix"] = test_prediction_Kelli2(test3, prediction_LR3_fix, proba_LR_fix, test_Y3, nu, step)

#     Result_HA_Kelli["Cash_GB2_HA"] = test_prediction_HA_Kelli2(test2_HA, prediction_GB2_HA, proba_GB2_HA, test_Y2_HA, nu, step)
#     Result_HA_Kelli["Cash_GB3_HA"] = test_prediction_HA_Kelli2(test3_HA, prediction_GB3_HA, proba_GB3_HA, test_Y3_HA, nu, step)
#     Result_HA_Kelli["Cash_LR2_HA"] = test_prediction_HA_Kelli2(test2_HA, prediction_LR2_HA, proba_LR2_HA, test_Y2_HA, nu, step)
#     Result_HA_Kelli["Cash_LR3_HA"] = test_prediction_HA_Kelli2(test3_HA, prediction_LR3_HA, proba_LR3_HA, test_Y3_HA, nu, step)
#     accuracy_score(test_Y, prediction_GB)

IndentationError: unexpected indent (<ipython-input-113-af36c20c5708>, line 15)

In [322]:
score = pd.DataFrame([[score_GB, score_GB2, score_GB3, score_LR, score_LR_fix, score_LR2, score_LR2_fix, score_LR3, 
                      score_LR3_fix, score_GB2_HA, score_GB3_HA, score_GB4_HA, score_LR2_HA, score_LR3_HA, score_LR4_HA]], 
                     columns = ['GB', 'GB2', 'GB3', 'LR', 'LR_fix', 'LR2', 'LR2_fix', 'LR3', 'LR3_fix', 'GB2_HA', 'GB3_HA',
                                'GB4_HA', 'LR2_HA', 'LR3_HA', 'LR4_HA'])
score

Unnamed: 0,GB,GB2,GB3,LR,LR_fix,LR2,LR2_fix,LR3,LR3_fix,GB2_HA,GB3_HA,GB4_HA,LR2_HA,LR3_HA,LR4_HA
0,0.517544,0.524561,0.538596,0.514912,0.514912,0.52193,0.531579,0.535088,0.535965,0.704439,0.716121,0.732034,0.699766,0.709112,0.733252


In [323]:
score.to_csv("score.csv", index=False)

In [363]:
cash = pd.DataFrame()
for i in range(10, 18):
    cash[i/20] = make_prediction_HA2(0.1, i/20).ix[855]

cash.to_csv("cash.csv")

cash2 = pd.DataFrame()
for i in range(6,14):
    cash2[i/20] = make_prediction2(0.1, i/20).ix[1139]

cash2.to_csv("cash2.csv")    

In [443]:
make_prediction_Italy_HA2(0.2, 0.68).to_csv("italy.csv")    

In [105]:
train_X2[:10]

Unnamed: 0,Team1_FTHG_average,Team2_FTAG_average,FTG_nearest_diff,FTLG_nearest_diff,S_nearest_diff,ST_nearest_diff,C_nearest_diff,Team1_HFTR_nearest,Team2_AFTR_nearest,Team1_HFTR_nearest2,Team2_AFTR_nearest2,Team1_HFTR_nearest3,Team2_AFTR_nearest3,Referee_HomeTeam_points,Referee_AwayTeam_points,Referee_Draw_points,result
0,1.052632,1.131579,-0.513889,1.861111,-2.527778,-3.847222,0.416667,7.0,5.0,19.0,21.0,45.0,41.0,0.8,0.1,0.1,A
1,1.447368,0.894737,0.666667,0.916667,1.805556,2.319444,1.597222,5.0,4.0,22.0,12.0,53.0,28.0,0.5,0.5,0.0,A
2,1.434783,0.973684,0.277778,-0.152778,1.486111,0.055556,3.430556,11.0,3.0,33.0,9.0,59.0,25.0,0.4,0.4,0.2,D
3,1.447368,1.052632,0.5,-1.486111,4.458333,1.513889,5.041667,12.0,7.0,29.0,18.0,58.0,34.0,0.6,0.1,0.3,H
4,1.552632,1.078947,0.597222,0.069444,2.75,2.444444,2.041667,6.0,8.0,27.0,17.0,47.0,38.0,0.5,0.3,0.2,A
5,1.190476,1.184211,0.152778,-0.861111,3.111111,3.833333,2.486111,13.0,10.0,38.0,20.0,53.0,32.0,0.4,0.2,0.4,H
6,1.421053,0.947368,0.166667,0.486111,4.611111,1.819444,2.666667,9.0,7.0,22.0,20.0,43.0,25.0,0.4,0.3,0.3,A
7,2.394737,0.973684,1.875,0.236111,6.152778,4.25,2.222222,10.0,1.0,34.0,6.0,66.0,15.0,0.6,0.2,0.2,H
8,2.210526,0.928571,1.833333,-0.152778,3.819444,1.75,1.402778,9.0,7.0,33.0,23.0,74.0,38.0,0.5,0.3,0.2,H
9,2.184211,1.52381,1.444444,-0.083333,14.152778,8.291667,2.0,10.0,5.0,38.0,14.0,76.0,38.0,0.4,0.2,0.4,D


In [109]:
38/107

0.35514018691588783

In [114]:
pd.DataFrame([GB_classifier2.feature_importances_], columns = train_X2.columns[:-1])

Unnamed: 0,Team1_FTHG_average,Team2_FTAG_average,FTG_nearest_diff,FTLG_nearest_diff,S_nearest_diff,ST_nearest_diff,C_nearest_diff,Team1_HFTR_nearest,Team2_AFTR_nearest,Team1_HFTR_nearest2,Team2_AFTR_nearest2,Team1_HFTR_nearest3,Team2_AFTR_nearest3,Referee_HomeTeam_points,Referee_AwayTeam_points,Referee_Draw_points
0,0.053299,0.056274,0.025265,0.095068,0.080266,0.149117,0.028472,0.025056,0.008444,0.034373,0.002671,0.192861,0.208487,0.018238,0.013597,0.008513


In [233]:
pd.DataFrame(LR_classifier3.coef_).to_csv("www.csv", index=False)

In [215]:
1/(1+2.71828**(-sum(LR_classifier.coef_[2]*test_X.loc[2])))

0.62981382082587811

In [177]:
sum(LR_classifier.coef_[0]*test_X.loc[0])

-1.2578037994329243

In [217]:
train_X2.columns

Index(['Team1_FTHG_average', 'Team2_FTAG_average', 'FTG_nearest_diff',
       'FTLG_nearest_diff', 'S_nearest_diff', 'ST_nearest_diff',
       'C_nearest_diff', 'Team1_HFTR_nearest', 'Team2_AFTR_nearest',
       'Team1_HFTR_nearest2', 'Team2_AFTR_nearest2', 'Team1_HFTR_nearest3',
       'Team2_AFTR_nearest3', 'Referee_HomeTeam_points',
       'Referee_AwayTeam_points', 'Referee_Draw_points', 'result'],
      dtype='object')

In [209]:
result_LR[:10]

Unnamed: 0,A,D,H,prediction,result
0,0.208154,0.232641,0.559205,H,H
1,0.251064,0.261954,0.486981,H,D
2,0.249828,0.220604,0.529568,H,A
3,0.211766,0.257565,0.530669,H,A
4,0.249085,0.282688,0.468227,H,A
5,0.4021,0.262253,0.335647,A,D
6,0.437333,0.259172,0.303496,A,A
7,0.206188,0.235222,0.558589,H,H
8,0.47739,0.252529,0.270081,A,A
9,0.401924,0.233477,0.364599,A,A
