In [1]:
import pandas as pd
import math
import csv
import random
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

In [2]:

# Rating given to a team the first time get_elo() is asked about it.
base_elo = 1600
# Team name -> current Elo rating; filled lazily by get_elo() and rewritten
# after every processed game.
team_elos = {}
# Placeholder only: the main cell rebinds this to the DataFrame returned by
# initialize_data().
team_stats = {}
# Training features / labels; the main cell rebinds both names to the values
# returned by build_dataSet().
X = []
y = []

# Directory (relative to the notebook) that holds the season CSV files.
folder = 'data'

# build_dataSet() calls random.random() to decide the ordering of every
# sample, so seed the global RNG once here to make a fresh
# "Restart & Run All" produce the same training set each time.
random_seed = 42
random.seed(random_seed)

In [3]:
def initialize_data(Mstat, Ostat, Tstat):
    """Combine the three per-team stat tables into one frame indexed by team.

    Parameters
    ----------
    Mstat, Ostat, Tstat : pandas.DataFrame
        Tables that each carry a ``Team`` column.  ``Rk`` and ``Arena`` are
        dropped from ``Mstat``; ``Rk``, ``G`` and ``MP`` are dropped from
        ``Ostat`` and ``Tstat``.

    Returns
    -------
    pandas.DataFrame
        ``Mstat`` left-joined with ``Ostat`` and then ``Tstat`` on ``Team``,
        with ``Team`` moved into the index.  The inputs are not modified.
    """
    misc = Mstat.drop(columns=['Rk', 'Arena'])
    opponent = Ostat.drop(columns=['Rk', 'G', 'MP'])
    per_game = Tstat.drop(columns=['Rk', 'G', 'MP'])

    # Two successive left joins keep every team present in the first table.
    merged = (
        misc
        .merge(opponent, how='left', on='Team')
        .merge(per_game, how='left', on='Team')
    )
    return merged.set_index('Team')

In [4]:
def get_elo(team):
    """Return the current Elo rating of ``team``.

    A team that has no entry in ``team_elos`` yet is registered there with
    ``base_elo`` and that value is returned.
    """
    # setdefault performs the lookup-or-insert in one step.  The previous
    # bare ``except:`` caught every exception, not only the missing key.
    return team_elos.setdefault(team, base_elo)

In [5]:

def calc_elo(win_team, lose_team):
    """Compute the post-game Elo ratings for one finished game.

    Both current ratings are read through ``get_elo``.  Nothing is written
    back here; the caller stores the returned values.

    Returns
    -------
    tuple
        ``(new_winner_rating, new_loser_rating)``, each rounded with
        ``round``.
    """
    rating_w = get_elo(win_team)
    rating_l = get_elo(lose_team)

    # Winner's expected score: 1 / (1 + 10 ** (-(Rw - Rl) / 400)).
    exponent = ((rating_w - rating_l) * -1) / 400
    expected_w = 1 / (1 + math.pow(10, exponent))

    # The K-factor shrinks as the winner's pre-game rating grows; the same
    # K is applied to both teams.
    if rating_w < 2100:
        k_factor = 32
    elif rating_w < 2400:
        k_factor = 24
    else:
        k_factor = 16

    # Actual score is 1 for the winner and 0 for the loser.
    updated_w = round(rating_w + (k_factor * (1 - expected_w)))
    updated_l = round(rating_l + (k_factor * (0 - expected_w)))
    return updated_w, updated_l

In [6]:
def _team_features(team, elo):
    """Return ``[elo, stat_1, ..., stat_n]`` for ``team`` using ``team_stats``."""
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    return [elo] + [value for _, value in team_stats.loc[team].items()]


def build_dataSet(all_data):
    """Turn a table of finished games into a training set.

    Parameters
    ----------
    all_data : pandas.DataFrame
        One row per game with the columns ``WTeam`` (winner), ``LTeam``
        (loser) and ``WLoc`` (``'H'`` when the winner played at home).

    Returns
    -------
    tuple
        ``(X, y)``.  ``X`` is the feature matrix passed through
        ``np.nan_to_num``; each row is the feature list of one team followed
        by the feature list of the other.  ``y`` is a list of labels:
        ``0`` when the winner's features come first, ``1`` when the loser's
        do.

    Side effects
    ------------
    Ratings in ``team_elos`` are updated after every game, in row order.
    Prints a progress line and the first generated sample.
    """
    print("Building data set..")
    X = []
    # Labels are collected locally.  They used to be appended to the
    # module-level ``y``, so a second call returned more labels than samples.
    y = []
    for _, row in all_data.iterrows():
        Wteam = row['WTeam']
        Lteam = row['LTeam']

        team1_elo = get_elo(Wteam)
        team2_elo = get_elo(Lteam)

        # Home-court bonus of 100 rating points.  Any WLoc other than 'H'
        # credits the losing team.
        # NOTE(review): assumes there are no neutral-site games - confirm.
        if row['WLoc'] == 'H':
            team1_elo += 100
        else:
            team2_elo += 100

        team1_features = _team_features(Wteam, team1_elo)
        team2_features = _team_features(Lteam, team2_elo)

        # Randomise which side of the row the winner sits on so that both
        # labels occur in the training data.
        if random.random() > 0.5:
            X.append(team1_features + team2_features)
            y.append(0)
        else:
            X.append(team2_features + team1_features)
            y.append(1)

        # Show the very first sample once as a sanity check.
        if len(X) == 1:
            print('X',X)

        # Ratings are updated only after the features were taken, so every
        # sample uses pre-game ratings.
        new_winner_rank, new_loser_rank = calc_elo(Wteam, Lteam)
        team_elos[Wteam] = new_winner_rank
        team_elos[Lteam] = new_loser_rank

    return np.nan_to_num(X), y

In [7]:
if __name__ == '__main__':

    # Per-team 2015-16 season tables: miscellaneous, opponent per-game and
    # team per-game statistics, read in that order.
    Mstat, Ostat, Tstat = (
        pd.read_csv(folder + csv_name)
        for csv_name in (
            '/15-16Miscellaneous_Stat.csv',
            '/15-16Opponent_Per_Game_Stat.csv',
            '/15-16Team_Per_Game_Stat.csv',
        )
    )
    team_stats = initialize_data(Mstat, Ostat, Tstat)

    # Game-by-game results of the same season become the training samples.
    result_data = pd.read_csv(folder + '/2015-2016_result.csv')
    X, y = build_dataSet(result_data)

    # Train the model.
    print("Fitting on %d game samples.." % len(X))
    model = linear_model.LogisticRegression()
    model.fit(X, y)

    # Estimate the accuracy with 10-fold cross-validation.
    print("Doing cross-validation..")
    fold_scores = cross_val_score(model, X, y, cv=10, scoring='accuracy', n_jobs=-1)
    print(fold_scores.mean())

Building data set..
X [[1700, 27.2, 32.0, 50.0, 33.0, 49.0, -2.73, 0.0, -2.74, 104.6, 107.6, 93.4, 0.255, 0.256, 0.527, 0.483, 12.6, 23.7, 0.205, 0.48700000000000004, 10.5, 75.8, 0.204, 812292.0, 38.0, 85.8, 0.44299999999999995, 7.6, 22.4, 0.341, 30.3, 63.4, 0.479, 17.5, 23.2, 0.754, 10.9, 33.3, 44.2, 20.8, 7.2, 4.2, 11.3, 18.5, 101.1, 36.9, 84.0, 0.439, 7.4, 21.5, 0.34600000000000003, 29.4, 62.5, 0.47100000000000003, 17.2, 21.4, 0.805, 10.4, 34.0, 44.4, 20.5, 5.7, 5.7, 13.4, 19.7, 98.4, 1600, 28.2, 48.0, 34.0, 51.0, 31.0, 3.61, -0.12, 3.49, 105.1, 101.4, 97.1, 0.237, 0.336, 0.552, 0.516, 13.8, 19.1, 0.185, 0.48, 14.4, 74.6, 0.19399999999999998, 690150.0, 37.1, 86.1, 0.43200000000000005, 8.3, 24.5, 0.33799999999999997, 28.9, 61.6, 0.469, 16.7, 22.1, 0.755, 11.5, 35.0, 46.5, 22.0, 8.6, 5.0, 16.1, 18.3, 99.2, 38.6, 84.4, 0.45799999999999996, 9.9, 28.4, 0.35, 28.7, 56.1, 0.512, 15.6, 20.0, 0.7829999999999999, 8.3, 33.8, 42.1, 25.6, 9.1, 5.9, 15.0, 19.1, 102.8]]
Fitting on 1316 game sample



0.6922726229174603


In [8]:
def predict_winner(team_1, team_2, model):
    """Return ``model.predict_proba`` for a single game between two teams.

    The feature row is laid out as
    ``[elo(team_1), *stats(team_1), elo(team_2) + 100, *stats(team_2)]``
    with NaN values replaced through ``np.nan_to_num``.  ``team_2`` receives
    the 100-point bonus, i.e. it is treated as the home team.

    Parameters
    ----------
    team_1, team_2 : index labels of ``team_stats``
    model : object exposing ``predict_proba``

    Returns
    -------
    Whatever ``model.predict_proba`` returns for the one-row input.
    """
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    features = [get_elo(team_1)]
    features.extend(value for _, value in team_stats.loc[team_1].items())

    features.append(get_elo(team_2) + 100)
    features.extend(value for _, value in team_stats.loc[team_2].items())

    features = np.nan_to_num(features)
    return model.predict_proba([features])

In [9]:
print('Predicting on new schedule..')
schedule1617 = pd.read_csv(folder + '/16-17Schedule.csv')
result = []
for index, row in schedule1617.iterrows():
    # The visiting team goes first; predict_winner gives the second team the
    # home-court bonus.
    team1 = row['Vteam']
    team2 = row['Hteam']
    pred = predict_winner(team1, team2, model)
    # First column of the single returned row; with the 0/1 labels produced
    # by build_dataSet this is read as "the first-listed team wins".
    prob = pred[0][0]
    if prob > 0.5:
        winner, loser = team1, team2
        confidence = prob
    else:
        winner, loser = team2, team1
        confidence = 1 - prob
    result.append([winner, loser, confidence])

# newline='' is what the csv module requires for files handed to csv.writer;
# without it extra blank rows appear on platforms that translate '\n' to
# '\r\n'.
with open('16-17Result.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['win', 'lose', 'probability'])
    writer.writerows(result)
    print('done.')

Predicting on new schedule..
done.


In [10]:
# Read back the predictions file written by the previous cell; as the cell's
# last expression the resulting DataFrame is displayed.
pd.read_csv('16-17Result.csv',header=0)

Unnamed: 0,win,lose,probability
0,Cleveland Cavaliers,New York Knicks,0.841085
1,Golden State Warriors,San Antonio Spurs,0.536234
2,Portland Trail Blazers,Utah Jazz,0.537973
3,Boston Celtics,Brooklyn Nets,0.888189
4,Indiana Pacers,Dallas Mavericks,0.566591
5,Houston Rockets,Los Angeles Lakers,0.860980
6,Memphis Grizzlies,Minnesota Timberwolves,0.608598
7,Charlotte Hornets,Milwaukee Bucks,0.736217
8,New Orleans Pelicans,Denver Nuggets,0.503045
9,Miami Heat,Orlando Magic,0.727536
