# In [None]:  (Jupyter notebook cell marker kept as a comment so the file parses as Python)
import requests
import bs4
import math
import time
from bs4 import BeautifulSoup
import os, pickle
import pandas as pd
import numpy as np
os.chdir('C:/omnivest/simulation')
import csv

# A schedule page URL is assembled as
#   url_start + '<season>_games-<month>' + url_end
url_start = 'http://www.basketball-reference.com/leagues/NBA_'
url_end = '.html'

# function for scraping the data from the url

def get_table(year,month):
    """Scrape the finished games from one monthly schedule page.

    Parameters
    ----------
    year : int or str
        Season identifier inserted into the page URL (e.g. ``2017``).
    month : str
        Month name inserted into the page URL (e.g. ``'october'``).

    Returns
    -------
    list of list
        One entry per kept row: ``[year, month]`` followed by the stripped,
        non-empty ``<td>`` texts of that row, in page order.  Empty when the
        page contains no qualifying rows.

    Raises
    ------
    requests.exceptions.RequestException
        On network failure, timeout, or a non-2xx HTTP status.
    ValueError
        If the page does not contain the expected schedule table.
    """
    url = url_start + str(year) + '_games-' + month + url_end
    # A timeout keeps a stalled connection from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    table = soup.find('table', attrs={'class':'suppress_glossary sortable stats_table'})
    table_body = table.find('tbody') if table is not None else None
    if table_body is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('no schedule table found at ' + url)

    finished = []
    for row in table_body.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        # Rows without a third cell (e.g. header-only rows) are skipped; an
        # empty third cell marks an unfinished game (presumably the visitor's
        # points column -- verify against the page layout).
        if len(cells) < 3 or not cells[2]:
            continue
        # Empty cells are dropped, so positions in the record refer to the
        # non-empty cells only; downstream code indexes into this layout.
        finished.append([year, month] + [text for text in cells if text])
    return finished


# reading the data from 2015-2017 to data

seasons=[2015,2016,2017]
months=['october', 'november', 'december', 'january', 'february',
       'march', 'april', 'may','june']
# Accumulates the scraped records of every season/month page that loads.
data=[]
for season in seasons:
    for month in months:
        try:
            temps = get_table(season, month)
        except Exception:
            # Best effort: a page that cannot be fetched or parsed is skipped.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit
            # still stop the scrape.
            print(str(season) + month + ' not included')
        else:
            # An empty month contributes nothing.
            data.extend(temps)

            
# writing data into csv

# Column titles for the exported file; element 2 of each scraped record is
# not exported.
header = ['year', 'month', 'away_tm','away_score','home_tm','home_score']
with open('game_data.csv','w',newline='') as out_file:
    out = csv.writer(out_file, delimiter=',')
    out.writerow(header)
    # One CSV line per scraped record, picking elements 0, 1, 3, 4, 5 and 6.
    out.writerows(
        [record[0], record[1], record[3], record[4], record[5], record[6]]
        for record in data
    )
        
        
# read into dataframe
game_data = pd.read_csv('game_data.csv')

# Derived per-game columns used by the estimators below:
#   diff = away score minus home score
#   sum  = combined score of both teams
away_pts = game_data['away_score']
home_pts = game_data['home_score']
game_data['diff'] = away_pts - home_pts
game_data['sum'] = away_pts + home_pts

# get the list of NBA teams

# Distinct home-team names, in order of first appearance in the data.
teams = game_data['home_tm'].drop_duplicates().values.tolist()

# Same names as a one-column frame; per-team estimates are attached later.
teams_df = pd.DataFrame(data={'name': teams})


# create the battle form (get home team & away team)
# Every ordered (away, home) pairing of two different teams.  The outer loop
# variable fills the away column and the inner one the home column.
matchups = [(away, home) for away in teams for home in teams if away != home]
games = pd.DataFrame(matchups, columns=['a_tm', 'h_tm'])

# Index each fixture by an "<away> vs <home>" label, with no index name.
games['Battle'] = games['a_tm']+' vs '+games['h_tm']
games = games.set_index('Battle')
games.index.name=None



# simulation

class Team:
    """Shrinkage estimates of one team's scoring from a table of games.

    Parameters
    ----------
    name : str
        Team name as it appears in the ``away_tm`` / ``home_tm`` columns.
    data : pandas.DataFrame
        Game table with columns ``away_tm``, ``home_tm``, ``diff`` and
        ``sum`` (one row per game).
    """

    def __init__(self, name, data):
        self.name=name
        self.data=data

        # Rows in which this team played away / at home.
        self.a_games=data.loc[data['away_tm']==name]
        self.h_games=data.loc[data['home_tm']==name]

        # Statistics over every game in `data` (not just this team's):
        # mean and variance of the combined score, variance of the margin.
        self.lmda=data['sum'].mean()
        self.sumvar=data['sum'].var()
        self.diffvar=data['diff'].var()

    def sscore(self):
        """Return the team's mean combined score, shrunk toward the overall mean.

        The sample mean over the team's away and home games is pulled toward
        ``self.lmda`` with weight ``b_n = 1 / (1 + 3 / (n * sumvar))``.
        A team with no games gets ``self.lmda`` (the ``n -> 0`` limit of the
        formula) rather than NaN.
        """
        totals=self.a_games['sum'].values.tolist()
        totals.extend(self.h_games['sum'].values.tolist())
        if not totals:
            return self.lmda
        # NOTE(review): the constant 3 is presumably a prior-strength
        # parameter of the shrinkage model -- confirm against the model spec.
        b_n=1/(1+3/(len(totals)*self.sumvar))
        return self.lmda+b_n*(np.mean(totals)-self.lmda)

    def difscore(self):
        """Return the team's mean margin per game, shrunk toward zero.

        ``diff`` is taken as-is for away games and negated for home games,
        so every value is from this team's point of view.  The sample mean
        is scaled by ``a_n = 1 / (1 + 3 / (n * diffvar))``.  A team with no
        games gets ``0.0`` (the ``n -> 0`` limit) rather than NaN.
        """
        margins=self.a_games['diff'].values.tolist()
        margins.extend(-x for x in self.h_games['diff'].values.tolist())
        if not margins:
            return 0.0
        a_n=1/(1+3/(len(margins)*self.diffvar))
        return a_n*np.mean(margins)
    
    
# `diff` is away minus home, so the negated mean is the average margin in
# favour of the home side.
margin = game_data['diff']
c_home = -margin.mean()
# average combined score over all games
lmda = game_data['sum'].mean()

# \sigma^2_(\delta G)
diffvar = margin.var()

# variance of q_ij
vqij = 2 * diffvar

# simulation paths
N = 1000


# the \Delta G_i and \Sigma G_i for each team, in teams_df row order
diff_li = [Team(team_name, game_data).difscore() for team_name in teams_df['name']]
sum_li = [Team(team_name, game_data).sscore() for team_name in teams_df['name']]
teams_df['dlt_g'] = diff_li
teams_df['sigma_g'] = sum_li

# look teams up by name from here on; the index itself stays unnamed
teams_df = teams_df.set_index('name').rename_axis(None)

#game related functions
#tm1 is away team, tm2 is home team

class Game:
    """One fixture between an away team (``tm1``) and a home team (``tm2``).

    Construction reads the module-level ``teams_df`` (per-team ``sigma_g``
    and ``dlt_g``, indexed by team name), ``lmda``, ``c_home`` and ``N``.

    Attributes
    ----------
    sgoals : expected combined score of the two teams
    mqij   : expected away-minus-home margin, after subtracting ``c_home``
    times  : number of simulation paths (``N`` at construction time)
    """

    def __init__(self, tm1, tm2):
        self.tm1=tm1
        self.tm2=tm2
        # value of sum of goals g_i+g_j
        # (.loc replaces .ix, which was removed in pandas 1.0)
        self.sgoals=teams_df.loc[self.tm1,'sigma_g']+teams_df.loc[self.tm2,'sigma_g']-lmda
        # mean value of q_ij considering home advantage
        # attention i is away team here
        self.mqij=teams_df.loc[self.tm1,'dlt_g']-teams_df.loc[self.tm2,'dlt_g']-c_home
        self.times=N

    def sim_result(self):
        """Monte-Carlo estimate of the final score.

        Draws ``self.times`` Poisson-distributed combined scores with mean
        ``a_mean + h_mean``, averages them, and splits the average between
        the teams in proportion to their expected scores.

        Returns
        -------
        tuple
            ``(g_i, g_j)``: rounded away and home scores.
        """
        # Expected points per side, from the total and margin held on this
        # instance (a_mean + h_mean == sgoals, a_mean - h_mean == mqij).
        a_mean=(self.mqij+self.sgoals)/2
        h_mean=(self.sgoals-self.mqij)/2
        rate=a_mean+h_mean
        # The number of arrivals of a rate-`rate` Poisson process in one unit
        # of time is Poisson(rate); sampling it directly avoids counting the
        # arrival that overshoots the interval and avoids log(0).
        mean_total=np.random.poisson(rate, size=self.times).mean()
        g_i=round(mean_total*a_mean/rate)
        g_j=round(mean_total*h_mean/rate)
        return g_i, g_j
    
    
#the time needed is 354 sec
#Get for each game the simulation result

def all_sim():
    """Simulate every fixture in ``games`` and store the scores in place.

    For each row, builds a ``Game`` from its ``a_tm`` / ``h_tm`` values and
    writes the simulated away and home scores into the new columns
    ``sim_ascore`` and ``sim_hscore`` of the module-level ``games`` frame.
    Prints the elapsed wall-clock time and returns ``None``.
    """
    started=time.time()
    outcomes=[Game(away, home).sim_result()
              for away, home in zip(games['a_tm'], games['h_tm'])]
    games['sim_ascore']=[away_pts for away_pts, _ in outcomes]
    games['sim_hscore']=[home_pts for _, home_pts in outcomes]
    elapsed=time.time()-started
    print('Time used for this simulation is '+str(elapsed)+' seconds')
    return

# Fill `games` with the simulated scores (adds sim_ascore / sim_hscore) ...
all_sim()
# ... and save the full fixture table, index labels included, next to the input.
games.to_csv("simulation_result.csv")

