In [22]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json


# Get matchups and prediction inputs

In [23]:
def get_matchups():
    """
    Retrieves the matchups for the next game day from the ESPN fixtures page.

    The schedule tables are scanned in page order. The first table whose
    header contains the text 'time' is taken as the next game day; once a
    game date has been read for it the scan stops.

    :return: a tuple ``(matchup_list, game_date)``. ``matchup_list`` holds
        strings of the form ``'<AWAY> (away) vs. <HOME> (home)'`` and
        ``game_date`` is the text of the matching date heading. When no
        such table is found the result is ``([], 'No upcoming games.')``.
    :rtype: tuple(list, str)
    """
    url = 'https://www.espn.com.sg/nba/fixtures'
    # Bound the request so a stalled connection cannot hang the caller forever.
    r = requests.get(url, timeout=10)
    # The parser is chosen through `features`; the contradictory
    # `parser='html.parser'` keyword of the earlier call has been dropped.
    soup = BeautifulSoup(r.text, features="lxml")
    game_containers = soup.findAll('table', {'class': 'schedule has-team-logos align-left'})

    game_date = ''
    game_matchup = None  # <tbody> of the selected table, once found
    counter = 0          # index of the date heading (<h2>) paired with the current table
    for game in game_containers:
        try:
            if 'time' in game.thead.text:
                game_matchup = game.tbody
                game_date = soup.findAll('div', {'id': 'sched-container'})[0].findAll('h2')[counter].text
        except AttributeError:
            # E.g. a table without a <thead>: skip it without advancing
            # the heading counter.
            continue
        counter += 1

        if game_date != '':
            break

    # Nothing matched (or the matching table had no body): report no games
    # instead of failing on a missing element further down.
    if game_matchup is None:
        return [], 'No upcoming games.'

    teams_playing = game_matchup.findAll('a', {'class': 'team-name'})
    # Tip-off times are not needed here; if they ever are, they live in
    # game_matchup.findAll('td', {'data-behavior': 'date_time'}).

    # Abbreviations scraped from the page that are rewritten to the
    # three-letter codes used in the matchup strings.
    error_name = {
        "GS": "GSW",
        "SA": "SAS",
        "WSH": "WAS",
        "NO": "NOP",
        "UTAH": "UTA",
        "NY": "NYK"
    }

    # The last whitespace-separated token of each link text is the abbreviation.
    abbreviations = []
    for team in teams_playing:
        abbreviation = team.text.split()[-1]
        abbreviations.append(error_name.get(abbreviation, abbreviation))

    # Links come in (away, home) order; zip() pairs them up and ignores a
    # trailing unpaired link instead of raising IndexError.
    matchup_list = [
        '{} (away) vs. {} (home)'.format(away, home)
        for away, home in zip(abbreviations[0::2], abbreviations[1::2])
    ]
    return matchup_list, game_date

def get_team_stats(home_team, away_team, stats_path="../data/team_stats.json"):
    """
    Retrieves both teams' stats from a team-stats JSON file and calculates
    the disparity (home value minus away value) for every stat.

    :param home_team: key of the home team in the JSON file
    :param away_team: key of the away team in the JSON file
    :param stats_path: path of the JSON file to read; defaults to
        "../data/team_stats.json"
    :return: match prediction inputs as a single-row frame with the columns
        HOME_TEAM, the home stats (``*_x``), AWAY_TEAM, the away stats
        (``*_y``) and the disparities (``DIS_*``), in that order
    :rtype: df
    :raises KeyError: if a team, or one of its stats, is missing from the file
    """
    # (key in the JSON file, suffix of the matching DIS_ column)
    stat_columns = (
        ("AVG_PTS", "PTS"),
        ("AVG_AST", "AST"),
        ("AVG_OREB", "OREB"),
        ("AVG_DREB", "DREB"),
        ("OFFRATE", "OFFRATE"),
        ("DEFRATE", "DEFRATE"),
        ("ELO", "ELO"),
    )

    with open(stats_path, "r") as jsonFile:
        data = json.load(jsonFile)

    # Fail with a message that names the missing team and the file consulted.
    for team in (home_team, away_team):
        if team not in data:
            raise KeyError("Team {!r} not found in {}".format(team, stats_path))

    home_stats = data[home_team]
    away_stats = data[away_team]

    # Build the whole row first and create the frame once; dict insertion
    # order fixes the column order.
    row = {"HOME_TEAM": home_team}
    for key, _ in stat_columns:
        row[key + "_x"] = home_stats[key]

    row["AWAY_TEAM"] = away_team
    for key, _ in stat_columns:
        row[key + "_y"] = away_stats[key]

    for key, short_name in stat_columns:
        row["DIS_" + short_name] = home_stats[key] - away_stats[key]

    return pd.DataFrame([row])

In [24]:
# Scrape the fixtures page once; get_matchups() returns the list of matchup
# strings together with the game-date text. The bare tuple on the last line
# is what the cell displays.
matchups, game_date = get_matchups()
matchups, game_date

(['DEN (away) vs. CHA (home)',
  'MIN (away) vs. DET (home)',
  'MIA (away) vs. BOS (home)',
  'LAC (away) vs. TOR (home)',
  'PHI (away) vs. IND (home)',
  'BKN (away) vs. CHI (home)',
  'DAL (away) vs. MEM (home)',
  'ORL (away) vs. MIL (home)',
  'PHX (away) vs. GSW (home)',
  'OKC (away) vs. SAC (home)',
  'NYK (away) vs. LAL (home)'],
 'Tuesday, May 11')

In [32]:
# Pick one of the scraped matchups and build the model inputs for it.
game = matchups[2]
# Matchup strings have the form '<AWAY> (away) vs. <HOME> (home)'. Split on
# the labels rather than slicing fixed character offsets, which silently
# returns the wrong text for any abbreviation that is not 3 characters long.
away, home = game.replace(' (home)', '').split(' (away) vs. ')
game_df = get_team_stats(home, away)
game_df

Unnamed: 0,HOME_TEAM,AVG_PTS_x,AVG_AST_x,AVG_OREB_x,AVG_DREB_x,OFFRATE_x,DEFRATE_x,ELO_x,AWAY_TEAM,AVG_PTS_y,...,OFFRATE_y,DEFRATE_y,ELO_y,DIS_PTS,DIS_AST,DIS_OREB,DIS_DREB,DIS_OFFRATE,DIS_DEFRATE,DIS_ELO
0,BOS,112.911765,23.5,10.75,33.632353,1.130314,1.077317,1504.243538,MIA,107.617647,...,1.103837,1.158139,1549.217035,5.294118,-2.779412,2.764706,0.132353,0.026477,-0.080822,-44.973497


# Train model and make prediction

In [26]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score,roc_curve,auc,recall_score,f1_score,precision_score,classification_report,confusion_matrix,auc

In [27]:
# Load the season history; later cells use this frame as the training data
# (feature columns plus the 'WL_x' and 'PLUS_MINUS_x' targets).
df = pd.read_csv("../data/season_history.csv")
df

Unnamed: 0,TEAM_ABBREVIATION_x,TEAM_NAME_x,GAME_ID,GAME_DATE_x,WL_x,MIN_x,PTS_x,FGM_x,FGA_x,FG_PCT_x,...,AVG_OREB_x,AVG_OREB_y,AVG_DREB_x,AVG_DREB_y,OFFRATE_x,OFFRATE_y,DEFRATE_x,DEFRATE_y,ELO_x,ELO_y
0,BOS,Boston Celtics,22000007,2020-12-25,0,241,95,37,98,0.378,...,10.000000,13.000000,27.000000,44.000000,1.203156,1.118068,1.193294,0.885510,1510.000000,1510.000000
1,MIA,Miami Heat,22000005,2020-12-25,1,237,111,38,75,0.507,...,7.000000,8.000000,36.000000,37.000000,1.013258,1.086538,1.070076,0.951923,1490.000000,1510.000000
2,MIL,Milwaukee Bucks,22000006,2020-12-25,1,241,138,50,90,0.556,...,11.000000,13.000000,41.000000,34.000000,1.183953,0.874558,1.193738,1.104240,1490.000000,1490.000000
3,LAL,Los Angeles Lakers,22000008,2020-12-25,1,237,138,51,91,0.560,...,8.000000,6.000000,37.000000,33.000000,1.044061,1.015936,1.111111,1.055777,1490.000000,1490.000000
4,DEN,Denver Nuggets,22000009,2020-12-25,0,238,108,37,81,0.457,...,10.000000,11.000000,36.000000,29.000000,1.083481,1.098485,1.101243,1.032197,1490.000000,1510.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1004,MEM,Memphis Grizzlies,22001028,2021-05-10,1,239,115,45,91,0.495,...,11.194030,11.764706,35.328358,35.602941,1.109989,1.131152,1.110721,1.079907,1506.687764,1488.040762
1005,SAS,San Antonio Spurs,22001029,2021-05-10,1,239,146,56,94,0.596,...,9.208955,10.313433,34.552239,37.865672,1.097622,1.159508,1.164540,1.046213,1469.758463,1602.551441
1006,CLE,Cleveland Cavaliers,22001027,2021-05-10,0,241,102,33,93,0.355,...,10.455882,8.970149,31.955882,33.656716,1.052333,1.110935,1.137916,1.140886,1340.976366,1461.999074
1007,POR,Portland Trail Blazers,22001031,2021-05-10,1,240,140,47,92,0.511,...,10.779412,9.367647,33.882353,33.250000,1.157820,1.058555,1.091289,1.140911,1548.752937,1310.913330


In [28]:
# Candidate features: the home-minus-away disparities.
features_list = ['DIS_ELO', 'DIS_OFFRATE', 'DIS_DEFRATE', 'DIS_PTS', 'DIS_AST', 'DIS_OREB', 'DIS_DREB']
target = 'WL_x'

# Creating our independent and dependent variables
x = df[features_list]
y = df['PLUS_MINUS_x']

# NOTE(review): no constant column is added, so this OLS fit has no
# intercept -- confirm that is intended.
model = sm.OLS(y,x)
results = model.fit()

# Keep only the features whose coefficient has a p-value of at most 0.05.
# With a DataFrame as input, results.pvalues is a Series labelled by column
# name, so iterate (label, value) pairs instead of indexing it with integer
# positions, which pandas has deprecated for label-indexed Series.
features_list = [
    name for name, p_value in results.pvalues.items() if p_value <= 0.05
]

df, features_list

(     TEAM_ABBREVIATION_x             TEAM_NAME_x   GAME_ID GAME_DATE_x  WL_x  \
 0                    BOS          Boston Celtics  22000007  2020-12-25     0   
 1                    MIA              Miami Heat  22000005  2020-12-25     1   
 2                    MIL         Milwaukee Bucks  22000006  2020-12-25     1   
 3                    LAL      Los Angeles Lakers  22000008  2020-12-25     1   
 4                    DEN          Denver Nuggets  22000009  2020-12-25     0   
 ...                  ...                     ...       ...         ...   ...   
 1004                 MEM       Memphis Grizzlies  22001028  2021-05-10     1   
 1005                 SAS       San Antonio Spurs  22001029  2021-05-10     1   
 1006                 CLE     Cleveland Cavaliers  22001027  2021-05-10     0   
 1007                 POR  Portland Trail Blazers  22001031  2021-05-10     1   
 1008                 ATL           Atlanta Hawks  22001026  2021-05-10     1   
 
       MIN_x  PTS_x  FGM_x

In [34]:
# Models that each cast one vote on the outcome; comment entries in or out
# to change the ensemble.
models_dict = {
        'Linear Regression': LinearRegression(),
        'Logistic Regression':LogisticRegression(),
        'Naive Bayes':GaussianNB(),
        # 'Decision Trees':DecisionTreeClassifier(),
        'SVM linear': svm.SVC(kernel='linear'),
        'SVM rbf': svm.SVC(kernel='rbf'),
        # 'Random Forest': RandomForestClassifier(n_estimators = 100),
        # 'XGBoost': xgb.XGBClassifier(use_label_encoder=False)
    }

prediction_data = {} # store prediction for each model

# The training matrix and the single-game input row are identical for every
# model, so build them once outside the loop.
X_train = df[features_list]
X_test = game_df[features_list]

for model_name, m in models_dict.items():
    # The linear regression is fit on PLUS_MINUS_x; every other model is
    # fit on the WL_x label.
    is_linear_regression = model_name == 'Linear Regression'
    y_train = df['PLUS_MINUS_x'] if is_linear_regression else df['WL_x']

    m.fit(X_train, y_train)
    prediction = m.predict(X_test)

    if is_linear_regression:
        # Collapse the regression output to a 0/1 vote: positive -> 1, else 0.
        prediction[0] = 1 if prediction[0] > 0 else 0

    prediction_data[model_name] = prediction[0]

    print(model_name + ':', prediction[0])

# Average over the models that actually ran rather than a hard-coded 5, so
# the score stays correct when entries in models_dict are toggled.
final_prediction = sum(prediction_data.values()) / len(prediction_data)
print('Average outcome score:', final_prediction)
print('Predicted Outcome:', round(final_prediction))

Linear Regression: 1.0
Logistic Regression: 0
Naive Bayes: 1
SVM linear: 0
SVM rbf: 0
Average outcome score: 0.4
Predicted Outcome: 0
