In [1]:
import numpy as np
import pandas as pd
import requests

import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from bs4 import BeautifulSoup as bs
import datetime
import time
from sklearn.model_selection import train_test_split

In [2]:
stat_list = {'win_loss_pct':168,
             'assist_turnover_ratio':474, 
             'assists_per_game':216, 
             'd_rebounds':859, 
             'fg_pct':148,
             'd_fg_pct':149,
             'ft_pct':150,
             'o_rebounds':857,
             'd_scoring':147,
             'o_scoring':145,
             'three_pct':152,
             'turnover_margin':519}

team_stats = {}

for stat in stat_list.keys():
    status_code = True
    num = 1
    while status_code == True:
        
        r = requests.get(f'https://www.ncaa.com/stats/basketball-men/d1/current/team/{stat_list[stat]}/p{num}')
        
        if r.status_code == 200:
            status_code = True
            soup = bs(r.content,'html.parser')
            for team in soup.table.find_all('tr')[1:]:
                if stat == 'win_loss_pct' and team.a.text != 'Northern Ariz.':
                    team_stats[team.a.text] = {}
                    team_stats[team.a.text][stat] = team.find_all('td')[4].text
                if stat in ['assist_turnover_ratio','fg_pct','d_fg_pct','ft_pct','three_pct','turnover_margin'] and team.a.text != 'Northern Ariz.':
                    team_stats[team.a.text][stat] = team.find_all('td')[5].text
                if stat in ['assists_per_game','d_rebounds','o_rebounds','d_scoring','o_scoring'] and team.a.text != 'Northern Ariz.':
                    team_stats[team.a.text][stat] = team.find_all('td')[4].text
                    
        else:
            status_code = False
        if not status_code:
            continue
        

        num += 1
        
        
           
df = pd.DataFrame.from_dict(team_stats,orient='index')
df.reset_index(inplace = True)

In [11]:
all_scores = []

all_dates = pd.date_range(datetime.date(2021,10,16), periods=150).tolist()
for date in all_dates:
    r = requests.get(f'https://www.ncaa.com/scoreboard/basketball-men/d1/{str(date.year)}/{str(date.month).zfill(2)}/{str(date.day).zfill(2)}/all-conf')
    r.content
    if r.status_code == 200:
        soup = bs(r.content,'html.parser')
        for game in soup.find_all(class_="gamePod-game-teams"):
            team_one = game.find_all(class_="gamePod-game-team-name")[0].text
            team_one_score = game.find_all(class_="gamePod-game-team-score")[0].text
            team_two = game.find_all(class_="gamePod-game-team-name")[1].text
            team_two_score = game.find_all(class_="gamePod-game-team-score")[1].text
            game_score = {'team_one':team_one, 'team_one_score':team_one_score,'team_two':team_two,'team_two_score':team_two_score, 'date':date}
            all_scores.append(game_score)
    print(date,status_code)

    

2021-10-16 00:00:00 False
2021-10-17 00:00:00 False
2021-10-18 00:00:00 False
2021-10-19 00:00:00 False
2021-10-20 00:00:00 False
2021-10-21 00:00:00 False
2021-10-22 00:00:00 False
2021-10-23 00:00:00 False
2021-10-24 00:00:00 False
2021-10-25 00:00:00 False
2021-10-26 00:00:00 False
2021-10-27 00:00:00 False
2021-10-28 00:00:00 False
2021-10-29 00:00:00 False
2021-10-30 00:00:00 False
2021-10-31 00:00:00 False
2021-11-01 00:00:00 False
2021-11-02 00:00:00 False
2021-11-03 00:00:00 False
2021-11-04 00:00:00 False
2021-11-05 00:00:00 False
2021-11-06 00:00:00 False
2021-11-07 00:00:00 False
2021-11-08 00:00:00 False
2021-11-09 00:00:00 False
2021-11-10 00:00:00 False
2021-11-11 00:00:00 False
2021-11-12 00:00:00 False
2021-11-13 00:00:00 False
2021-11-14 00:00:00 False
2021-11-15 00:00:00 False
2021-11-16 00:00:00 False
2021-11-17 00:00:00 False
2021-11-18 00:00:00 False
2021-11-19 00:00:00 False
2021-11-20 00:00:00 False
2021-11-21 00:00:00 False
2021-11-22 00:00:00 False
2021-11-23 0

In [12]:
scores_df = pd.DataFrame(all_scores)
scores_df = scores_df[scores_df['team_one_score']!='']

df.rename(columns={'index':'team_name'},inplace = True)

In [13]:
combined_df = pd.merge(scores_df,df,how='inner',left_on = 'team_one',right_on = 'team_name')
combined_df.rename(columns = {'team_name':'one_team_name',
                              'win_loss_pct':'one_win_loss_pct',
                              'assist_turnover_ratio':'one_assist_turnover_ratio',
                              'assists_per_game':'one_assists_per_game',                              
                              'd_rebounds':'one_d_rebounds',
                              'fg_pct':'one_fg_pct',
                              'd_fg_pct':'one_d_fg_pct',
                              'ft_pct':'one_ft_pct',
                              'o_rebounds':'one_o_rebounds',
                              'd_scoring':'one_d_scoring',
                              'o_scoring':'one_o_scoring',
                              'three_pct':'one_three_pct',
                              'turnover_margin':'one_turnover_margin'},inplace=True)
combined_df = pd.merge(combined_df,df,how='inner',left_on = 'team_two',right_on = 'team_name')
combined_df.rename(columns = {'team_name':'two_team_name',
                              'win_loss_pct':'two_win_loss_pct',
                              'assist_turnover_ratio':'two_assist_turnover_ratio',
                              'assists_per_game':'two_assists_per_game',                              
                              'd_rebounds':'two_d_rebounds',
                              'fg_pct':'two_fg_pct',
                              'd_fg_pct':'two_d_fg_pct',
                              'ft_pct':'two_ft_pct',
                              'o_rebounds':'two_o_rebounds',
                              'd_scoring':'two_d_scoring',
                              'o_scoring':'two_o_scoring',
                              'three_pct':'two_three_pct',
                              'turnover_margin':'two_turnover_margin'},inplace=True)

In [295]:
combined_df.dropna(inplace=True)
combined_df['o_rebound'] = combined_df['one_o_rebounds'].astype(float) - combined_df['two_o_rebounds'].astype(float)
combined_df['at_ratio'] = combined_df['one_assist_turnover_ratio'].astype(float) - combined_df['two_assist_turnover_ratio'].astype(float)
combined_df['assists'] = combined_df['one_assists_per_game'].astype(float) - combined_df['two_assists_per_game'].astype(float)
combined_df['d_rebound'] = combined_df['one_d_rebounds'].astype(float)- combined_df['two_d_rebounds'].astype(float)
combined_df['fg_pct'] = combined_df['one_fg_pct'].astype(float) - combined_df['two_fg_pct'].astype(float)
combined_df['d_fg_pct'] = combined_df['one_d_fg_pct'].astype(float) - combined_df['one_d_fg_pct'].astype(float)
combined_df['ft_pct'] = combined_df['one_ft_pct'].astype(float) - combined_df['two_ft_pct'].astype(float)
combined_df['d_scoring'] = combined_df['one_d_scoring'].astype(float) - combined_df['two_d_scoring'].astype(float)
combined_df['o_scoring'] = combined_df['one_o_scoring'].astype(float) - combined_df['two_o_scoring'].astype(float)
combined_df['three_pct'] = combined_df['one_three_pct'].astype(float) - combined_df['two_three_pct'].astype(float)

X = combined_df[['one_win_loss_pct', 'one_assist_turnover_ratio',
       'one_assists_per_game', 'one_d_rebounds', 'one_fg_pct', 'one_d_fg_pct',
       'one_ft_pct', 'one_o_rebounds', 'one_d_scoring', 'one_o_scoring',
       'one_three_pct','two_win_loss_pct', 'two_assist_turnover_ratio', 'two_assists_per_game',
       'two_d_rebounds', 'two_fg_pct', 'two_d_fg_pct', 'two_ft_pct',
       'two_o_rebounds', 'two_d_scoring', 'two_o_scoring', 'two_three_pct']]

X = combined_df[['one_d_rebounds', 'one_assist_turnover_ratio','one_fg_pct', 'one_d_fg_pct','one_ft_pct', 
                     'one_o_rebounds','one_o_scoring','one_d_scoring', 'one_three_pct',
                     'two_d_rebounds', 'two_assist_turnover_ratio','two_fg_pct', 'two_d_fg_pct', 'two_ft_pct',
                     'two_o_rebounds', 'two_o_scoring','two_d_scoring', 'two_three_pct']]

# X = combined_df[['one_d_rebounds', 'one_fg_pct', 'one_d_fg_pct', 
#                      'one_o_rebounds','one_three_pct',
#                      'two_d_rebounds','two_fg_pct', 'two_d_fg_pct', 
#                      'two_o_rebounds', 'two_three_pct']]

# X = combined_df[['o_rebound','at_ratio','assists','d_rebound','fg_pct','d_fg_pct','ft_pct','d_scoring','o_scoring','three_pct']]

y = combined_df[['team_one_score','team_two_score']]

In [296]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
regr = RandomForestRegressor(n_estimators=200,min_samples_leaf=10)
regr.fit(x_train,y_train)
regr_preds = regr.predict(x_test)

rf_y_train = y_train['team_one_score'] > y_train['team_two_score']
rf_y_test = y_test['team_one_score'] > y_test['team_two_score']

rf = RandomForestClassifier(n_estimators=200,min_samples_leaf=10)
rf.fit(x_train,rf_y_train)
rf_preds = rf.predict(x_test)

In [297]:
regr_prediction_df = y_test.copy()
regr_prediction_df['team_one_pred'] = regr_preds[:,0]
regr_prediction_df['team_two_pred'] = regr_preds[:,1]
scores_and_preds = scores_df.copy()
scores_and_preds['team_one_pred'] = regr_prediction_df['team_one_pred']
scores_and_preds['team_two_pred'] = regr_prediction_df['team_two_pred']
scores_and_preds = scores_and_preds[scores_and_preds['team_one_pred'].notnull()]
def winner(row):
    if int(row['team_one_score']) > int(row['team_two_score']):
        return row['team_one']
    else:
        return row['team_two']
scores_and_preds['winner'] = scores_and_preds.apply(winner,axis=1)
def pred_winner(row):
    if int(row['team_one_pred']) > int(row['team_two_pred']):
        return row['team_one']
    else:
        return row['team_two']
scores_and_preds['pred_winner'] = scores_and_preds.apply(pred_winner,axis=1)


In [298]:
len(scores_and_preds[scores_and_preds['winner']==scores_and_preds['pred_winner']])/len(scores_and_preds)

0.5845117845117845

In [299]:
print(confusion_matrix(rf_y_test,rf_preds))
print(classification_report(rf_y_test,rf_preds))

[[769 156]
 [286 291]]
              precision    recall  f1-score   support

       False       0.73      0.83      0.78       925
        True       0.65      0.50      0.57       577

    accuracy                           0.71      1502
   macro avg       0.69      0.67      0.67      1502
weighted avg       0.70      0.71      0.70      1502



In [288]:
date = datetime.date.today()
new_games = []
r = requests.get(f'https://www.ncaa.com/scoreboard/basketball-men/d1/{str(date.year)}/{str(date.month).zfill(2)}/{str(date.day).zfill(2)}/all-conf')
r.content
soup = bs(r.content,'html.parser')
for game in soup.find_all(class_="gamePod-game-teams"):
    team_one = game.find_all(class_="gamePod-game-team-name")[0].text
    team_two = game.find_all(class_="gamePod-game-team-name")[1].text
    new_game = {'team_one':team_one, 'team_two':team_two,'date':date}
    new_games.append(new_game)
new_games_df = pd.DataFrame(new_games)

In [289]:
new_combined_df = pd.merge(new_games_df,df,how='inner',left_on = 'team_one',right_on = 'team_name')
new_combined_df.rename(columns = {'team_name':'one_team_name',
                              'win_loss_pct':'one_win_loss_pct',
                              'assist_turnover_ratio':'one_assist_turnover_ratio',
                              'assists_per_game':'one_assists_per_game',                              
                              'd_rebounds':'one_d_rebounds',
                              'fg_pct':'one_fg_pct',
                              'd_fg_pct':'one_d_fg_pct',
                              'ft_pct':'one_ft_pct',
                              'o_rebounds':'one_o_rebounds',
                              'd_scoring':'one_d_scoring',
                              'o_scoring':'one_o_scoring',
                              'three_pct':'one_three_pct',
                              'turnover_margin':'one_turnover_margin'},inplace=True)
new_combined_df = pd.merge(new_combined_df,df,how='inner',left_on = 'team_two',right_on = 'team_name')
new_combined_df.rename(columns = {'team_name':'two_team_name',
                              'win_loss_pct':'two_win_loss_pct',
                              'assist_turnover_ratio':'two_assist_turnover_ratio',
                              'assists_per_game':'two_assists_per_game',                              
                              'd_rebounds':'two_d_rebounds',
                              'fg_pct':'two_fg_pct',
                              'd_fg_pct':'two_d_fg_pct',
                              'ft_pct':'two_ft_pct',
                              'o_rebounds':'two_o_rebounds',
                              'd_scoring':'two_d_scoring',
                              'o_scoring':'two_o_scoring',
                              'three_pct':'two_three_pct',
                              'turnover_margin':'two_turnover_margin'},inplace=True)
new_combined_df.dropna(inplace=True)

In [462]:
new_x = new_combined_df[['one_win_loss_pct', 'one_assist_turnover_ratio',
       'one_assists_per_game', 'one_d_rebounds', 'one_fg_pct', 'one_d_fg_pct',
       'one_ft_pct', 'one_o_rebounds', 'one_d_scoring', 'one_o_scoring',
       'one_three_pct', 'two_win_loss_pct', 'two_assist_turnover_ratio', 'two_assists_per_game',
       'two_d_rebounds', 'two_fg_pct', 'two_d_fg_pct', 'two_ft_pct',
       'two_o_rebounds', 'two_d_scoring', 'two_o_scoring', 'two_three_pct']].dropna()
new_preds = rf.predict(new_x)
new_regr_preds = regr.predict(new_x)
new_combined_df['team_one_win'] = new_preds
new_combined_df['team_one_score'] = new_regr_preds[:,0]
new_combined_df['team_two_score'] = new_regr_preds[:,1]
def new_winner(row):
    if row['team_one_win'] == True:
        return row['team_one']
    else:
        return row['team_two']
new_combined_df['pred_winner'] = new_combined_df.apply(new_winner, axis=1)
new_combined_df[['team_one','team_two','one_win_loss_pct','two_win_loss_pct','team_one_score','team_two_score','pred_winner']]

Unnamed: 0,team_one,team_two,one_win_loss_pct,two_win_loss_pct,team_one_score,team_two_score,pred_winner
0,Charleston So.,UNC Asheville,17.2,55.2,66.861567,80.125652,UNC Asheville
1,Hampton,High Point,33.3,43.3,61.719854,69.738265,High Point
2,Xavier,St. John's (NY),60.7,53.6,73.313563,76.033265,St. John's (NY)
3,SIUE,Tennessee St.,35.5,43.3,65.879545,75.925666,Tennessee St.
4,Saint Francis (PA),Wagner,31.0,79.2,61.370382,78.279485,Wagner
5,Oklahoma St.,Iowa St.,46.4,69.0,62.562604,68.438025,Iowa St.
6,St. Francis Brooklyn,Mount St. Mary's,34.5,46.4,61.646268,68.356616,Mount St. Mary's
7,Sacred Heart,LIU,34.5,53.6,70.410475,82.049264,LIU
8,Georgia Tech,Clemson,37.9,48.3,66.19445,73.383342,Clemson
9,Saint Joseph's,La Salle,37.0,30.8,70.467331,71.656528,Saint Joseph's


In [300]:
import pickle
pickle.dump(rf, open('ncaa_rfc', 'wb'))
pickle.dump(regr, open('ncaa_rfr', 'wb'))