In [37]:
# import packages
import os
import tarfile
import urllib
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

TEAM_PATH = './resources/cleanTeams/michiganStateClean.csv'

In [6]:
def load_data(path=TEAM_PATH):
    return pd.read_csv(path,header=0)
load_data()

Unnamed: 0,G,Date,Court,Opp,W/L,Tm,Opp.1,FG,FGA,FG%,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,1,11/7/2022,0,Northern Arizona,W,73,55,29,65,0.446,...,6,8,0.75,10,42,17,6,6,9,16
1,2,11/11/2022,N,Gonzaga,L,63,64,21,52,0.404,...,18,28,0.643,6,28,11,9,1,15,22
2,3,11/15/2022,N,Kentucky,W (2 OT),86,77,28,69,0.406,...,23,27,0.852,12,40,19,6,3,18,19
3,4,11/18/2022,0,Villanova,W,73,71,28,54,0.519,...,4,6,0.667,5,29,22,4,1,12,16
4,5,11/24/2022,N,Alabama,L,70,81,24,62,0.387,...,18,25,0.72,10,36,9,5,5,11,22
5,6,11/26/2022,N,Oregon,W,74,70,27,61,0.443,...,11,12,0.917,10,32,20,2,1,7,12
6,7,11/27/2022,@,Portland,W,78,77,31,59,0.525,...,8,11,0.727,9,27,17,1,3,8,14
7,8,11/30/2022,@,Notre Dame,L,52,70,21,54,0.389,...,6,6,1.0,2,28,8,4,2,7,12
8,9,12/4/2022,0,Northwestern,L,63,70,23,55,0.418,...,9,12,0.75,6,27,16,7,4,13,17
9,10,12/7/2022,@,Penn State,W,67,58,26,60,0.433,...,9,11,0.818,4,38,13,2,3,8,14


In [7]:
teamData = load_data()
teamData.head()

Unnamed: 0,G,Date,Court,Opp,W/L,Tm,Opp.1,FG,FGA,FG%,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,1,11/7/2022,0,Northern Arizona,W,73,55,29,65,0.446,...,6,8,0.75,10,42,17,6,6,9,16
1,2,11/11/2022,N,Gonzaga,L,63,64,21,52,0.404,...,18,28,0.643,6,28,11,9,1,15,22
2,3,11/15/2022,N,Kentucky,W (2 OT),86,77,28,69,0.406,...,23,27,0.852,12,40,19,6,3,18,19
3,4,11/18/2022,0,Villanova,W,73,71,28,54,0.519,...,4,6,0.667,5,29,22,4,1,12,16
4,5,11/24/2022,N,Alabama,L,70,81,24,62,0.387,...,18,25,0.72,10,36,9,5,5,11,22


In [35]:
features = teamData.drop("Tm",axis=1)
features.drop(columns=['Opp','W/L','G','Date','Court'],inplace=True)

labels = teamData["Tm"].copy()

In [38]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = "median")),
    ('std_scaler', StandardScaler())
])
featuresScaled = num_pipeline.fit_transform(features)

train_features, test_features, train_labels, test_labels = train_test_split(
    featuresScaled, labels, test_size=0.2, random_state=42
)

In [39]:
# Fit the RandomForestRegressor
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(train_features, train_labels)

# Make predictions on the test set
predictions = rf_reg.predict(test_features)

In [42]:

print("Predictions:", predictions)
print("Labels:", list(test_labels))

mse = mean_squared_error(test_labels, predictions)
rmse = np.sqrt(mse)
print("MSE:", rmse)


Predictions: [72.01 73.87 66.17 63.89 66.15]
Labels: [73, 74, 63, 63, 69]
MSE: 1.9980240238795912


In [45]:
pred1 = [[70,23,55,0.418,8,22,0.364,9,12,0.75,6,27,16,7,4,13,17]]
print(rf_reg.predict(pred1))


[83.32]
