Load in data

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the dataset and discard the 'Unnamed: 0' index column in one step.
df = pd.read_csv('no_null_df.csv').drop(columns=['Unnamed: 0'])

Separate String Variables that can't be used for Linear Regression

In [2]:
# Text-valued columns that cannot be used as numeric regression inputs.
string_variables = ['Player', 'Tm', 'Season', 'PosRank']

# Keep the text columns together in their own frame.
X_string_features = df.loc[:, string_variables]

# Everything that remains is the data used for modelling.
X = df.drop(string_variables, axis='columns')

# Show the dtype of every remaining column.
print(X.dtypes)

Age                         int64
PPR/G_prev                float64
avg_depth_of_target       float64
caught_percent            float64
grades_offense            float64
grades_pass_route         float64
route_rate                float64
slot_rate                 float64
wide_rate                 float64
inline_rate               float64
yprr                      float64
man_grades_pass_route     float64
zone_grades_pass_route    float64
grades_pass               float64
pbe                       float64
PPR/G                     float64
Pass Plays/G              float64
Receptions/G              float64
Routes/G                  float64
Targets/G                 float64
TD/G                      float64
YDS/G                     float64
YAC/G                     float64
RZ Targets/G              float64
P_Att/G                   float64
P_Cmp/G                   float64
P_TD/G                    float64
P_Yds/G                   float64
dtype: object


Test H2O Models

In [3]:
import h2o
from h2o.automl import H2OAutoML

# Start (or connect to) a local H2O cluster; building H2OFrames and running
# AutoML both require an active connection.
h2o.init()

# Split the dataset into training and testing sets. The split is seeded so a
# fresh "Restart & Run All" reproduces the same partition.
train_pd, test_pd = train_test_split(X, test_size=0.2, random_state=1)

# Convert into h2o frames (the pandas halves keep their own names so one
# variable never holds two different kinds of object).
train_df = h2o.H2OFrame(train_pd)
test_df = h2o.H2OFrame(test_pd)

# Use every column except the target as a predictor. The target is not the
# last column of X, so a positional slice such as columns[:-1] would drop
# 'P_Yds/G' and leave 'PPR/G' itself in the predictor list.
target = 'PPR/G'
predictors = [col for col in train_df.columns if col != target]

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=predictors, y=target, training_frame=train_df)

# View the AutoML Leaderboard (all rows)
lb = aml.leaderboard
lb.head(rows=lb.nrows)



ModuleNotFoundError: No module named 'h2o'

Analyze Results

In [None]:
# Show the AutoML leader model; as the cell's last expression it is rendered
# as the cell output.
aml.leader

        Actual  Predicted
521   6.800000   6.313697
941   6.176471  11.124057
741   3.542857   5.098772
980   2.511111   4.122518
411   2.700000   2.996341
..         ...        ...
332  13.568750  15.501992
208  16.337500  12.644738
992   1.436364   2.855296
78    3.388889   6.259422
29   19.166667  14.042999

[201 rows x 2 columns]
                   Feature  Coefficient
0                      Age    -0.620083
1               PPR/G_prev     0.741687
2      avg_depth_of_target     0.013502
3           caught_percent    -0.256697
4           grades_offense    -1.512350
5        grades_pass_route     2.844687
6               route_rate     0.109216
7                slot_rate    -0.311258
8                wide_rate    -0.225327
9              inline_rate     0.104785
10                    yprr    -0.496305
11   man_grades_pass_route    -0.152096
12  zone_grades_pass_route    -0.464664
13             grades_pass     0.277483
14                     pbe     0.140150
15            Pass Plays

In [None]:
# Evaluate the leader model on the held-out test frame and display its metrics.
aml.leader.model_performance(test_df)

In [None]:
# Score the held-out test frame with the leader model.
predictions = aml.leader.predict(test_df)

# Attach the predictions to the test frame so they sit next to the inputs.
test_df['Predictions'] = predictions
print(test_df)

# force=True overwrites a previous export, so this cell can be re-run
# (e.g. after Restart & Run All) without failing because the file exists.
h2o.export_file(test_df, 'leader_predictions.csv', force=True)