<a href="https://colab.research.google.com/github/michiWS1920/nfl_data/blob/master/try3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
library(readr)  # for read_csv
library(knitr)  # for kable
combine.table <- read_csv("https://raw.githubusercontent.com/michiWS1920/nfl_data/master/sources/combine.table.csv")
draft.table <- read_csv("https://raw.githubusercontent.com/michiWS1920/nfl_data/master/sources/draft.table.csv")
college.stats <- read_csv("https://raw.githubusercontent.com/michiWS1920/nfl_data/master/sources/college.stats.csv")

In [None]:
install.packages("textclean")
install.packages("feather")
install.packages("mice")
install.packages("httr")

In [None]:
library(textclean)
library(readr)
library(dplyr)
library(tidyr)
library(feather)
library(stringr)
library(mice)
library(rvest)
library(httr)

In [6]:
## swap the .rec and .rush stats in rushing section  

stat.neu<- college.stats %>%
  filter(section=="rushing") %>%
  mutate(swap(stat, "rec.", "rush.")) 

# reorder columnns
stat.neu <- stat.neu[c(1,2,5,4)] 
colnames(stat.neu)[3] <- "stat"

stat.alt = subset(college.stats, section !="rushing")
college.stats <- rbind(stat.neu, stat.alt)

In [7]:
## calculate mean of college stats

# extract the number of played games in total 
college.stats %>% group_by(url, section) %>% 
  filter(stat=="games") -> games 


left_join(college.stats, games, by = c("url","section"))-> college.stats
college.stats <- select(college.stats, -stat.y) 

# sum of college stats values / number of games
college.stats$value.y <- ((college.stats$value.x)/(college.stats$value.y))


#rename the column names
names(college.stats)[names(college.stats) == "stat.x"] <- "stat"
names(college.stats)[names(college.stats) == "value.x"] <- "value"
names(college.stats)[names(college.stats) == "value.y"] <- "value.mean"

In [8]:
# define relevant parameters per section 
#section == "passing"  = games ; attempts ; completions ; pass.intds; pass.tds; pass.yards
pass = c("games","attempts","completions","pass.intds","pass.tds","pass.yards")

#section == "rushing" = games; rec.td; rec.yards; receptions; rush.att; rush.td; rush.yds; scrim.plays; scrim.tds; scrim.yds
rush = c("games", "rec.td", "rec.yards", "receptions", "rush.att", "rush.td", "rush.yds", "scrim.plays", 
         "scrim.tds", "scrim.yds")

#section == "defense" = games; ast.tackles; fum.forced; fum.rec; fum.tds; fum.yds; int; int.td; tackles; solo.tackles; sacks; loss.tackles; pd
def = c ("games", "ast.tackles", "fum.forced", "fum.rec", "fum.tds", "fum.yds", "int", "int.td", "tackles", 
         "solo.tackles", "sacks", "loss.tackles", "pd")

#section == "receiving" = games; receptions; rec.yards; rec.td; scrim.plays; scrim.tds; scrim.yds
rec = c("games", "receptions", "rec.yards", "rec.td", "scrim.plays", "scrim.tds", "scrim.yds")



# extract relevant parameters per section 
college.stats <- filter(college.stats, (section == "passing" & stat %in% pass) | (section == "rushing" & stat %in% rush)
                        | (section == "defense" & stat %in% def) | (section == "receiving" & stat %in% rec))

In [9]:
## select draft variables and create primary key 
draft.df <- draft.table %>%
  select(player, year, round,  team, age, college, url,
         pick, # order of draft pick 
         pos, # position 
         carav,# weighted Career Approximate Value
         drav) %>% # highest number of draft AV points 
  mutate(key = ifelse(is.na(url), paste(player, year, sep = '-'), url)) %>%
  group_by(key) %>%
  # drop duplicate rows 
  mutate(appearance = row_number()) %>%
  filter(appearance == 1) %>%
  select(-appearance) %>%
  ungroup

In [10]:
combine.df <- combine.table %>%
  select(year_combine = year,
         player_combine = player,
         pos_combine = pos,
         college_combine = college,
         height, weight,
         forty, #40-yard Dash
         vertical, #Vertical Jump
         broad,#Broad Jump
         bench,#Bench Press (225lb for max reps)
         threecone,#Three Cone drill 
         shuttle,#20-yard shuttle
         url_combine = url) %>%
  mutate(key = ifelse(is.na(url_combine), paste(player_combine, year_combine, sep = '-'), url_combine)) %>%
    #rewrite feet and inches
  separate(height, c('feet', 'inches'), sep = '-', convert = TRUE) %>%
  mutate(height = feet * 12 + inches) %>%
  select(-feet, -inches) %>%
  # drop duplicate rows 
  group_by(key) %>%
  mutate(appearance = row_number()) %>%
  filter(appearance == 1) %>%
  select(-appearance) %>%
  ungroup

In [11]:
Reduce(intersect, list(unique(combine.df$key), unique(draft.df$key))) %>% head()

In [12]:
# rename combine.df$key to combine with college.stats and draft.df 
combine.df$key %>%
  str_replace_all("https://", "http://") ->combine.df$key 

In [13]:
# combine draft and combine tables by "key "
df <- full_join(draft.df, combine.df, by = 'key') %>%
  #find the non-missing player, pos, college, url 
  mutate(player = coalesce(player, player_combine), 
         pos = coalesce(pos, pos_combine),
         college = coalesce(college, college_combine),
         year = coalesce(year, year_combine),
         url = coalesce(url, url_combine)) %>%
  # drop combine column names 
  subset(select=-c(player_combine,pos_combine, college_combine, year_combine, url_combine)) 

In [14]:
# assign nan pick values to 257 and nan round number to 8
df$pick[is.na(df$pick)] <- 257
df$pick[is.na(df$round)] <- 8

In [15]:
# rename and drop columns to combine with combine and draft tables
stats.df <- college.stats %>% select(-value) %>% rename(value = value.mean, key = url)

In [16]:
## Convert into long format so we can merge with college stats
training1 <- df %>%
  select(key, carav,
         height, weight,
         forty, vertical,
         bench, age,
         threecone, shuttle,
         broad) %>%
  gather(metric, value, carav,
         height, weight,
         forty, vertical,
         bench, age,
         threecone, shuttle,
         broad) %>%
  filter(!is.na(value), value != '') %>%
  mutate(value = as.numeric(value))

In [17]:
## Impute the missing combine data
## A. Convert to wide
training1a <- training1 %>%
  spread(metric, value, fill = NA)

## B. do the imputation and add back the non-imputed columns
training1b <- complete(mice(training1a %>% select(-key, -carav)))
training1b$key <- training1a$key
training1b$carav <- training1a$carav


 iter imp variable
  1   1  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  1   2  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  1   3  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  1   4  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  1   5  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  2   1  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  2   2  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  2   3  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  2   4  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  2   5  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  3   1  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  3   2  age  bench  broad  forty  height  shuttle  threecone  vertical  weight
  3   3  age  bench 

In [18]:
## C. Convert back to long format
training1c <- training1b %>%
  gather(metric, value, -key)

## Rename some of the collge stats
## make sure we only have one stat per person
training2 <- college.stats %>%
  group_by(url, stat) %>%
  mutate(row = row_number()) %>%
  filter(row == 1) %>%
  select(-row) %>%
  ungroup %>%
  rename(key = url, metric = stat) %>%
  select (-c(section,value)) %>%
  mutate(metric = str_replace_all(metric, '[.]', '_')) %>%
  rename(value = value.mean)

## Convert back into wide form
training3 <- bind_rows(training1c, training2) %>%
  spread(metric, value, fill = 0) ## note we fill zeros, not NAs

In [50]:
## Join the pick/position/college/year/team back on
training <- df %>%
  select(key, player, pick, pos, college, year, team) %>%
  group_by(college) %>%
  inner_join(training3)

Joining, by = "key"



In [23]:
install.packages("mlbench")
install.packages("workflows")
install.packages("tune")
install.packages("tidymodels")

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘globals’, ‘hardhat’, ‘parsnip’




In [None]:
library(mlbench)
library(tidyverse)
library(workflows)
library(tune)
library(tidymodels)

In [None]:
training$pick

In [74]:
# Outcome variables
pick <- training$pick
carav <- training$carav
training$first.round <- as.factor(training$pick <= 32)

In [None]:
training$first.round

In [76]:
training %>%
  select(-key, -player, -carav, -team, -pos, -pick)  -> training.df

In [57]:
names(training.df)

In [77]:
set.seed(42) 
training_split = initial_split(training.df, prop = 3/4)

df_train = training(training_split)
df_test = testing(training_split)

df_cv = vfold_cv(df_train)

In [None]:
df_train$first.round

In [78]:
recipe(first.round ~ .,
       data=df_train) %>%
  step_string2factor(all_nominal()) %>%
  step_knnimpute(all_predictors()) %>%
  step_dummy(all_nominal(), -all_outcomes()) -> df_recipe


In [80]:
rand_forest() %>%
  set_args(mtry = tune(),
           trees = 300) %>%
  set_engine('ranger', importance = 'impurity') %>%
  set_mode('classification') -> rf_model


rf_workflow = workflow() %>%
  add_recipe(df_recipe) %>%
  add_model(rf_model)

install.packages("ranger")
library(ranger)
rf_grid = expand.grid(mtry = c(10, 12, 15))

rf_workflow %>%
  tune_grid(resamples = df_cv,
            grid = rf_grid,
            metrics = metric_set(accuracy)) -> rf_tune_results

rf_tune_results %>%
  collect_metrics()

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependency ‘RcppEigen’




mtry,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
10,accuracy,binary,0.6998924,10,0.008306141,Model1
12,accuracy,binary,0.6995469,10,0.008884911,Model2
15,accuracy,binary,0.6933634,10,0.00832667,Model3


In [83]:
rf_tune_results %>%
    select_best(metric = 'accuracy') -> param_final

rf_workflow %>%
    finalize_workflow(param_final) -> rf_workflow

In [85]:
rf_workflow %>%
    last_fit(training_split) -> rf_fit

rf_fit %>% collect_metrics()

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.6864363
roc_auc,binary,0.7467362


In [111]:
training.df %>% arrange(year) -> training.df

In [117]:
set.seed(42)
training_split = initial_time_split(training.df, prop = 4/5)

In [118]:
df_train = training(training_split)
df_test = testing(training_split)

df_cv = vfold_cv(df_train)

In [None]:
df_test

In [None]:
rand_forest() %>%
  set_args(mtry = tune(),
           trees = 300) %>%
  set_engine('ranger', importance = 'impurity') %>%
  set_mode('classification') -> rf_model


rf_workflow = workflow() %>%
  add_recipe(df_recipe) %>%
  add_model(rf_model)

#install.packages("ranger")
library(ranger)
rf_grid = expand.grid(mtry = c(10, 12, 15))

rf_workflow %>%
  tune_grid(resamples = df_cv,
            grid = rf_grid,
            metrics = metric_set(accuracy)) -> rf_tune_results

rf_tune_results %>%
  collect_metrics()

In [116]:
rf_workflow %>%
    last_fit(training_split) -> rf_fit

rf_fit %>% collect_metrics()

[31mx[39m [31m: model: Error in ranger::ranger(formula = ..y ~ ., data = data, mtry = ~t...[39m

“All models failed in [fit_resamples()]. See the `.notes` column.”


NULL