NFL Big Data Bowl 2022 

Andrew Curtis Project

Effect of weight and height of Kick Returners on Return Yardage

In [None]:
!pip install pyspark
!pip install -U -q PyDrive 
!apt install openjdk-8-jdk-headless -qq --yes
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [None]:
import numpy as np
from pyspark import SparkContext, SparkFiles
from pyspark.sql import SparkSession
import string
import matplotlib.pyplot as plt
from pyspark.sql.functions import split
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

# Create (or reuse) the notebook's Spark session: master "local", app name "NFL".
spark = (
    SparkSession.builder
    .master("local")
    .appName("NFL")
    .getOrCreate()
)


In [None]:
# Load the player and play tables. The first CSV row is used as the header;
# no schema inference is requested here.
players_csv = '../input/nfl-big-data-bowl-2022/players.csv'
plays_csv = '../input/nfl-big-data-bowl-2022/plays.csv'
playerData = spark.read.csv(players_csv, header=True)
playData = spark.read.csv(plays_csv, header=True)

Data Pre-processing and Cleaning

In [None]:
#filter for only kick return plays and non-null returners
# Play-level columns that the rest of the notebook never reads.
unused_play_cols = [
    "gameId", "playId", "quarter", "possessionTeam", "yardlineSide",
    "yardlineNumber", "gameClock", "penaltyJerseyNumbers", "preSnapHomeScore",
    "preSnapVisitorScore", "passResult", "absoluteYardlineNumber",
]
returnData = (
    playData
    .filter(playData.kickReturnYardage != 'NA')
    .filter(playData.returnerId != 'NA')
    .drop(*unused_play_cols)
    .withColumnRenamed('returnerId', 'nflId')  #to match player IDs
)

reducedPlayerData = playerData.drop("birthDate", "collegeName", "Position", "displayName") #data cleaning

#merge data sets
# Join on the column *name* so the result has a single nflId column. Joining on the
# expression returnData.nflId == reducedPlayerData.nflId keeps both copies, which
# makes any later reference to "nflId" ambiguous.
# NOTE(review): this is an inner join, so plays whose returnerId does not equal any
# players.nflId value are dropped.
merged3 = returnData.join(reducedPlayerData, on='nflId', how='inner')
# Keep only rows where both physical measurements are present.
merged3 = merged3.filter((col('weight') != 'NA') & (col('height') != 'NA'))
merged3.show(5)

In [None]:
#pre-processing for height and weight
# Split the "height" string on "-" and store the first and second pieces as
# integer columns; weight and kickReturnYardage are cast to integers as well.
# NOTE(review): a missing or non-numeric piece presumably becomes null -- confirm.
height_pieces = split(col("height"), "-")
int_type = IntegerType()
new_height = (
    merged3
    .withColumn("height_feet", height_pieces.getItem(0).cast(int_type))
    .withColumn("height_inch", height_pieces.getItem(1).cast(int_type))
    .withColumn("weight", col("weight").cast(int_type))
    .withColumn("kickReturnYardage", col("kickReturnYardage").cast(int_type))
)

new_height.show(5)

In [None]:
#more height and weight pre-processing
# Convert whole-feet values (4-7) in height_feet to inches in a single pass;
# any other value in that column is left as it is.
feet_to_inches = {4: 48, 5: 60, 6: 72, 7: 84}
new_height = new_height.replace(feet_to_inches, subset=["height_feet"])
# Null inch components become 0 so the sum below is always defined for them.
new_height = new_height.na.fill(0, subset=["height_inch"])

#final cleaned data for use
# totalHeight = height_feet (now in inches) + height_inch
total_height = col("height_feet") + col("height_inch")
fixedData = new_height.withColumn("totalHeight", total_height)
fixedData.show(10)


In [None]:
#unused dataframe with just the relevant data, for display purposes
#returnYardsHeight = fixedData.drop("playDescription", "down", "yardsToGo", "specialTeamsPlayType", "specialTeamsResult", "kickerId", "nflId", "kickBlockerId", "penaltyCodes", "penaltyYards", "kickLength", "playResult", "height", "weight", "height_feet", "height_inch")

Linear Regression Models

In [None]:
#multiple linear regression model
from pyspark.ml.feature import VectorAssembler

# Combine both predictors into one vector column named 'features',
# then keep only that column and the label.
predictor_cols = ['weight', 'totalHeight']
vectorAssembler = VectorAssembler(inputCols=predictor_cols, outputCol='features')
regression_df = vectorAssembler.transform(fixedData).select(['features', 'kickReturnYardage'])
regression_df.show(3)

In [None]:
#multiple linear regression results
from pyspark.ml.regression import LinearRegression

# Fit kickReturnYardage against the assembled 'features' vector and print the fit.
lr = LinearRegression(labelCol='kickReturnYardage', featuresCol='features')
lr_model = lr.fit(regression_df)
for label, value in (("Coefficients: ", lr_model.coefficients),
                     ("Intercept: ", lr_model.intercept)):
    print(label + str(value))

In [None]:
#multiple linear regression R**2
# Training-set metrics of the model currently bound to lr_model.
trainingSummary = lr_model.summary
rmse, r_squared = trainingSummary.rootMeanSquaredError, trainingSummary.r2
print("RMSE: %f" % rmse)
print("r2: %f" % r_squared)

In [None]:
#information regarding kick returns in data set
# Summary statistics of the regression input frame.
summary_stats = regression_df.describe()
summary_stats.show()

In [None]:
#simple linear regression with weight model
from pyspark.ml.feature import VectorAssembler

# Single-predictor variant: wrap 'weight' alone in the vector column 'features2'.
vectorAssembler = VectorAssembler(inputCols=['weight'], outputCol='features2')
weight_assembled = vectorAssembler.transform(fixedData)
regression_df2 = weight_assembled.select(['features2', 'kickReturnYardage'])
regression_df2.show(3)

In [None]:
#weight linear regression results
from pyspark.ml.regression import LinearRegression

# Fit kickReturnYardage on weight only; note this rebinds lr and lr_model.
lr = LinearRegression(labelCol='kickReturnYardage', featuresCol='features2')
lr_model = lr.fit(regression_df2)
coef_text = str(lr_model.coefficients)
intercept_text = str(lr_model.intercept)
print("Coefficients: " + coef_text)
print("Intercept: " + intercept_text)

In [None]:
#weight linear regression r**2
# Reports metrics for whichever model lr_model currently refers to.
trainingSummary = lr_model.summary
rmse = trainingSummary.rootMeanSquaredError
r_squared = trainingSummary.r2
print("RMSE: %f" % rmse)
print("r2: %f" % r_squared)

In [None]:
#simple linear regression using height model
from pyspark.ml.feature import VectorAssembler

# Single-predictor variant: wrap 'totalHeight' alone in the vector column 'features3'.
height_cols = ['totalHeight']
vectorAssembler = VectorAssembler(inputCols=height_cols, outputCol='features3')
regression_df3 = (
    vectorAssembler
    .transform(fixedData)
    .select(['features3', 'kickReturnYardage'])
)
regression_df3.show(3)

In [None]:
#height linear regression results
from pyspark.ml.regression import LinearRegression

# Fit kickReturnYardage on totalHeight only; note this rebinds lr and lr_model.
lr = LinearRegression(labelCol='kickReturnYardage', featuresCol='features3')
lr_model = lr.fit(regression_df3)
fit_lines = [
    "Coefficients: " + str(lr_model.coefficients),
    "Intercept: " + str(lr_model.intercept),
]
for fit_line in fit_lines:
    print(fit_line)

In [None]:
#height linear regression r**2
# Reports metrics for whichever model lr_model currently refers to.
trainingSummary = lr_model.summary
metrics = (trainingSummary.rootMeanSquaredError, trainingSummary.r2)
print("RMSE: %f" % metrics[0])
print("r2: %f" % metrics[1])

Plotting

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

#plot for distribution of weights
# Only the weight column is plotted, so only that column is collected.
# dtype=float converts missing values (None) to NaN so the mask below can remove
# them; without it the array is either integer (NaN impossible) or object-typed
# (np.isnan raises TypeError).
x = np.array(fixedData.select('weight').collect(), dtype=float).ravel()
x = x[~np.isnan(x)]

fig, ax = plt.subplots()
ax.hist(x, bins=30, color='blue')
ax.set_xlabel('Weight (lbs)', fontsize=16)
ax.set_ylabel('counts', fontsize=16)
ax.set_title('Distribution of Weights of Kick Returners', fontsize=16)
plt.show()

In [None]:
#plot for distribution of heights
# Only the totalHeight column is plotted, so only that column is collected.
# dtype=float converts missing values (None) to NaN so the mask below can remove
# them; without it the array is either integer (NaN impossible) or object-typed
# (np.isnan raises TypeError).
x = np.array(fixedData.select('totalHeight').collect(), dtype=float).ravel()
x = x[~np.isnan(x)]

fig, ax = plt.subplots()
ax.hist(x, bins=30, color='blue')
ax.set_xlabel('Height (inches)', fontsize=16)
ax.set_ylabel('counts', fontsize=16)
ax.set_title('Distribution of Heights of Kick Returners', fontsize=16)
plt.show()

In [None]:
#plot for distribution of kick return yardages
# Only the kickReturnYardage column is plotted, so only that column is collected.
# dtype=float converts missing values (None) to NaN so the mask below can remove
# them; without it the array is either integer (NaN impossible) or object-typed
# (np.isnan raises TypeError).
y = np.array(fixedData.select('kickReturnYardage').collect(), dtype=float).ravel()
y = y[~np.isnan(y)]

fig, ax = plt.subplots()
ax.hist(y, bins=30, color='blue')
ax.set_xlabel('Kick Return Yardage', fontsize=16)
ax.set_ylabel('counts', fontsize=16)
ax.set_title('Distribution of Kick Return Yardages', fontsize=16)
plt.show()

In [None]:
#density plot of kick return yardage vs weight
# Collect both columns together so every weight stays paired with its own return
# yardage. dtype=float maps missing values (None) to NaN; reshape keeps the array
# two-dimensional even when no rows come back.
pairs = np.array(
    fixedData.select('weight', 'kickReturnYardage').collect(), dtype=float
).reshape(-1, 2)
# Drop a row when either value is missing. Filtering x and y separately would
# shift the pairing (or make the two arrays different lengths).
pairs = pairs[~np.isnan(pairs).any(axis=1)]
x, y = pairs[:, 0], pairs[:, 1]

# Calculate the point density
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)

# Sort the points by density, so that the densest points are plotted last
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]

fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=50)
ax.set_xlabel('Weight (lbs)', fontsize=16)
ax.set_ylabel('Kick Return Yardage', fontsize=16)
ax.set_title('Kick Return Yardage vs. Weight', fontsize=16)
plt.show()

In [None]:
#density plot of kick return yardage vs height
# Collect both columns together so every height stays paired with its own return
# yardage. dtype=float maps missing values (None) to NaN; reshape keeps the array
# two-dimensional even when no rows come back.
pairs = np.array(
    fixedData.select('totalHeight', 'kickReturnYardage').collect(), dtype=float
).reshape(-1, 2)
# Drop a row when either value is missing. Filtering x and y separately would
# shift the pairing (or make the two arrays different lengths).
pairs = pairs[~np.isnan(pairs).any(axis=1)]
x, y = pairs[:, 0], pairs[:, 1]

# Calculate the point density
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)

# Sort the points by density, so that the densest points are plotted last
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]

fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=50)
ax.set_xlabel('Height (inches)', fontsize=16)
ax.set_ylabel('Kick Return Yardage', fontsize=16)
ax.set_title('Kick Return Yardage vs. height', fontsize=16)
plt.show()