In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import seaborn as sns
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import os
# Recursively list every file under the Kaggle input mount, one full path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# College Football Analysis

The college football season is upon us once again - and with it, its peculiarities. Unlike the NFL which is geared towards equality, the CFB season possesses one of the most recurrent hierarchies in football, as solid recruiting classes deliver results and more publicity, thereby giving rise to the creation of future top-tier recruiting classes. In the long run, this creates dynasties like Clemson, Alabama and Ohio State.

But what does all this performance look like in terms of the number of yards gained or lost by each school, and which offensive and defensive metrics are the best predictors of in-game success? Let's look through the dataset to find out more.

Firstly, we need to do some cleaning of the data. The following actions are done on the dataset:
1. Read the file
2. Split the team and conference name
3. Create new fields, 'Pass.Percent' and 'Rush.Percent', to determine the proportion of plays of each type
4. Create 'Win.Percent' (wins divided by games played), to even out the success of teams as different conferences play a different number of games.

In [None]:
# Build `cfb_data`: one row per team-season, assembled from every CSV in `path`,
# with Season, Conference and the derived Pass.Percent / Rush.Percent /
# Win.Percent columns added.
path = '../input/college-football-team-stats-2019'

# glob.glob returns matches in arbitrary order; sort them so the row order of
# cfb_data -- and with it every seeded train/test split later on -- is reproducible.
files = sorted(glob.glob(path+'/*.csv'))
if not files:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError('No CSV files found under ' + path)

cfb = []
for file in files:
    x = pd.read_csv(file, index_col = None, header = 0)

    # The two-digit season is read from characters 3-4 of the file stem
    # (presumably names such as 'cfb13.csv' -- confirm against the dataset).
    # Taking the basename keeps this working however deep `path` is.
    stem = os.path.basename(file).split('.')[0]
    x['Season'] = int('20' + stem[3:5])

    # 'Team' holds "<name> (<conference>)". Split on the LAST '(' so that a team
    # name which itself contains parentheses keeps its full name and still gets
    # the trailing group as its conference; strip the leftover ')' and spaces.
    name_parts = x['Team'].str.rsplit('(', n = 1, expand = True)
    x['Conference'] = name_parts[1].str.strip(' )')
    x['Team'] = name_parts[0].str.strip()
    cfb.append(x)

# ignore_index gives a clean 0..n-1 index without leaving the per-file row
# numbers behind as a stray 'index' column.
cfb_data = pd.concat(cfb, ignore_index = True)

# Fix the misspelt column name shipped in the source files.
cfb_data = cfb_data.rename(columns ={'Feild.Goals':'Field.Goals'})

# Share of offensive plays that were passes / rushes.
total_plays = cfb_data['Pass.Attempts'] + cfb_data['Rush.Attempts']
cfb_data['Pass.Percent'] = cfb_data['Pass.Attempts'] / total_plays
cfb_data['Rush.Percent'] = cfb_data['Rush.Attempts'] / total_plays

# Wins per game played, so seasons of different length are comparable.
cfb_data['Win.Percent'] = cfb_data['Win'] / cfb_data['Games']
cfb_data.head()

Let's open up with showing how the general success of offenses, measured through Offensive Yards per Play, translates to the number of offensive touchdowns scored and by extension the offensive rank. 

The hue of the scatterplot dictates the offensive rank of a team. Feel free to hover over each point to see the details relating to individual teams.

In [None]:
# Offensive yards per play against offensive touchdowns, one marker per
# team-season, coloured by offensive rank; hovering shows "<team> <season>".
plt.style.use('fivethirtyeight')

offense_hover = cfb_data['Team'] + " " + cfb_data['Season'].astype(str)
offense_points = go.Scatter(
    x = cfb_data['Off.Yards.Play'],
    y = cfb_data['Off.TDs'],
    mode = 'markers',
    marker = dict(color = cfb_data['Off.Rank']),
    text = offense_hover,
)

fig = go.Figure(data = [offense_points])
fig.update_layout(title='College Football Offenses')
fig.update_xaxes(title ='Offensive Yards per Play')
fig.update_yaxes(title = 'Offensive Touchdowns')
fig.show()

Looking into the data, there are actually several peculiarities. The outliers with the highest number of offensive yards per play are the Oklahoma teams of the 2017-19 seasons, with a pass-happy 'Air Raid' offense led by Baker Mayfield, Kyler Murray and Jalen Hurts. It's well known that passing garners more offensive yards per play than rushing (more on that later), and this explains the three Oklahoma teams at the extreme right side of the graph.

Having the most touchdowns is the LSU 2019 team, led by Joe Burrow and featuring many players taken in the first rounds of the NFL draft, such as Justin Jefferson. Notably, the 2013, 2014 and 2019 Ohio State teams have relatively high offensive touchdowns despite not having as high offensive yards per play.

In [None]:
# Overlay passing and rushing efficiency on one chart: yards per attempt on x,
# touchdowns on y, both traces coloured by offensive rank. The hover text ends
# in "Passing" or "Rushing" so the two traces can be told apart.
team_season = cfb_data['Team'] + " " + cfb_data['Season'].astype(str)

# (x column, y column, hover suffix) for each trace, in drawing order.
offense_traces = [
    ('Pass.Yards.Attempt', 'Pass.Touchdowns', "Passing"),
    ('Yards.Rush', 'Rushing.TD', "Rushing"),
]

fig = go.Figure()
for x_column, y_column, play_type in offense_traces:
    fig.add_trace(go.Scatter(
        x = cfb_data[x_column],
        y = cfb_data[y_column],
        mode = 'markers',
        marker = dict(color = cfb_data['Off.Rank']),
        showlegend = False,
        text = team_season + " " + play_type,
    ))

fig.update_layout(title='Rushing and Passing Offensive Stats')
fig.update_xaxes(title = 'Yards per Attempt')
fig.update_yaxes(title = 'Touchdowns')
fig.show()

# Rushing and Passing Offensive Stats

Breaking down the offensive statistics even further into rushing and passing yards, we are able to see this division. Ohio State teams have a relatively high number of rush yards per attempt, as they have been utilising a lot of dual-threat quarterbacks who are able to both rush and pass.

Notably, the service academies have the lowest number of passing touchdowns, but have the highest number of yards per attempt. This is the result of them normally running the triple option offense, a run-heavy offense, so that any attempts at passing are able to take the opponent by surprise and result in a large yardage gain. There is an [excellent article](https://www.sbnation.com/college-football/2016/12/10/13863464/army-navy-triple-option-offense-flexbone) on SBnation explaining why the service academies run such an offense.

The usual suspects show up for pass-heavy offenses, with the LSU team of 2019 led by Joe Burrow, Ja'Marr Chase and Justin Jefferson, the 2018, 2019 and 2020 Alabama teams with Tua Tagovailoa, Jalen Hurts and Mac Jones, and the Oklahoma teams mentioned earlier.

In [None]:
# Pass share against rush share for every team-season. Marker size is the
# reciprocal of the offensive rank (lower rank numbers give bigger markers) and
# marker colour is the number of offensive touchdowns.
size = cfb_data['Off.Rank']
inverse = list(map(lambda rank: 1/rank, size))

bubble_style = dict(
    size = inverse,
    color = cfb_data['Off.TDs'],
    # Scaling factor plotly applies to the marker sizes.
    sizeref = 2.*max(cfb_data['Off.Rank'])/(100**2),
)
bubble_trace = go.Scatter(
    x = cfb_data['Pass.Percent'],
    y = cfb_data['Rush.Percent'],
    mode = 'markers',
    marker = bubble_style,
    text = cfb_data['Team'] + " " + cfb_data['Season'].astype(str),
)

fig = go.Figure(data = [bubble_trace])
fig.update_layout(title ='Pass and Rush Percentages')
fig.update_xaxes(title = 'Pass Percentage')
fig.update_yaxes(title = 'Rush Percentage')
fig.show()

# Do pass and rush percentages have some correlation with success?

The size of each plot is inversely correlated to the offensive rank (i.e. the No 1 ranked offenses will have larger plots), while the colour is correlated to the number of offensive touchdowns. Somehow, Kent St had the #1 ranked offense through 12/27/2020 for the 2020 season; as they only played 4 games due to Covid, the number of touchdowns is much lower. In fact, all the dark-coloured teams that have large bubbles on the graph had such seasons in 2020.

It should be noted that highly successful offenses largely have a pass percentage between **0.35 and 0.45** and a rush percentage between **0.55 and 0.65**. The notable outliers are LSU in 2019, led by the play of Joe Burrow, and Texas Tech in 2016, which was led by Kliff Kingsbury's Air Raid and Patrick Mahomes, who we now know to be one of the best quarterbacks in the NFL.

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

# Offensive predictors and the win-percentage target.
cfb_analysis = cfb_data[['Pass.Attempts','Pass.Completions','Interceptions.Thrown.x','Pass.Yards.Attempt','Rush.Attempts','Yards.Rush','Fumbles.Lost']]
y=cfb_data['Win.Percent']

# Split BEFORE scaling. Fitting the scaler on every row would let the held-out
# rows influence the mean/variance used to transform the training data (data
# leakage). The same test_size and random_state select the same rows as
# splitting the pre-scaled matrix did.
features_train, features_test, y_train, y_test = train_test_split(cfb_analysis, y, test_size = .3, random_state = 123)

# Learn the scaling from the training rows only, then apply it everywhere.
ss = preprocessing.StandardScaler()
x_train = ss.fit_transform(features_train)
x_test = ss.transform(features_test)

# Full scaled matrix, kept so the name `x` stays available to later cells.
x = ss.transform(cfb_analysis)


# How well can we predict a team's offensive success using certain metrics?

We will conduct a linear regression amongst selected metrics such as the number of pass completions and the fumbles lost in order to solve this problem. As these metrics have a large variety of absolute values - if any team has the same number of lost fumbles as that of pass attempts, something is very wrong with the team - we have to first utilise the StandardScaler in order to standardise the values. 

Our target variable will be the **win percentage**, the variable that we had created earlier in order to account for teams playing different numbers of games. Our evaluation metric will be the mean squared error (MSE).

In [None]:
# Fit ordinary least squares on the offensive training split and report the
# mean squared error on the held-out split.
lr = LinearRegression()
lr.fit(x_train, y_train)
predictions = lr.predict(x_test)
# scikit-learn metrics are documented as (y_true, y_pred). MSE is symmetric so
# the value is unchanged, but the documented order is the safe habit for
# metrics that are not symmetric.
error = mean_squared_error(y_test, predictions)
print(error)

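An MSE of 0.02297 corresponds to a root-mean-squared error of about 0.15, i.e. the predicted win percentage is typically off by roughly 15 percentage points. This is definitely not negligible, but it can be considered to be rather good given that the mean win percentage is 0.500. This model is decent at predicting win percentage based on offensive stats - but which features contributed the most to the model?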

In [None]:
# Horizontal bar chart of the fitted linear-regression coefficients, one bar
# per offensive feature. The inputs were standardised, so the bars are on a
# comparable scale.
importances = lr.coef_
feat_importances = pd.Series(data = importances, index = cfb_analysis.columns)
feat_importances.plot.barh()

In this graph, we are able to see the most significant factors of success and their relative effects on win percentage (positive or negative). The following conclusions can be drawn:

**1. Pass numbers are more significant than the success of passes or the rushing stats**

The most significant two factors are pass completions and pass attempts; in contrast, the yards per pass and the yards per rush have about the same significance in the model. This might be explained by the fact that teams that are able to complete passes are able to get more first downs.

**2. Pass attempts are inversely correlated with win percentage.**

As expected, fumbles and interceptions have a negative effect on winning. However, passing attempts as a negative indicator seems rather counterintuitive, but makes sense upon further thinking. Teams that attempt more passes may be stuck in longer-yardage situations that take away the possibility of rushing, or may be trying to move the ball quickly in order to make up for a deficit. 

**3. The best metric may not have been included in the model.**

The best metric may have been the number of pass completions per attempts. This would dictate the efficiency of the offense to a greater extent as a 'connected' metric between the two metrics which have the greatest bearing on win percentage.

# Are we able to achieve better results using lasso regression?

Lasso regression (another excellent article can be found [here](https://towardsdatascience.com/ridge-and-lasso-regression-a-complete-guide-with-python-scikit-learn-e20e34bcbf0b)) is a type of linear regression that adds an L1 penalty on the absolute size of the coefficients to the usual squared-error loss. The penalty incentivises the usage of models with fewer parameters by shrinking weak coefficients all the way to zero, while the squared-error term still penalises large errors to a greater extent than it does small ones. The objective of lasso is to prevent overfitting, and it often provides a lower level of MSE for the test set while being able to provide a less complex model. 

By using a range of alpha in lasso, we will plot out the MSE in order to find out the optimal lasso value for a good performance of the model - too low, and it is similar to the linear regression; too high, and we risk simplifying the model too greatly.

In [None]:
# Sweep the lasso penalty strength and record the test-set MSE for each value,
# then plot MSE against alpha.
lerror = []
al_range = np.arange(0,0.5,0.02)

for al in al_range:
    # alpha == 0 is plain least squares. scikit-learn advises against running
    # Lasso with alpha=0 (its solver does not converge well there and warns),
    # so that single point uses LinearRegression instead.
    lasso = LinearRegression() if al == 0 else Lasso(alpha = al)
    lasso.fit(x_train, y_train)
    lassopred = lasso.predict(x_test)
    # Documented argument order is (y_true, y_pred).
    lasso_error = mean_squared_error(y_test, lassopred)
    lerror.append(lasso_error)
plt.xlabel('Lasso Alpha')
plt.ylabel('Mean Squared Error')
sns.lineplot(x=al_range, y=lerror)

# Why did lasso not provide results as expected?

Despite the presence of a small dip in MSE with a tiny lasso alpha, the MSE greatly increased after. Since a lasso with 0.0 alpha is essentially a linear regression, it suggests that the linear regression has been optimised to a large extent to reduce MSE. This can be attributed to a few reasons:

**1. Pre-selected independent variables**

In this model, I preselected independent variables instead of using all variables relating to offense in order to minimise duplication. One example of duplication would be that of rushing attempts, rushing yards, and rushing yards per attempt. It does not make sense to keep all three of these variables together, so I excluded variables with either perfect or high correlation when creating the initial linear regression model. Thus, I had inadvertently simplified the model to such an extent that lasso was not able to create substantial improvements afterwards.

**2. Low number of independent variables**

While the previous point explained why the lasso was not able to give substantial improvements, it does not explain why the MSE drastically increased. This is due to the low number of independent variables meaning that the variables that remained were potentially more significant as predictors, and dropping any one of them meant that the model was weakened as a result. 

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest on the offensive features, scored with MSE on the held-out split.
# The split criterion is left at its default, which is mean squared error in
# every scikit-learn release. Spelling it 'mse' was deprecated in 1.0 and
# rejected from 1.2, while the replacement name 'squared_error' is unknown to
# older versions.
rfro = RandomForestRegressor(n_estimators = 150, random_state =123, max_samples = 200, max_features = 'log2')
rfro.fit(x_train, y_train)
rfropredictions = rfro.predict(x_test)
# Documented argument order is (y_true, y_pred).
rfroerror = mean_squared_error(y_test, rfropredictions)
print(rfroerror)

Using the Random Forest Regressor, we get a further improvement in model performance. This is expected as decision tree-based models, such as Random Forest, can capture non-linear relationships and are therefore likely to give better performance than a simple linear regression.

# Defense Analysis

Now, it is time to do the same for defence - to collate the average performance of defences and find out which predictors are most important for a good defence. First, we do a general analysis of the defences.

In [None]:
# Yards per play allowed against offensive touchdowns allowed, one marker per
# team-season, coloured by defensive rank; hovering shows "<team> <season>".
defense_hover = cfb_data['Team'] + " " + cfb_data['Season'].astype(str)
defense_points = go.Scatter(
    x = cfb_data['Yards.Play.Allowed'],
    y = cfb_data['Off.TDs.Allowed'],
    mode = 'markers',
    marker = dict(color = cfb_data['Def.Rank']),
    text = defense_hover,
)

fig = go.Figure(data = [defense_points])
fig.update_layout(title='College Football Defenses')
fig.update_xaxes(title = 'Yards per Play Allowed')
fig.update_yaxes(title = 'Offensive Touchdowns Allowed')
fig.show()

As seen, there are the normal powerhouses - Clemson and Alabama which allow both very few yards per play and have a low number of offensive TDs allowed. However, the nature of defence as being something that varies from year to year can be seen very clearly in the number of 'one-hit wonders' in the bottom quadrant of the graph, such as Boston College or Mississippi State. This agrees with the general consensus that defensive production is less replicable than offensive production.

Notably, there are some defences with a high Yards per Play allowed but very few offensive touchdowns allowed, such as Miami in 2020. This might be due to certain outlying games - the Miami defence allowing over 500 yards in the season ender of 2020 being one of them, or that colleges like these implemented a 'bend-not-break' defence, where in critical (redzone) situations, they restrict offenses to field goals/turnovers rather than giving up touchdowns.

In [None]:
# Overlay pass defence and rush defence on one chart: yards per attempt allowed
# on x, touchdowns allowed on y, both traces coloured by defensive rank. The
# hover text ends in "Opponents Passing" or "Opponents Rushing".
team_season = cfb_data['Team'] + " " + cfb_data['Season'].astype(str)

# (x column, y column, hover suffix) for each trace, in drawing order. Both
# suffixes are joined with a single space; the rushing label previously came
# out with a double space.
defense_traces = [
    ('Yards.Attempt.Allowed', 'Opp.Pass.TDs.Allowed', "Opponents Passing"),
    ('Yds.Rush.Allowed', 'Opp.Rush.Touchdowns.Allowed', "Opponents Rushing"),
]

fig = go.Figure()
for x_column, y_column, play_type in defense_traces:
    fig.add_trace(go.Scatter(x=cfb_data[x_column],y = cfb_data[y_column],
                             mode = 'markers',
                             marker_color = cfb_data['Def.Rank'],
                             showlegend = False,
                             text = team_season + " " + play_type))
fig.update_layout(title='Rushing and Passing Defensive Stats')
fig.update_xaxes(title = 'Yards per Attempt Allowed')
fig.update_yaxes(title = 'Touchdowns Allowed')
fig.show()

The above graph shows the differing performance in pass and rush defense. 

# How well can we predict win percentage based on defensive indicators?

Various metrics such as tackles for loss, sacks, interceptions and fumbles were chosen for this model. As with the offensive model, we will be using Linear Regression, Lasso and Random Forests.

In [None]:
# Defensive predictors; the target `y` (win percentage) comes from the
# offensive analysis earlier in the notebook.
defense = cfb_data[['Yards.Attempt.Allowed','Yds.Rush.Allowed','Average.Sacks.per.Game','Tackle.For.Loss.Per.Game','Fumbles.Recovered','Opponents.Intercepted']]

# Split BEFORE scaling so the held-out rows cannot influence the scaler's
# mean/variance (data leakage). The same test_size and random_state select the
# same rows as splitting the pre-scaled matrix did.
defense_train, defense_test, ydef_train, ydef_test = train_test_split(defense, y, test_size = 0.3, random_state = 123)

# Learn the scaling from the training rows only, then apply it everywhere.
ssdef = preprocessing.StandardScaler()
xdef_train = ssdef.fit_transform(defense_train)
xdef_test = ssdef.transform(defense_test)

# Full scaled matrix, kept so the name `xdef` stays available to later cells.
xdef = ssdef.transform(defense)

In [None]:
# Fit ordinary least squares on the defensive training split and report the
# mean squared error on the held-out split.
lrdef = LinearRegression()
lrdef.fit(xdef_train, ydef_train)
predictionsdef = lrdef.predict(xdef_test)
# scikit-learn metrics are documented as (y_true, y_pred). MSE is symmetric so
# the value is unchanged, but the documented order is the safe habit.
errordef = mean_squared_error(ydef_test, predictionsdef)
print(errordef)

In [None]:
# Horizontal bar chart of the fitted linear-regression coefficients, one bar
# per defensive feature.
importancesd = lrdef.coef_
feat_importancesd = pd.Series(data = importancesd, index = defense.columns)
feat_importancesd.plot.barh()

We can conclude the following from the results:

**1. The average sacks per game metric was an offensive rather than defensive indicator.**

It is quite inconceivable that the sacks metric would have a negative correlation with that of win percentage, as sacks normally push an opposing quarterback further from the first-down marker and the endzone. The logical conclusion is that this was a measure of the sacks **on** the team's quarterback, rather than a sack **of** the opposing team's quarterback.

Due to the lack of a suitable sack-related indicator within the dataset, I chose to keep it in the model. There is code below showing the result if the sacks metric is taken out. Of course, as sacks are critical to the progress of a game, there is an increase in MSE when the sacks metric is removed.

**2. Interceptions are the most critical individual play in determining win percentage**

This was expected as interceptions are always live and result in a turnover of the ball. Unlike fumbles, which are live but, due to the large number of people in the vicinity, usually result in the recovering team being downed by contact, and tackles for loss, which end the play, interceptions can result in a solid gain of yardage in addition to recovering the ball.

**3. Consistent defensive performance is more critical than 'splash' plays**

Taking out sacks, the two most critical indicators are that of the passing yardage and rushing yardage allowed (as expected). While the impact of plays that result in interceptions is greater on an individual level as compared to forcing an incompletion, it is more important to hold an opposing offense to lower levels of yardage.

Below, we will use the Lasso to find if any improvements can be made.

In [None]:
# Sweep the lasso penalty strength on the defensive features and record the
# test-set MSE for each value, then plot MSE against alpha.
lerrordef = []
al_rangedef = np.arange(0,1,0.02)

for al in al_rangedef:
    # alpha == 0 is plain least squares. scikit-learn advises against running
    # Lasso with alpha=0 (its solver does not converge well there and warns),
    # so that single point uses LinearRegression instead.
    lassodef = LinearRegression() if al == 0 else Lasso(alpha = al)
    lassodef.fit(xdef_train, ydef_train)
    lassodefpred = lassodef.predict(xdef_test)
    # Documented argument order is (y_true, y_pred).
    lassodef_error = mean_squared_error(ydef_test, lassodefpred)
    lerrordef.append(lassodef_error)
plt.xlabel('Lasso Alpha')
plt.ylabel('Mean Squared Error')
sns.lineplot(x=al_rangedef, y=lerrordef)

Lasso does not give us improved performance, likely due to the same reasons as mentioned earlier in the offensive analysis section. Now, we will try to improve on the model with Random Forests.

In [None]:

# Random forest on the defensive features, scored with MSE on the held-out split.
# The split criterion is left at its default, which is mean squared error in
# every scikit-learn release. Spelling it 'mse' was deprecated in 1.0 and
# rejected from 1.2, while the replacement name 'squared_error' is unknown to
# older versions.
rfr = RandomForestRegressor(n_estimators = 150, random_state =123, max_samples = 150, max_features = 'log2')
rfr.fit(xdef_train, ydef_train)
rfrpredictions = rfr.predict(xdef_test)
# Documented argument order is (y_true, y_pred).
rfrerror = mean_squared_error(ydef_test, rfrpredictions)
print(rfrerror)

# Why did Random Forests not work?

Surprisingly, Random Forests does not give a better performance than that of linear regression. We need to find out why, as decision trees' performance is usually higher than that of a 'vanilla' linear regression. First, we will look at the feature importances of each item within the Random Forest model.

In [None]:
# Horizontal bar chart of the random forest's impurity-based feature
# importances, one bar per defensive feature.
rfrimportancesd = rfr.feature_importances_
# Plot the forest's own importances. This cell previously passed `importancesd`
# (the linear-regression coefficients), so the chart was a copy of the earlier
# coefficient plot.
# NOTE(review): the markdown that follows this cell was written against that
# duplicated chart; re-check its conclusion once this cell has been re-run.
rfrfeat_importancesd = pd.Series(rfrimportancesd, index=defense.columns)
rfrfeat_importancesd.plot(kind='barh')

The Random Forest model ascribes similar importances to each feature as that of the linear regression. Thus, it is unlikely that the feature weights are the source of poorer performance. Rather, it is possible that the nature of the data would have resulted in Random Forests performing worse - but we first need to understand how Random Forests work.

![](https://scikit-learn.org/stable/_images/sphx_glr_plot_iris_dtc_0021.png)

As shown by this picture from scikit-learn (source: https://scikit-learn.org/stable/_images/sphx_glr_plot_iris_dtc_0021.png), each decision tree in a Random Forest makes a decision based on a certain threshold value (if x is larger than 2, then we will follow branch A; if not, we will follow branch B). This has certain drawbacks, as it does not react well to data with many outliers present in the test set: a tree can only predict values within the range it saw during training. Random Forests are not as capable of extrapolation of data as compared to a linear regression as a result.

In [None]:
# 3x2 grid of probability-normalised histograms, one per defensive feature.
# Each entry is (column, x-axis title, subplot row, subplot column).
histogram_panels = [
    ('Opponents.Intercepted', 'Normalized Interceptions', 1, 1),
    ('Fumbles.Recovered', 'Normalized Fumbles Recovered', 1, 2),
    ('Tackle.For.Loss.Per.Game', 'Normalized Tackles for Loss', 2, 1),
    ('Average.Sacks.per.Game', 'Normalized Average Sacks', 2, 2),
    ('Yds.Rush.Allowed', 'Normalized Yards per Rush Allowed', 3, 1),
    ('Yards.Attempt.Allowed', 'Normalized Pass Yards per Attempt Allowed', 3, 2),
]

fig = make_subplots(rows = 3, cols = 2)

# Draw every histogram first, in the same order as the panel list.
for column, axis_title, panel_row, panel_col in histogram_panels:
    histogram = go.Histogram(x = defense[column], histnorm = 'probability')
    fig.add_trace(histogram, row = panel_row, col = panel_col)

fig.update_layout(title='Independent Variables in Defense Analysis')

# Then label the x axis of each panel.
for column, axis_title, panel_row, panel_col in histogram_panels:
    fig.update_xaxes(title = axis_title, row = panel_row, col = panel_col)

fig.show()

As seen from the subplots of data, while the Yards per Rush generally follows a normal distribution, visually it is obvious that the pass yards per attempt allowed metric does not really resemble that of a normal distribution - implying the presence of substantial outliers which lie far from the central peak. Given that the pass yards per attempt metric was one of the most significant metrics in the model, it follows that Random Forests would be impaired in their ability to draw conclusions from the data as a result.

Below are the MSEs from the linear regression and random forests without the sacks metric included. As the yards per attempt metric is still present, Random Forests fail to provide better performance than the ordinary linear regression.

In [None]:
# Same defensive linear regression as before, but without the sacks metric.
defense2 = cfb_data[['Yards.Attempt.Allowed','Yds.Rush.Allowed','Tackle.For.Loss.Per.Game','Fumbles.Recovered','Opponents.Intercepted']]

# Split BEFORE scaling so the held-out rows cannot influence the scaler's
# mean/variance (data leakage). The same test_size and random_state select the
# same rows as splitting the pre-scaled matrix did.
defense_train2, defense_test2, ydef_train2, ydef_test2 = train_test_split(defense2, y, test_size = 0.3, random_state = 123)

# Learn the scaling from the training rows only, then apply it everywhere.
ssdef2 = preprocessing.StandardScaler()
xdef_train2 = ssdef2.fit_transform(defense_train2)
xdef_test2 = ssdef2.transform(defense_test2)

# Full scaled matrix, kept so the name `xdef2` stays available.
xdef2 = ssdef2.transform(defense2)

lrdef2 = LinearRegression()
lrdef2.fit(xdef_train2, ydef_train2)
predictionsdef2 = lrdef2.predict(xdef_test2)
# Documented argument order is (y_true, y_pred).
errordef2 = mean_squared_error(ydef_test2, predictionsdef2)
print(errordef2)

In [None]:
# Random forest on the defensive features without sacks, scored with MSE on the
# held-out split. The split criterion is left at its default, which is mean
# squared error in every scikit-learn release. Spelling it 'mse' was deprecated
# in 1.0 and rejected from 1.2, while 'squared_error' is unknown to older versions.
rfr2 = RandomForestRegressor(n_estimators = 150, random_state =123, max_samples = 150, max_features = 'log2')
rfr2.fit(xdef_train2, ydef_train2)
rfrpredictions2 = rfr2.predict(xdef_test2)
# Documented argument order is (y_true, y_pred).
rfrerror2 = mean_squared_error(ydef_test2, rfrpredictions2)
print(rfrerror2)

Thank you, and I hope you liked the notebook!