This is the offline notebook that will pull the information in from csv files and peform the modeling.

In [156]:
import pandas as pd
import numpy as np

nfl_qb_df = pd.read_csv('master_nfl_qb_df_filter.csv')
college_qb_df = pd.read_csv('master_qb_df.csv')
print('NFL df shape:', nfl_qb_df.shape)
print('College df shape:', college_qb_df.shape)

NFL df shape: (174, 129)
College df shape: (1438, 54)


Now that we have the nfl and college qb df's loaded, we need to do a little data processing with the nfl dataframe to remove the non quarterbacks that are still remaining.

In [157]:
nfl_qb_df = nfl_qb_df[nfl_qb_df['qb_record'].notnull()]
print('NFL df shape:', nfl_qb_df.shape)

NFL df shape: (158, 129)


1. Need to get list of Pro-Bowl QB's and add that to the nfl qb dataframe.  
2. Then I will need a subset of qb's that have played at least 5 years, (started at least half their games), and have gone to a probowl.  
3. From there, I can then make a column that says "Success, true/false" and then send that back to the college qb df.

In [158]:
nfl_qb_df['Player']

0        Rodney Peete
2        Charlie Frye
3         Chris Simms
6       Brodie Croyle
7        Chris Weinke
            ...      
168       Nick Foles 
169    Jameis Winston
170         Matt Ryan
171       Elvis Grbac
173     Philip Rivers
Name: Player, Length: 158, dtype: object

In [159]:
# we will read in the original dataset of the NFL QB's to get the list of Pro Bowl Quarterbacks

original_nfl_qb_df = pd.read_excel('NFL_QBS_2000-2015.xlsx', sheet_name='OriginalData')

def probowl(qb_name):
    if '*' in qb_name:
        return True
    else:
        return False

original_nfl_qb_df = original_nfl_qb_df[original_nfl_qb_df['Pos'] != 'wr']
original_nfl_qb_df = original_nfl_qb_df[(original_nfl_qb_df['Pos'] == 'QB') | (original_nfl_qb_df['Pos'] == '/qb') | (original_nfl_qb_df['Pos'].isnull()) | (original_nfl_qb_df['Pos'] == 'qb')]

original_nfl_qb_df['ProBowl'] = original_nfl_qb_df['Player'].apply(probowl)


nfl_pro_bowl_qb_list = original_nfl_qb_df[original_nfl_qb_df['ProBowl'] == True]

nfl_pro_bowl_qb_list = nfl_pro_bowl_qb_list['Player'].to_list()
print(len(nfl_pro_bowl_qb_list))
nfl_pro_bowl_qb_list = [x.replace("+", "") for x in nfl_pro_bowl_qb_list]
nfl_pro_bowl_qb_list = [x.replace("*", "") for x in nfl_pro_bowl_qb_list]
nfl_pro_bowl_qb_list = list(set(nfl_pro_bowl_qb_list))

# now that we have our list, we can create our column in the nfl_qb_df that shows those quarterbacks who have been selected to a pro-bowl
pro_bowl_qb = pd.DataFrame({'name': nfl_pro_bowl_qb_list, 'Boolean': True})
nfl_qb_df['Pro Bowl Selection'] = nfl_qb_df['Player'].isin(pro_bowl_qb['name'])
nfl_qb_df['Pro Bowl Selection']





195


0      False
2      False
3      False
6      False
7      False
       ...  
168     True
169     True
170     True
171     True
173     True
Name: Pro Bowl Selection, Length: 158, dtype: bool

In [160]:
'''# Now we need a column denoting those NFL QB's who have started at least half of their games 
(datframe is already filterd to those qb's that have played at least 5 years).'''

# Successful QB column
# I believe it is safe to assume that if a qb has made the pro-bowl, they have started at least half of their
# games, so we will make the column denote 1 for those that 

nfl_qb_df['Pct_Games_Started'] = nfl_qb_df['games_started']/nfl_qb_df['games']
# nfl_qb_df['Pct_Games_Started']

def start_hurdle(percentage):
    if percentage >= 0.50: 
        return 1 
    else: 
        return 0

nfl_qb_df['Successful_QB'] = nfl_qb_df['games_started'].apply(start_hurdle)


In [161]:
# Now we can carry this list of qb's back to the college dataframe to denote those quarterbacks that were successful.

# master_qb_df.csv has the list of names under the column 'name'
# nfl_qb_df_filter has list of names under the column 'Player'

success_nfl_qb_df = nfl_qb_df[['Player', 'Successful_QB']]
print(success_nfl_qb_df.shape)

final_df = college_qb_df.merge(success_nfl_qb_df, left_on='name', right_on='Player', how='left')
final_df['Successful_QB'] = final_df['Successful_QB'].fillna(0)
final_df['Player'] = final_df['Player'].fillna(final_df['name'])



(158, 2)


In [162]:

# Delete columns containing either 35% or more than 35% NaN Values
perc = 35.0
min_count =  int(((100-perc)/100)*final_df.shape[0] + 1)
final_df = final_df.dropna( axis=1, 
                thresh=min_count)
final_df = final_df.drop(columns=['Unnamed: 0'])
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1438 entries, 0 to 1437
Data columns (total 26 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   adjusted_yards_per_attempt        1365 non-null   float64
 1   completed_passes                  1365 non-null   float64
 2   interceptions_thrown              1365 non-null   float64
 3   name                              1438 non-null   object 
 4   pass_attempts                     1365 non-null   float64
 5   passing_completion                1365 non-null   float64
 6   passing_touchdowns                1365 non-null   float64
 7   passing_yards                     1365 non-null   float64
 8   passing_yards_per_attempt         1365 non-null   float64
 9   player_id                         1438 non-null   object 
 10  plays_from_scrimmage              1381 non-null   float64
 11  points                            987 non-null    float64
 12  positi

In [167]:
# Ready for feature importance selection.
from sklearn.linear_model import LinearRegression

# need to drop columns that are something other than numbers (name, player_id, position, season, Player)
# final_df = final_df.drop(columns=['name', 'player_id', 'position', 'season', 'team_abbreviation','Player'])
# final_df = final_df.reset_index()

final_df = final_df.fillna(0)
print(final_df.info())
model = LinearRegression()
X = final_df.loc[:,final_df.columns != 'Successful_QB']
y = final_df['Successful_QB']

model.fit(X,y)
importance = model.coef_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1438 entries, 0 to 1437
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   level_0                           1438 non-null   int64  
 1   index                             1438 non-null   int64  
 2   adjusted_yards_per_attempt        1438 non-null   float64
 3   completed_passes                  1438 non-null   float64
 4   interceptions_thrown              1438 non-null   float64
 5   pass_attempts                     1438 non-null   float64
 6   passing_completion                1438 non-null   float64
 7   passing_touchdowns                1438 non-null   float64
 8   passing_yards                     1438 non-null   float64
 9   passing_yards_per_attempt         1438 non-null   float64
 10  plays_from_scrimmage              1438 non-null   float64
 11  points                            1438 non-null   float64
 12  quarte