<a href="https://colab.research.google.com/github/rafabandoni/nfl-predict/blob/main/notebooks/00_nfl_predict_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import pandas as pd

# 00. Loading and Cleaning Data

## Creating dataframes

In [67]:
# Iter over a list so we don't need to call each df individually
data_list = [
  'defense_downs',
  'defense_fumbles',
  'defense_interceptions',
  'defense_passing',
  'defense_receiving',
  'defense_rushing',
  'defense_scoring',
  'defense_tackles',
  'offense_downs',
  'offense_passing',
  'offense_receiving',
  'offense_rushing',
  'offense_scoring',
  'special-teams_field-goals',
  'special-teams_kickoff-returns',
  'special-teams_kickoffs',
  'special-teams_punt-returns',
  'special-teams_punting',
  'special-teams_scoring',
]

In [68]:
dataframe_dict = {}
for item in data_list:
  data = pd.read_csv(f'https://raw.githubusercontent.com/rafabandoni/nfl-predict/refs/heads/main/data/{item}.csv')
  dataframe_dict[item] = data

In [69]:
# Unpack dict into each of the dataframe variables
(
  defense_downs,
  defense_fumbles,
  defense_interceptions,
  defense_passing,
  defense_receiving,
  defense_rushing,
  defense_scoring,
  defense_tackles,
  offense_downs,
  offense_passing,
  offense_receiving,
  offense_rushing,
  offense_scoring,
  special_teams_field_goals,
  special_teams_kickoff_returns,
  special_teams_kickoffs,
  special_teams_punt_returns,
  special_teams_punting,
  special_teams_scoring
) = tuple(dataframe_dict.values())

In [70]:
# Test
offense_rushing.head()

Unnamed: 0,Team,Att,Rush Yds,YPC,TD,20+,40+,Lng,Rush 1st,Rush 1st%,Rush FUM,year
0,Chiefs,556,2627,4.7,13,15,4,80T,129,23.2,7,2010
1,Jets,534,2374,4.4,14,11,2,53,118,22.1,11,2010
2,Jaguars,512,2395,4.7,14,14,1,74,148,28.9,8,2010
3,Raiders,504,2494,5.0,19,27,6,71,113,22.4,9,2010
4,Falcons,497,1891,3.8,14,12,1,55,111,22.3,4,2010


## Cleaning data

### Joining dataframes

So we can work with all as one.

Let's start joining and treating all defense data.

In order to do better code and avoid repetitions, let's create a function!

**Note**: In class, let's first create the code and then the function

In [74]:
def get_df_name(df):
  name =[x for x in globals() if globals()[x] is df][0]
  return name

def change_dataframes_columns(dataframes_list):
  keep_names = ['year', 'Team']
  for df in dataframes_list:
    df_name = get_df_name(df)
    new_columns = []
    for column in df.columns:
      if not column in (keep_names):
        new_name = f'{df_name}_{column}'
      else:
        new_name = column
      new_columns.append(new_name)
    df.columns = new_columns

def merge_dataframes(dataframes_list):
  new_dataframe = dataframes_list[0].copy()
  for df in dataframes_list[1:]:
    new_dataframe = new_dataframe.merge(df,
                                        on=['year', 'Team'],
                                        how='left')
  return new_dataframe

In [75]:
defense_dfs = [
  defense_downs,
  defense_fumbles,
  defense_interceptions,
  defense_passing,
  defense_receiving,
  defense_rushing,
  defense_scoring,
  defense_tackles,
]

change_dataframes_columns(defense_dfs)
defense_dataframe = merge_dataframes(defense_dfs)

defense_dataframe.head()

Unnamed: 0,Team,defense_downs_3rd Att,defense_downs_3rd Md,defense_downs_4th Att,defense_downs_4th Md,defense_downs_Rec 1st,defense_downs_Rec 1st%,defense_downs_Rush 1st,defense_downs_Rush 1st%,defense_downs_Scrm Plys,...,defense_rushing_Rush 1st,defense_rushing_Rush 1st%,defense_rushing_Rush FUM,defense_scoring_FR TD,defense_scoring_SFTY,defense_scoring_INT TD,defense_tackles_Sck,defense_tackles_Comb,defense_tackles_Asst,defense_tackles_Solo
0,Lions,198,77,12,5,187,56.7,98,22.1,1005,...,98,22.1,8,1,1,2,44,949,246,743
1,Falcons,201,79,17,9,183,50.3,87,23.8,957,...,87,23.8,8,1,0,2,31,873,179,686
2,Rams,221,74,12,6,191,58.2,88,21.8,1017,...,88,21.8,5,0,0,0,43,861,127,770
3,Jets,219,81,6,4,169,62.8,70,17.2,979,...,70,17.2,12,0,0,3,40,835,181,633
4,Dolphins,226,84,22,8,166,57.6,84,18.8,988,...,84,18.8,4,1,1,0,39,869,156,739


In [80]:
offense_dfs = [
  offense_downs,
  offense_passing,
  offense_receiving,
  offense_rushing,
  offense_scoring,
]

change_dataframes_columns(offense_dfs)
offense_dataframe = merge_dataframes(offense_dfs)

offense_dataframe.head()

Unnamed: 0,Team,offense_downs_offense_downs_3rd Att,offense_downs_offense_downs_3rd Md,offense_downs_offense_downs_4th Att,offense_downs_offense_downs_4th Md,offense_downs_offense_downs_Rec 1st,offense_downs_offense_downs_Rec 1st%,offense_downs_offense_downs_Rush 1st,offense_downs_offense_downs_Rush 1st%,offense_downs_offense_downs_Scrm Plys,...,offense_rushing_offense_rushing_20+,offense_rushing_offense_rushing_40+,offense_rushing_offense_rushing_Lng,offense_rushing_offense_rushing_Rush 1st,offense_rushing_offense_rushing_Rush 1st%,offense_rushing_offense_rushing_Rush FUM,offense_scoring_offense_scoring_Rsh TD,offense_scoring_offense_scoring_Rec TD,offense_scoring_offense_scoring_Tot TD,offense_scoring_offense_scoring_2-PT
0,Lions,242,97,16,10,199,52.0,83,20.5,1064,...,9,2,45,83,20.5,6,11,26,41,2
1,Falcons,240,112,15,11,200,55.4,111,22.3,1097,...,12,1,55,111,22.3,4,14,28,47,2
2,Rams,235,78,15,8,179,50.6,84,19.6,1053,...,9,1,42T,84,19.6,2,9,18,27,0
3,Jets,235,93,14,5,171,59.4,118,22.1,1087,...,11,2,53,118,22.1,11,14,20,39,1
4,Dolphins,230,92,10,3,189,56.4,91,20.4,1040,...,5,2,51,91,20.4,12,8,17,26,0


In [77]:
special_teams_dfs = [
  special_teams_field_goals,
  special_teams_kickoff_returns,
  special_teams_kickoffs,
  special_teams_punt_returns,
  special_teams_punting,
  special_teams_scoring
]

change_dataframes_columns(special_teams_dfs)
special_teams_dataframe = merge_dataframes(special_teams_dfs)

special_teams_dataframe.head()

Unnamed: 0,Team,special_teams_field_goals_FGM,special_teams_field_goals_Att,special_teams_field_goals_FG %,special_teams_field_goals_1-19 > A-M,special_teams_field_goals_20-29 > A-M,special_teams_field_goals_30-39 > A-M,special_teams_field_goals_40-49 > A-M,special_teams_field_goals_50-59 > A-M,special_teams_field_goals_60+ > A-M,...,special_teams_punting_40+,special_teams_punting_Lng,special_teams_punting_Sck,special_teams_punting_SckY,data_FGM,data_FG %,data_XPM,data_XP Pct,data_KRet TD,data_PRet T
0,Rams,33,39,84.6,0_0,12_11,14_12,9_7,4_3,0_0,...,4,49,34,244,33,84.6,26,96.3,0,0
1,Raiders,33,41,80.5,0_0,8_8,14_13,12_8,7_4,0_0,...,12,73T,44,291,33,80.5,43,100.0,3,0
2,Eagles,32,38,84.2,0_0,12_12,12_10,11_9,3_1,0_0,...,15,91,49,309,32,84.2,47,100.0,0,1
3,Dolphins,30,41,73.2,1_1,9_9,5_5,18_11,6_3,2_1,...,4,57T,38,228,30,73.2,25,100.0,0,0
4,Jets,30,39,76.9,1_1,11_10,16_14,6_3,4_2,1_0,...,11,74,28,178,30,76.9,37,100.0,2,0


## Cleaning data

Now that we already have 3 main datasets, we can start cleaning data! Cleaning data goes through some process, as such:
1. Check null values
2. Drop unused columns
3. Rename columns and/or values if needed
4. Check overall data

In [82]:
# Cleaning % data
def clean_percent_data(df):
  for column in df.columns:
    if '%' in column:
      df[column] = df[column] / 100

In [88]:
# Remove special characters from columns
def remove_special_char_columns(df):
  for column in df.columns:
    new_name = column.lower().replace(' ','_').replace('%', '_perc')
    df.rename(columns={
        column : new_name
    }, inplace=True)

### Cleaning defense data

In [83]:
clean_percent_data(defense_dataframe)

In [89]:
remove_special_char_columns(defense_dataframe)

In [90]:
defense_dataframe.head()

Unnamed: 0,team,defense_downs_3rd_att,defense_downs_3rd_md,defense_downs_4th_att,defense_downs_4th_md,defense_downs_rec_1st,defense_downs_rec_1st_perc_,defense_downs_rush_1st,defense_downs_rush_1st_perc_,defense_downs_scrm_plys,...,defense_rushing_rush_1st,defense_rushing_rush_1st_perc_,defense_rushing_rush_fum,defense_scoring_fr_td,defense_scoring_sfty,defense_scoring_int_td,defense_tackles_sck,defense_tackles_comb,defense_tackles_asst,defense_tackles_solo
0,Lions,198,77,12,5,187,0.567,98,0.221,1005,...,98,0.221,8,1,1,2,44,949,246,743
1,Falcons,201,79,17,9,183,0.503,87,0.238,957,...,87,0.238,8,1,0,2,31,873,179,686
2,Rams,221,74,12,6,191,0.582,88,0.218,1017,...,88,0.218,5,0,0,0,43,861,127,770
3,Jets,219,81,6,4,169,0.628,70,0.172,979,...,70,0.172,12,0,0,3,40,835,181,633
4,Dolphins,226,84,22,8,166,0.576,84,0.188,988,...,84,0.188,4,1,1,0,39,869,156,739


In [79]:
defense_dataframe.describe()

Unnamed: 0,defense_downs_3rd Att,defense_downs_3rd Md,defense_downs_4th Att,defense_downs_4th Md,defense_downs_Rec 1st,defense_downs_Rec 1st%,defense_downs_Rush 1st,defense_downs_Rush 1st%,defense_downs_Scrm Plys,year,...,defense_rushing_Rush 1st,defense_rushing_Rush 1st%,defense_rushing_Rush FUM,defense_scoring_FR TD,defense_scoring_SFTY,defense_scoring_INT TD,defense_tackles_Sck,defense_tackles_Comb,defense_tackles_Asst,defense_tackles_Solo
count,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,...,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0
mean,212.808333,83.325,17.945833,9.10625,193.245833,54.745417,101.2875,23.170833,1036.495833,2017.0,...,101.2875,23.170833,6.58125,0.75,0.439583,1.41875,38.439583,974.177083,293.533333,684.752083
std,12.94362,9.908907,6.588183,4.146982,20.256136,3.796078,18.750216,2.860184,48.749027,4.325001,...,18.750216,2.860184,2.750149,0.888338,0.681163,1.281661,8.23763,83.856896,79.167926,50.491065
min,176.0,47.0,4.0,0.0,136.0,44.3,53.0,15.0,921.0,2010.0,...,53.0,15.0,1.0,0.0,0.0,0.0,13.0,811.0,109.0,570.0
25%,204.0,76.75,13.0,6.0,180.75,52.175,88.0,21.2,999.0,2013.0,...,88.0,21.2,5.0,0.0,0.0,0.0,33.0,912.0,234.0,648.0
50%,213.0,83.0,17.0,9.0,192.0,54.7,100.0,23.3,1033.0,2017.0,...,100.0,23.3,6.0,1.0,0.0,1.0,39.0,962.5,291.0,678.5
75%,222.0,89.25,22.0,12.0,205.25,57.3,112.25,25.1,1073.25,2021.0,...,112.25,25.1,8.0,1.0,1.0,2.0,44.0,1032.25,353.0,717.25
max,250.0,112.0,41.0,22.0,253.0,66.1,179.0,30.4,1201.0,2024.0,...,179.0,30.4,19.0,5.0,3.0,8.0,70.0,1261.0,531.0,850.0


### Cleaning offense data

In [91]:
clean_percent_data(offense_dataframe)

In [92]:
remove_special_char_columns(offense_dataframe)

In [93]:
offense_dataframe.head()

Unnamed: 0,team,offense_downs_offense_downs_3rd_att,offense_downs_offense_downs_3rd_md,offense_downs_offense_downs_4th_att,offense_downs_offense_downs_4th_md,offense_downs_offense_downs_rec_1st,offense_downs_offense_downs_rec_1st_perc_,offense_downs_offense_downs_rush_1st,offense_downs_offense_downs_rush_1st_perc_,offense_downs_offense_downs_scrm_plys,...,offense_rushing_offense_rushing_20+,offense_rushing_offense_rushing_40+,offense_rushing_offense_rushing_lng,offense_rushing_offense_rushing_rush_1st,offense_rushing_offense_rushing_rush_1st_perc_,offense_rushing_offense_rushing_rush_fum,offense_scoring_offense_scoring_rsh_td,offense_scoring_offense_scoring_rec_td,offense_scoring_offense_scoring_tot_td,offense_scoring_offense_scoring_2-pt
0,Lions,242,97,16,10,199,0.52,83,0.205,1064,...,9,2,45,83,0.205,6,11,26,41,2
1,Falcons,240,112,15,11,200,0.554,111,0.223,1097,...,12,1,55,111,0.223,4,14,28,47,2
2,Rams,235,78,15,8,179,0.506,84,0.196,1053,...,9,1,42T,84,0.196,2,9,18,27,0
3,Jets,235,93,14,5,171,0.594,118,0.221,1087,...,11,2,53,118,0.221,11,14,20,39,1
4,Dolphins,230,92,10,3,189,0.564,91,0.204,1040,...,5,2,51,91,0.204,12,8,17,26,0


In [94]:
offense_dataframe.describe()

Unnamed: 0,offense_downs_offense_downs_3rd_att,offense_downs_offense_downs_3rd_md,offense_downs_offense_downs_4th_att,offense_downs_offense_downs_4th_md,offense_downs_offense_downs_rec_1st,offense_downs_offense_downs_rec_1st_perc_,offense_downs_offense_downs_rush_1st,offense_downs_offense_downs_rush_1st_perc_,offense_downs_offense_downs_scrm_plys,year,...,offense_rushing_offense_rushing_td,offense_rushing_offense_rushing_20+,offense_rushing_offense_rushing_40+,offense_rushing_offense_rushing_rush_1st,offense_rushing_offense_rushing_rush_1st_perc_,offense_rushing_offense_rushing_rush_fum,offense_scoring_offense_scoring_rsh_td,offense_scoring_offense_scoring_rec_td,offense_scoring_offense_scoring_tot_td,offense_scoring_offense_scoring_2-pt
count,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,...,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0
mean,212.808333,83.325,17.945833,9.10625,193.245833,0.54686,101.2875,0.231104,1036.495833,2017.0,...,13.685417,10.833333,2.066667,101.2875,0.231104,6.58125,13.685417,24.79375,41.479167,1.48125
std,14.505658,11.158312,7.045423,4.290004,29.634129,0.04377,22.166212,0.0317,53.142067,4.325001,...,5.130753,4.726037,1.662472,22.166212,0.0317,2.982493,5.130753,7.594991,9.950167,1.487478
min,176.0,52.0,4.0,0.0,109.0,0.417,56.0,0.159,878.0,2010.0,...,3.0,2.0,0.0,56.0,0.159,0.0,3.0,8.0,17.0,0.0
25%,203.0,76.0,13.0,6.0,171.0,0.517,85.0,0.20875,1002.0,2013.0,...,10.0,7.0,1.0,85.0,0.20875,5.0,10.0,20.0,34.75,0.0
50%,212.0,83.0,16.5,8.0,192.0,0.546,98.0,0.23,1032.0,2017.0,...,13.0,10.0,2.0,98.0,0.23,6.0,13.0,24.0,41.0,1.0
75%,223.0,91.0,22.0,12.0,211.0,0.575,113.0,0.252,1073.0,2021.0,...,17.0,13.0,3.0,113.0,0.252,8.0,17.0,29.0,48.0,2.0
max,257.0,118.0,48.0,24.0,293.0,0.671,188.0,0.318,1191.0,2024.0,...,32.0,33.0,8.0,188.0,0.318,19.0,32.0,55.0,76.0,8.0


### Cleaning special teams data

In [95]:
clean_percent_data(special_teams_dataframe)

In [96]:
remove_special_char_columns(special_teams_dataframe)

In [97]:
special_teams_dataframe.head()

Unnamed: 0,team,special_teams_field_goals_fgm,special_teams_field_goals_att,special_teams_field_goals_fg__perc_,special_teams_field_goals_1-19_>_a-m,special_teams_field_goals_20-29_>_a-m,special_teams_field_goals_30-39_>_a-m,special_teams_field_goals_40-49_>_a-m,special_teams_field_goals_50-59_>_a-m,special_teams_field_goals_60+_>_a-m,...,special_teams_punting_40+,special_teams_punting_lng,special_teams_punting_sck,special_teams_punting_scky,data_fgm,data_fg__perc_,data_xpm,data_xp_pct,data_kret_td,data_pret_t
0,Rams,33,39,0.846,0_0,12_11,14_12,9_7,4_3,0_0,...,4,49,34,244,33,0.846,26,96.3,0,0
1,Raiders,33,41,0.805,0_0,8_8,14_13,12_8,7_4,0_0,...,12,73T,44,291,33,0.805,43,100.0,3,0
2,Eagles,32,38,0.842,0_0,12_12,12_10,11_9,3_1,0_0,...,15,91,49,309,32,0.842,47,100.0,0,1
3,Dolphins,30,41,0.732,1_1,9_9,5_5,18_11,6_3,2_1,...,4,57T,38,228,30,0.732,25,100.0,0,0
4,Jets,30,39,0.769,1_1,11_10,16_14,6_3,4_2,1_0,...,11,74,28,178,30,0.769,37,100.0,2,0


In [98]:
special_teams_dataframe.describe()

Unnamed: 0,special_teams_field_goals_fgm,special_teams_field_goals_att,special_teams_field_goals_fg__perc_,special_teams_field_goals_lng,special_teams_field_goals_fg_blk,year,special_teams_kickoff_returns_avg,special_teams_kickoff_returns_ret,special_teams_kickoff_returns_yds,special_teams_kickoff_returns_kret_td,...,special_teams_punting_20+,special_teams_punting_40+,special_teams_punting_sck,special_teams_punting_scky,data_fgm,data_fg__perc_,data_xpm,data_xp_pct,data_kret_td,data_pret_t
count,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,...,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0
mean,26.597917,31.56875,0.840829,54.333333,0.63125,2017.0,23.029583,35.395833,818.545833,0.258333,...,50.491667,8.785417,38.439583,253.839583,26.597917,0.840829,36.6625,95.895833,0.258333,0.314583
std,5.553572,5.77044,0.071474,3.449204,0.84217,4.325001,2.766705,12.503093,308.174804,0.524885,...,10.021477,3.413377,10.623785,75.569441,5.553572,0.071474,9.878735,4.461721,0.524885,0.580652
min,8.0,16.0,0.444,43.0,0.0,2010.0,14.6,9.0,145.0,0.0,...,28.0,1.0,14.0,63.0,8.0,0.444,16.0,78.8,0.0,0.0
25%,23.0,28.0,0.7985,52.0,0.0,2013.0,21.2,27.0,604.75,0.0,...,43.0,6.0,31.0,196.5,23.0,0.7985,30.0,93.6,0.0,0.0
50%,26.0,31.0,0.846,54.0,0.0,2017.0,22.9,34.0,786.0,0.0,...,51.0,9.0,38.0,246.0,26.0,0.846,36.0,97.0,0.0,0.0
75%,30.0,36.0,0.8935,56.0,1.0,2021.0,24.9,42.0,1003.25,0.0,...,57.0,11.0,46.0,304.0,30.0,0.8935,43.0,100.0,0.0,1.0
max,44.0,52.0,1.0,66.0,5.0,2024.0,32.1,84.0,2084.0,3.0,...,80.0,21.0,85.0,504.0,44.0,1.0,75.0,100.0,3.0,4.0
