In [78]:
import numpy as np
import pandas as pd
import os

In [79]:
# Dataset identifier; reused below to build the output directory and file name.
dataset_name = "nfl_elo_game"

In [80]:
# Output location: a per-dataset folder under ../../processed, holding one CSV
# named after the dataset.
output_dir = f"./../../processed/{dataset_name}/"
outp_fname = os.path.join(output_dir, f"{dataset_name}.csv")

# Input location: the raw games CSV read by this notebook.
input_dir = "./data"
inp_fname = "nfl_games.csv"

# Read Data

In [81]:
# Load the raw games file; `date` is parsed to datetime because the month
# column is derived from it further down.
data = pd.read_csv(
    filepath_or_buffer=os.path.join(input_dir, inp_fname),
    parse_dates=["date"],
)
print(data.shape)
data.head(n=5)

(16810, 12)


Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,score2,result1
0,1920-09-26,1920,0,0,RII,STP,1503.947,1300.0,0.824651,48,0,1.0
1,1920-10-03,1920,0,0,AKR,WHE,1503.42,1300.0,0.824212,43,0,1.0
2,1920-10-03,1920,0,0,RCH,ABU,1503.42,1300.0,0.824212,10,0,1.0
3,1920-10-03,1920,0,0,DAY,COL,1493.002,1504.908,0.575819,14,0,1.0
4,1920-10-03,1920,0,0,RII,MUN,1516.108,1478.004,0.644171,45,0,1.0


In [82]:
# Column names used by this notebook: the row identifier and the prediction target.
id_col, target_col = "game_id", "score2"

# Prepare Dataset

In [83]:
# Keep only games from the 1951 season onward (everything up to 1950 is discarded).
# .copy() detaches the result from the original frame so later column edits
# operate on an independent DataFrame.
data = data.loc[data["season"].ge(1951)].copy()
data.shape

(14567, 12)

In [84]:
# Add month of year column (so we could model the "winter" effect).
# Guarded the same way as the id-column insertion further down, so that
# re-running this cell is a no-op instead of raising
# "ValueError: cannot insert month, already exists".
if "month" not in data.columns:
    data.insert(3, "month", data["date"].dt.month)

In [85]:
# Drop columns that should not appear in the processed output.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell re-runnable: a second execution no longer raises KeyError for the
# already-removed columns.
# NOTE: errors="ignore" also silences a misspelled column name, so keep this
# list in sync with the input schema.
data = data.drop(columns=["date", "result1", "elo_prob1"], errors="ignore")

In [86]:
# Preview the first five rows of the frame after the column changes above.
data.head(n=5)

Unnamed: 0,season,neutral,month,playoff,team1,team2,elo1,elo2,score1,score2
2243,1951,0,9,0,LAR,NYY,1598.581,1454.772,54,14
2244,1951,0,9,0,SF,CLE,1455.94,1661.753,24,10
2245,1951,0,9,0,DET,WSH,1487.382,1437.049,35,17
2246,1951,0,9,0,GB,CHI,1364.142,1582.697,20,31
2247,1951,0,9,0,ARI,PHI,1505.582,1584.952,14,17


# Insert Id Column

In [87]:
# Prepend a sequential identifier (0 .. N-1) as the first column, but only
# when the frame does not already carry one; this keeps the cell re-runnable.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())
# Cast the identifier column to str, whether it was just created or already present.
data[id_col] = data[id_col].astype(str)

      game_id  season  neutral  month  playoff team1 team2      elo1  \
2243        0    1951        0      9        0   LAR   NYY  1598.581   
2244        1    1951        0      9        0    SF   CLE  1455.940   
2245        2    1951        0      9        0   DET   WSH  1487.382   
2246        3    1951        0      9        0    GB   CHI  1364.142   
2247        4    1951        0      9        0   ARI   PHI  1505.582   

          elo2  score1  score2  
2243  1454.772      54      14  
2244  1661.753      24      10  
2245  1437.049      35      17  
2246  1582.697      20      31  
2247  1584.952      14      17  


In [90]:
# Summarize column dtypes, non-null counts and memory usage of the prepared frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14567 entries, 2243 to 16809
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   game_id  14567 non-null  object 
 1   season   14567 non-null  int64  
 2   neutral  14567 non-null  int64  
 3   month    14567 non-null  int64  
 4   playoff  14567 non-null  int64  
 5   team1    14567 non-null  object 
 6   team2    14567 non-null  object 
 7   elo1     14567 non-null  float64
 8   elo2     14567 non-null  float64
 9   score1   14567 non-null  int64  
 10  score2   14567 non-null  int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 1.3+ MB


# Save Main Data File

In [89]:
# Create the output directory first: to_csv does not create missing parent
# directories and fails when output_dir does not exist yet (e.g. on a fresh
# checkout). exist_ok=True makes this a no-op on later runs.
os.makedirs(output_dir, exist_ok=True)
# index=False: the row index (non-contiguous after the season filter) is not
# written; the id column identifies the rows instead.
data.to_csv(outp_fname, index=False)