In [1]:
import pandas as pd
import numpy as np

In [2]:
# Game-log page on sports-reference.com (school: michigan-state, year: 2017).
msuurl = (
    'https://www.sports-reference.com/cbb/schools/'
    'michigan-state/2017-gamelogs.html#sgl-basic::none'
)

In [3]:
# Parse the HTML tables found at the game-log URL and keep the first one.
df = pd.read_html(io=msuurl)[0]

In [4]:
# Keep only the first 23 columns; the opponent columns to the right are dropped.
df = df.iloc[:, :23]

# Remove the divider rows located at positions 20 and 21.
# NOTE(review): these positions are hard-coded for this particular page --
# confirm them before reusing this cell with another team or season.
df = df.drop(index=df.index[[20, 21]])

In [5]:
# Collapse the doubled column header: keep the second entry of each label.
dubcols = list(df.columns)
cols = [levels[1] for levels in dubcols]
df.columns = cols

In [6]:
# Replace the scraped header with short column names (23 names, one per column).
newcols = [
    'G', 'Date', 'Blank', 'Opp', 'W',
    'Pts', 'PtsA',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%',
    'ORB', 'RB', 'AST', 'STL', 'BLK', 'TO', 'PF',
]
df.columns = newcols

In [7]:
def teams_dict(filepath):
    '''
    Build a lookup from school name to formatted school name.

    Reads the CSV at ``filepath`` and pairs each value in its 'School'
    column with the 'School_format' value on the same row. If a school
    name appears more than once, the last row wins.
    '''
    name_table = pd.read_csv(filepath)[['School', 'School_format']]
    raw_names = name_table['School'].tolist()
    formatted_names = name_table['School_format'].tolist()
    return dict(zip(raw_names, formatted_names))

In [8]:
# Rewrite the opponent names in 'Opp' using the school-name lookup table.
# NOTE(review): Series.map leaves NaN for any name missing from the lookup.
team_names_sos_filepath = '/Users/sec/galvanize/capstone1/team_list/sos_team_list_2018_final.csv'
df['Opp'] = df['Opp'].map(teams_dict(filepath=team_names_sos_filepath))

In [9]:
# Keep only the first character of the W field (it can carry extra
# characters, e.g. for overtime games), then encode 'W' as 1 and 'L' as 0.
df['W'] = df['W'].astype(str).str[0].map({'W': 1, 'L': 0})

In [10]:
# Running win total and win percentage through each game.
df['Ws'] = df['W'].cumsum(axis=0)
df['Wp'] = df['Ws'].astype(int) / df['G'].astype(int)

# Trailing-window means of the box-score stats. With the default (right-closed)
# window, each value covers the current game and the games before it.
rolling_window = 5

# New feature name -> raw stat column it is averaged from. The insertion order
# of this dict fixes the order in which the feature columns are appended.
rolling_features = {
    'ppg': 'Pts',
    'pApg': 'PtsA',
    'FGp': 'FG%',
    '3Pg': '3P%',
    'FTp': 'FT%',
    'ORBpg': 'ORB',
    'RBpg': 'RB',
    'ASTpg': 'AST',
    'STLpg': 'STL',
    'BLKpg': 'BLK',
    'TOpg': 'TO',
    'PFpg': 'PF',
}

for feature_name, stat_col in rolling_features.items():
    # The scraped table had header rows embedded in its body, so the stat
    # columns may have been parsed as strings; rolling aggregations need a
    # numeric dtype. pd.to_numeric leaves already-numeric columns unchanged
    # and raises on a value that is not a number.
    stat_values = pd.to_numeric(df[stat_col])
    df[feature_name] = stat_values.rolling(window=rolling_window, center=False).mean()

In [11]:
# The raw per-game columns are dropped once the rolling averages are computed.
df = df.drop(columns=[
    'G', 'Blank', 'Pts', 'PtsA',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%',
    'ORB', 'RB', 'AST', 'STL', 'BLK', 'TO', 'PF',
])

In [12]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [13]:
# Tag every row with the team identifier in a 'Tm' column.
team = 'michigan-state'
df = df.assign(Tm=team)

In [14]:
def sos_dict(filepath):
    '''
    Build a lookup from formatted school name to strength of schedule.

    Reads the CSV at ``filepath`` and pairs each value in its
    'School_format' column with the 'SOS' value on the same row. If a
    formatted name appears more than once, the last row wins.
    '''
    sos_table = pd.read_csv(filepath)[['School_format', 'SOS']]
    formatted_names = sos_table['School_format'].tolist()
    sos_values = sos_table['SOS'].tolist()
    return dict(zip(formatted_names, sos_values))

In [15]:
'''Add SOS columns'''
# Look up the strength of schedule for each row's team ('Tm') into 'sos'.
df = df.assign(sos=df['Tm'].map(sos_dict(filepath=team_names_sos_filepath)))

In [16]:
from datetime import date
# First and last calendar day of the window labelled as the 2017 tournament.
tourney2017start = date(year=2017, month=3, day=14)
tourney2017end = date(year=2017, month=4, day=3)

In [17]:
'''datetime to date'''
# Parse 'Date' as datetimes and store the plain date part in 'just_date'.
df = df.assign(just_date=pd.to_datetime(df['Date']).dt.date)

In [18]:
def tourney_game_label(row, start=None, end=None, label='tourney2017'):
    '''
    Return a copy of ``row`` with a 'game_type' entry added.

    'game_type' is set to ``label`` when row['just_date'] falls inside the
    inclusive window [start, end], and to 'season' otherwise.

    Parameters
    ----------
    row : pandas.Series
        One game; must contain a 'just_date' value comparable to the bounds.
    start, end : optional
        Inclusive window bounds. When omitted they fall back to the
        notebook-level ``tourney2017start`` / ``tourney2017end`` values.
    label : str, optional
        Value written for games inside the window.

    Returns
    -------
    pandas.Series
        A new row; the row passed in is left unmodified.
    '''
    if start is None:
        start = tourney2017start
    if end is None:
        end = tourney2017end

    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    labelled = row.copy()
    game_day = row['just_date']
    labelled['game_type'] = label if start <= game_day <= end else 'season'
    return labelled

In [19]:
# Label each game row-by-row as tournament or regular season.
df = df.apply(tourney_game_label, axis='columns')

In [20]:
# Display the current column labels (before the feature columns are lagged).
df.columns

Index(['Date', 'Opp', 'W', 'Ws', 'Wp', 'ppg', 'pApg', 'FGp', '3Pg', 'FTp',
       'ORBpg', 'RBpg', 'ASTpg', 'STLpg', 'BLKpg', 'TOpg', 'PFpg', 'Tm', 'sos',
       'just_date', 'game_type'],
      dtype='object')

In [21]:
# Columns handed to lag_columns to be shifted down by one row.
cols_to_shift = [
    'Ws', 'Wp',
    'ppg', 'pApg',
    'FGp', '3Pg', 'FTp',
    'ORBpg', 'RBpg', 'ASTpg', 'STLpg', 'BLKpg', 'TOpg', 'PFpg',
    'Tm', 'sos',
]

In [22]:
def lag_columns(df, cols_to_shift):
    '''
    Return a new frame in which the given columns are lagged by one row.

    Each column in ``cols_to_shift`` is shifted down one position, so a row
    carries the value that the previous row held. The untouched columns come
    first in their original order, followed by the lagged columns in the
    order given. Rows containing any missing value are dropped, which
    includes the first row, since it has nothing to lag from.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    cols_to_shift : list of str
        Names of the columns to lag; each must exist in ``df``.

    Returns
    -------
    pandas.DataFrame
        The lagged frame; the lagged columns keep their original names.
    '''
    # drop() returns a new frame, so the caller's frame never picks up helper
    # columns, and no column needs renaming afterwards (a rename based on
    # substring replacement could also clobber unrelated column names).
    lagged = df.drop(cols_to_shift, axis=1)
    for col in cols_to_shift:
        lagged[col] = df[col].shift(1)
    return lagged.dropna()
    

In [23]:
# Lag the feature columns by one row.
df = lag_columns(df=df, cols_to_shift=cols_to_shift)

In [25]:
# Display the column labels after lagging (lagged columns now come last).
df.columns

Index(['Date', 'Opp', 'W', 'just_date', 'game_type', 'Ws', 'Wp', 'ppg', 'pApg',
       'FGp', '3Pg', 'FTp', 'ORBpg', 'RBpg', 'ASTpg', 'STLpg', 'BLKpg', 'TOpg',
       'PFpg', 'Tm', 'sos'],
      dtype='object')