In [1]:
import pandas as pd
import numpy as np

In [2]:
# Michigan State 2017 game-log page on sports-reference.com.
msuurl = (
    'https://www.sports-reference.com/cbb/schools/'
    'michigan-state/2017-gamelogs.html'
    '#sgl-basic::none'
)

In [3]:
# Read the team game log: keep the first table parsed from the page.
df = pd.read_html(msuurl)[0]

In [4]:
# Remove the opponent columns by keeping only the first 23 columns.
df = df.iloc[:, :23]

# Remove the divider rows sitting at positions 20 and 21.
df = df.drop(index=df.index[[20, 21]])

In [5]:
# Remove the double column headers: keep only the second element of each
# column label.
dubcols = list(df.columns)
cols = list(map(lambda label: label[1], dubcols))
df.columns = cols

In [6]:
# Rename the 23 columns, grouped here by kind of statistic.
newcols = (
    ['G', 'Date', 'Blank', 'Opp', 'W', 'Pts', 'PtsA']
    + ['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%']
    + ['ORB', 'RB', 'AST', 'STL', 'BLK', 'TO', 'PF']
)
df.columns = newcols

In [7]:
def teams_dict(filepath):
    '''
    Build a lookup from school name to formatted school name.

    Reads the CSV at filepath and pairs each value of its 'School' column
    with the value of 'School_format' on the same row. If a school name
    appears more than once, the last row wins.
    '''
    name_table = pd.read_csv(filepath).loc[:, ['School', 'School_format']]
    raw_names = name_table['School'].tolist()
    formatted_names = name_table['School_format'].tolist()
    return dict(zip(raw_names, formatted_names))

In [8]:
# Reformat the opponent names through the school-name lookup.
# NOTE(review): opponents missing from the lookup become NaN here, and the
# later dropna() then removes those games -- confirm that is intended.
team_names_sos_filepath = '/Users/sec/galvanize/capstone1/team_list/sos_team_list_2018_final.csv'
df = df.assign(Opp=df['Opp'].map(teams_dict(team_names_sos_filepath)))

In [9]:
# Keep only the first character of the W field (ties and overtime results
# carry extra characters), then encode 'W' as 1 and 'L' as 0.
df['W'] = df['W'].astype(str).str[0].map({'W': 1, 'L': 0})

In [10]:
# Create the win-percentage and rolling-average features.
ROLLING_WINDOW = 5  # number of games in each rolling average

# (new feature column, source box-score column), listed in output order.
ROLLING_FEATURES = [
    ('ppg', 'Pts'),
    ('pApg', 'PtsA'),
    ('FGp', 'FG%'),
    ('3Pg', '3P%'),
    ('FTp', 'FT%'),
    ('ORBpg', 'ORB'),
    ('RBpg', 'RB'),
    ('ASTpg', 'AST'),
    ('STLpg', 'STL'),
    ('BLKpg', 'BLK'),
    ('TOpg', 'TO'),
    ('PFpg', 'PF'),
]

# Running win total, and wins so far divided by the game number.
df['Ws'] = df['W'].cumsum(axis=0)
df['Wp'] = df['Ws'].astype(int) / df['G'].astype(int)

for feature_name, stat_name in ROLLING_FEATURES:
    # The scraped table had header rows mixed into the data, so the stat
    # columns may still hold strings; make them numeric before averaging.
    stat_values = pd.to_numeric(df[stat_name])
    # Each window ends on the current row, so the current game is included.
    df[feature_name] = stat_values.rolling(window=ROLLING_WINDOW, center=False).mean()

In [11]:
# Remove the raw columns now that the rolling averages have been computed.
df = df.drop(columns=[
    'G', 'Blank', 'Pts', 'PtsA',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%',
    'ORB', 'RB', 'AST', 'STL', 'BLK', 'TO', 'PF',
])

In [12]:
# Discard every row that still contains a missing value in any column.
df = df.dropna(axis=0, how='any')

In [13]:
# Add the team column: every row gets the same team identifier.
team = 'michigan-state'
df = df.assign(Tm=team)

In [14]:
def sos_dict(filepath):
    '''
    Build a lookup from formatted school name to strength of schedule.

    Reads the CSV at filepath and pairs each value of its 'School_format'
    column with the value of 'SOS' on the same row. If a school appears
    more than once, the last row wins.
    '''
    ratings = pd.read_csv(filepath).loc[:, ['School_format', 'SOS']]
    school_names = ratings['School_format'].tolist()
    sos_values = ratings['SOS'].tolist()
    return {name: value for name, value in zip(school_names, sos_values)}

In [15]:
# Add the strength-of-schedule column by looking up each row's team.
df = df.assign(sos=df['Tm'].map(sos_dict(team_names_sos_filepath)))

In [16]:
from datetime import date
# First and last day of the 2017 tournament window; tourney_game_label
# treats both bounds as inclusive.
tourney2017start, tourney2017end = date(2017, 3, 14), date(2017, 4, 3)

In [17]:
# Parse the Date column and keep only the calendar date of each game.
df = df.assign(just_date=pd.to_datetime(df['Date']).dt.date)

In [18]:
def tourney_game_label(row, start=None, end=None):
    '''
    Label one game row as a tournament game or a regular-season game.

    Parameters
    ----------
    row : pandas.Series
        One row of the game log. Its 'just_date' entry is compared with
        the window bounds.
    start, end : datetime.date, optional
        Inclusive bounds of the tournament window. When omitted they fall
        back to the notebook-level tourney2017start and tourney2017end.

    Returns
    -------
    pandas.Series
        A copy of row with an added 'game_type' entry: 'tourney2017' when
        the game date lies inside the window, otherwise 'season'.
    '''
    if start is None:
        start = tourney2017start
    if end is None:
        end = tourney2017end

    # DataFrame.apply does not support functions that mutate the object
    # they are handed, so the label is written onto a copy of the row.
    labelled = row.copy()
    game_date = labelled['just_date']
    in_window = game_date >= start and game_date <= end
    labelled['game_type'] = 'tourney2017' if in_window else 'season'
    return labelled

In [19]:
# Label every game row as tournament or regular season.
df = df.apply(tourney_game_label, axis='columns')

In [20]:
# Display the column labels of the frame at this point in the notebook.
df.columns

Index(['Date', 'Opp', 'W', 'Ws', 'Wp', 'ppg', 'pApg', 'FGp', '3Pg', 'FTp',
       'ORBpg', 'RBpg', 'ASTpg', 'STLpg', 'BLKpg', 'TOpg', 'PFpg', 'Tm', 'sos',
       'just_date', 'game_type'],
      dtype='object')

In [21]:
# Columns that lag_columns will shift down by one row.
cols_to_shift = [
    'Ws',
    'Wp',
    'ppg',
    'pApg',
    'FGp',
    '3Pg',
    'FTp',
    'ORBpg',
    'RBpg',
    'ASTpg',
    'STLpg',
    'BLKpg',
    'TOpg',
    'PFpg',
    'Tm',
    'sos',
]

In [22]:
def lag_columns(df, cols_to_shift):
    '''
    Lag the given columns by one row and drop rows left incomplete.

    Each column named in cols_to_shift is replaced by its values shifted
    down one row, so a row carries the previous row's value for those
    columns. The lagged columns keep their original names and are placed
    after the remaining columns, in cols_to_shift order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    cols_to_shift : iterable of str
        Names of the columns to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the lagged columns, without any row that
        contains a missing value. When at least one column is lagged,
        that removes the first row, which has no previous value.
    '''
    # Drop repeated names but keep the requested order.
    lag_cols = list(dict.fromkeys(cols_to_shift))

    # Work on a copy so the caller's frame does not gain or lose columns.
    lagged = df.copy()
    for col in lag_cols:
        lagged[col] = df[col].shift(1)

    # Reorder by name instead of renaming with a '_shifted' suffix, so
    # unrelated columns whose names contain '_shifted' are left alone.
    lag_set = set(lag_cols)
    kept_cols = [col for col in lagged.columns if col not in lag_set]
    return lagged[kept_cols + lag_cols].dropna()
    

In [23]:
# Replace the feature columns with their one-row-lagged versions.
df = lag_columns(df, cols_to_shift=cols_to_shift)

In [26]:
# Display the frame.
# NOTE(review): the recorded output below shows an all-NaN feature row
# (index 5) even though lag_columns ends with dropna(), and the execution
# count jumps from 23 to 26 -- the output looks stale; re-run to confirm.
df

Unnamed: 0,Date,Opp,W,just_date,game_type,Ws,Wp,ppg,pApg,FGp,...,FTp,ORBpg,RBpg,ASTpg,STLpg,BLKpg,TOpg,PFpg,Tm,sos
5,2016-11-24,baylor,0,2016-11-24,season,,,,,,...,,,,,,,,,,
7,2016-11-29,duke,0,2016-11-29,season,3.0,0.5,71.4,66.8,0.4722,...,0.5846,7.8,37.0,19.2,2.6,5.2,14.0,17.2,michigan-state,7.58
11,2016-12-18,northeastern,0,2016-12-18,season,4.0,0.5,71.0,72.4,0.4726,...,0.6742,6.0,34.8,18.0,2.4,5.0,13.6,19.2,michigan-state,7.58
12,2016-12-21,oakland,1,2016-12-21,season,7.0,0.583333,74.0,71.0,0.4782,...,0.5596,9.8,36.8,19.8,4.8,6.0,13.0,19.4,michigan-state,7.58
13,2016-12-27,minnesota,1,2016-12-27,season,8.0,0.615385,75.6,68.4,0.4758,...,0.5466,10.6,36.4,19.6,6.4,7.2,13.0,20.4,michigan-state,7.58
14,2016-12-30,northwestern,1,2016-12-30,season,9.0,0.642857,74.6,68.0,0.478,...,0.4906,9.8,36.2,19.2,7.4,7.0,13.2,22.8,michigan-state,7.58
15,2017-01-04,rutgers,1,2017-01-04,season,10.0,0.666667,71.4,67.0,0.4624,...,0.5936,9.0,34.8,16.4,7.0,6.2,13.2,23.4,michigan-state,7.58
17,2017-01-11,minnesota,1,2017-01-11,season,11.0,0.6875,75.8,67.4,0.4846,...,0.6256,8.8,34.8,17.8,7.6,7.0,12.8,23.4,michigan-state,7.58
19,2017-01-21,indiana,0,2017-01-21,season,12.0,0.666667,71.4,62.0,0.46,...,0.6804,7.0,34.2,16.6,6.6,6.8,12.6,22.2,michigan-state,7.58
22,2017-01-24,purdue,0,2017-01-24,season,12.0,0.6,72.6,67.6,0.472,...,0.7192,7.0,30.8,16.0,4.8,5.2,13.4,20.2,michigan-state,7.58
