In [31]:
import pandas as pd
import numpy as np

In [32]:
# Sports Reference game-log page for Michigan State, 2017 season
# (the trailing URL fragment is kept exactly as it was copied).
msuurl = (
    'https://www.sports-reference.com/cbb/schools/'
    'michigan-state/2017-gamelogs.html#sgl-basic::none'
)

In [33]:
# Read the team game log: read_html returns a list of tables found on the
# page, and the first one is used as the working frame.
df = pd.read_html(io=msuurl)[0]

In [34]:
# Remove opponent columns: keep only the first 23 columns of the table.
df = df.iloc[:, 0:23]

# Remove divider rows. Instead of relying on fixed row positions (which
# raises IndexError on a table with fewer rows and misses any additional
# dividers in a longer one), keep only the rows whose first column -- the
# game number 'G' -- parses as a number. Non-numeric entries are coerced to
# NaN and filtered out; the surviving rows keep their original index labels.
game_number = pd.to_numeric(df.iloc[:, 0], errors='coerce')
df = df[game_number.notna()]

In [35]:
# df.columns

In [36]:
# Remove the double column headers: each column label is a tuple, and only
# its second element (the lower header row) is kept as the flat label.
dubcols = list(df.columns)
cols = [header[1] for header in dubcols]
df.columns = cols

In [37]:
# Inspect the flattened labels; the output still contains a blank (' ')
# label and two separate 'Opp' labels.
df.columns

Index(['G', 'Date', ' ', 'Opp', 'W/L', 'Tm', 'Opp', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF'],
      dtype='object')

In [38]:
# Rename columns positionally (the list must match the current column order).
# Compared with the scraped labels this turns ' ' into 'Blank', 'W/L' into
# 'W', 'Tm' into 'Pts', the second 'Opp' into 'PtsA', 'TRB' into 'RB' and
# 'TOV' into 'TO'; every other label is unchanged.
newcols = [
    'G', 'Date', 'Blank', 'Opp', 'W', 'Pts', 'PtsA',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%',
    'ORB', 'RB', 'AST', 'STL', 'BLK', 'TO', 'PF',
]
df.columns = newcols

In [39]:
def teams_dict(filepath):
    '''
    Build a lookup from raw school names to formatted school names.

    Reads the CSV at ``filepath`` and pairs every value in its 'School'
    column with the value on the same row of its 'School_format' column.
    If a school name appears on several rows, the last row wins.
    '''
    name_table = pd.read_csv(filepath)[['School', 'School_format']]
    raw_names = name_table['School'].tolist()
    formatted_names = name_table['School_format'].tolist()
    return dict(zip(raw_names, formatted_names))

In [40]:
# Reformat the opponent names using the School -> School_format lookup.
# Opponents missing from the lookup become NaN, because Series.map leaves
# unmatched values empty.
# NOTE(review): absolute, machine-specific path -- confirm where this file
# should live before sharing the notebook.
team_names_sos_filepath = (
    '/Users/sec/galvanize/capstone1/team_list/'
    'sos_team_list_2018_final.csv'
)
df['Opp'] = df['Opp'].map(teams_dict(filepath=team_names_sos_filepath))

In [41]:
# Only the first character of the W field is kept, then mapped to 1 (win)
# or 0 (loss). Ties and overtime results carry extra characters after the
# leading letter, which is why the value is truncated first.
df['W'] = df['W'].astype(str).str.get(0).map({'W': 1, 'L': 0})

In [13]:
# Create win-percentage and rolling-average features.
# Both the running win total and each rolling window end at the current row,
# so every feature includes that game's own result / stats.
ROLLING_WINDOW = 5  # number of consecutive games averaged per feature

df['Ws'] = df['W'].cumsum(axis=0)
df['Wp'] = df['Ws'].astype(int) / df['G'].astype(int)

# (source column, feature column) pairs; list order fixes the order in which
# the new columns are appended to the frame.
rolling_features = [
    ('Pts', 'ppg'),
    ('PtsA', 'pApg'),
    ('FG%', 'FGp'),
    ('3P%', '3Pg'),
    ('FT%', 'FTp'),
    ('ORB', 'ORBpg'),
    ('RB', 'RBpg'),
    ('AST', 'ASTpg'),
    ('STL', 'STLpg'),
    ('BLK', 'BLKpg'),
    ('TO', 'TOpg'),
    ('PF', 'PFpg'),
]
for stat_col, feature_col in rolling_features:
    # The scraped columns may still be text (object dtype); rolling means
    # need numeric input, so convert first. Already-numeric columns pass
    # through unchanged, and the source column itself is not modified.
    stat_values = pd.to_numeric(df[stat_col])
    df[feature_col] = stat_values.rolling(window=ROLLING_WINDOW, center=False).mean()

In [14]:
# Remove the raw per-game columns now that the rolling averages exist;
# Date, Opp, W and the engineered features are kept.
df = df.drop(
    columns=[
        'G', 'Blank', 'Pts', 'PtsA',
        'FG', 'FGA', 'FG%',
        '3P', '3PA', '3P%',
        'FT', 'FTA', 'FT%',
        'ORB', 'RB', 'AST', 'STL', 'BLK', 'TO', 'PF',
    ]
)

In [15]:
# Drop every row that has a missing value in any column. This removes the
# leading rows whose rolling window is not yet full, and also any game whose
# opponent name was left unmapped (NaN) by the lookup.
# NOTE(review): confirm that discarding games against unmapped opponents is intended.
df = df.dropna(axis=0, how='any')

In [16]:
# Add the team column: every row gets the same formatted team name.
team = 'michigan-state'
df = df.assign(Tm=team)

In [17]:
def sos_dict(filepath):
    '''
    Create dictionary of formatted school names and strength of schedule
    for mapping.

    Reads the CSV at ``filepath`` and pairs every value in its
    'School_format' column with the 'SOS' value on the same row. If a school
    appears on several rows, the last row wins.

    Parameters
    ----------
    filepath : path or buffer accepted by ``pandas.read_csv``
        CSV containing at least the 'School_format' and 'SOS' columns.

    Returns
    -------
    dict
        ``{School_format value: SOS value}``.

    Raises
    ------
    KeyError
        If either required column is missing from the file.
    '''
    sos_table = pd.read_csv(filepath)[['School_format', 'SOS']]
    school_names = sos_table['School_format'].tolist()
    sos_values = sos_table['SOS'].tolist()
    # Distinct names for the key list, the value list and the result keep the
    # loop-free construction from rebinding anything it is iterating over.
    return dict(zip(school_names, sos_values))

In [18]:
# Add the SOS column by looking up each row's team name ('Tm') in the
# School_format -> SOS dictionary built from the same team-list file.
df['sos'] = df['Tm'].map(sos_dict(filepath=team_names_sos_filepath))

In [19]:
# Preview the first rows of the engineered feature frame.
df.head()

Unnamed: 0,Date,Opp,W,Ws,Wp,ppg,pApg,FGp,3Pg,FTp,ORBpg,RBpg,ASTpg,STLpg,BLKpg,TOpg,PFpg,Tm,sos
5,2016-11-24,baylor,0,3,0.5,71.4,66.8,0.4722,0.3942,0.5846,7.8,37.0,19.2,2.6,5.2,14.0,17.2,michigan-state,7.58
7,2016-11-29,duke,0,4,0.5,71.0,72.4,0.4726,0.3922,0.6742,6.0,34.8,18.0,2.4,5.0,13.6,19.2,michigan-state,7.58
11,2016-12-18,northeastern,0,7,0.583333,74.0,71.0,0.4782,0.3392,0.5596,9.8,36.8,19.8,4.8,6.0,13.0,19.4,michigan-state,7.58
12,2016-12-21,oakland,1,8,0.615385,75.6,68.4,0.4758,0.3642,0.5466,10.6,36.4,19.6,6.4,7.2,13.0,20.4,michigan-state,7.58
13,2016-12-27,minnesota,1,9,0.642857,74.6,68.0,0.478,0.3498,0.4906,9.8,36.2,19.2,7.4,7.0,13.2,22.8,michigan-state,7.58


Datetime work for labeling tournament vs. regular-season games

In [20]:
from datetime import date

# First and last calendar day (both inclusive) of the window used to label
# a game as 'tourney2017'.
tourney2017start = date(year=2017, month=3, day=14)
tourney2017end = date(year=2017, month=4, day=3)

In [21]:
# Datetime to date: parse the 'Date' column and keep only the calendar date
# so it can be compared against the datetime.date window bounds.
df['just_date'] = df['Date'].pipe(pd.to_datetime).dt.date

In [22]:
def tourney_game_label(row, start=None, end=None):
    '''
    Create the 'game_type' entry that marks tourney games.

    A copy of ``row`` is labelled and returned, so the object handed in by
    ``DataFrame.apply`` is never modified in place.

    Parameters
    ----------
    row : pandas.Series (or any mapping with ``.copy()``)
        Must contain 'just_date', a value comparable with ``start``/``end``.
    start, end : optional
        Inclusive bounds of the tournament window. When omitted they fall
        back to the notebook-level ``tourney2017start`` / ``tourney2017end``,
        so ``df.apply(tourney_game_label, axis=1)`` keeps working unchanged.

    Returns
    -------
    Copy of ``row`` with 'game_type' set to 'tourney2017' when
    ``start <= just_date <= end`` and to 'season' otherwise.
    '''
    if start is None:
        start = tourney2017start
    if end is None:
        end = tourney2017end

    game_day = row['just_date']
    in_window = game_day >= start and game_day <= end

    labelled = row.copy()
    labelled['game_type'] = 'tourney2017' if in_window else 'season'
    return labelled

In [23]:
# Label every game row-by-row; each returned row carries the new
# 'game_type' entry.
df = df.apply(tourney_game_label, axis='columns')

In [43]:
# Display the working frame.
# NOTE(review): this cell's execution count (43) follows the W-encoding cell
# (41), and its saved output still shows the raw per-game columns -- i.e. the
# feature-engineering cells numbered 13-23 above were not re-run in this
# session. Restart and run all cells to refresh the outputs.
df

Unnamed: 0,G,Date,Blank,Opp,W,Pts,PtsA,FG,FGA,FG%,...,FT,FTA,FT%,ORB,RB,AST,STL,BLK,TO,PF
0,1,2016-11-11,N,arizona,0,63,65,25,58,0.431,...,5,9,0.556,10,28,15,4,1,17,23
1,2,2016-11-15,N,kentucky,0,48,69,20,61,0.328,...,3,9,0.333,12,41,9,0,4,19,20
2,3,2016-11-18,,,1,100,53,39,59,0.661,...,6,10,0.6,12,42,33,6,4,14,15
3,4,2016-11-20,,,1,78,77,25,49,0.51,...,19,33,0.576,6,29,21,3,8,12,18
4,5,2016-11-23,N,,1,73,62,25,58,0.431,...,14,22,0.636,6,46,16,1,4,13,19
5,6,2016-11-24,N,baylor,0,58,73,22,51,0.431,...,7,9,0.778,3,27,17,3,6,12,14
6,7,2016-11-25,N,,1,77,72,26,52,0.5,...,14,21,0.667,7,34,20,2,6,14,25
7,8,2016-11-29,@,duke,0,69,78,27,55,0.491,...,10,14,0.714,8,38,16,3,1,17,20
8,9,2016-12-03,,,1,80,76,24,54,0.444,...,25,34,0.735,11,36,19,4,8,10,19
9,10,2016-12-06,,,1,77,57,34,67,0.507,...,1,6,0.167,14,47,28,7,8,12,15
