# Scraping MLS stats for each year

# Notebook scraps data from https://www.mlssoccer.com/stats for each team by year and exports the output to a set of csv files.

### The Notebook exports Team statistics for MLS Seasons 1996-2018.
#### The Output is in two sets - One for each season(for team specific analysis) and second is a combined dataset for all seasons combined for overall analysis

In [179]:
# Dependency

from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib.parse
import numpy as np

In [187]:
# Building the url and arguements

domain = 'https://www.mlssoccer.com/stats/team?'
years = np.arange(1996,2019,1)
season_type = 'REG'
#getVars= {'year': year, 'season_type': season_type}

urls = [(domain + urllib.parse.urlencode({'year': year, 'season_type': season_type})) for year in years]
print(urls)

['https://www.mlssoccer.com/stats/team?year=1996&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=1997&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=1998&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=1999&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=2000&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=2001&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=2002&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=2003&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=2004&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=2005&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=2006&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=2007&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=2008&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=2009&season_type=REG', 'https://www.mlssoccer.com/stats/team?year=2010&season_type=REG', 'https://

In [191]:
# Building a list of each response for each year
responses = [requests.get(url) for url in urls]

In [193]:
# Build BeautifulSoup to parse the raw html and return it as a nested data structure for year year (1996-2019)

soup = [BeautifulSoup(response.text,'html.parser') for response in response]

In [194]:
len(soup)

23

In [195]:
# Get Variable Definations to be used as glossary

stats_def = soup[0].find('div',class_='stats_legend')
print(stats_def.prettify())

<div class="stats_legend">
 <strong>
  GP:
 </strong>
 Games Played,
 <strong>
  GS:
 </strong>
 Games Started,
 <strong>
  G:
 </strong>
 Goals,
 <strong>
  MIN:
 </strong>
 Minutes Played,
 <strong>
  A:
 </strong>
 Assists,
 <strong>
  SHT:
 </strong>
 Shots,
 <strong>
  SOG:
 </strong>
 Shots on Goal,
 <strong>
  FC:
 </strong>
 Fouls Committed,
 <strong>
  FS:
 </strong>
 Fouls Suffered,
 <strong>
  Y:
 </strong>
 Yellow Cards,
 <strong>
  R:
 </strong>
 Red Cards,
 <strong>
  GF:
 </strong>
 Goals For,
 <strong>
  GA:
 </strong>
 Goals Against,
 <strong>
  SO:
 </strong>
 Shutouts,
 <strong>
  SV:
 </strong>
 Saves,
 <strong>
  CK:
 </strong>
 Corner Kicks,
 <strong>
  PKA:
 </strong>
 Penalty Kick Attempts,
 <strong>
  PKG:
 </strong>
 Penalty Kick Goals,
 <strong>
  PKS:
 </strong>
 Penalty Kick Saves,
 <strong>
  OFF:
 </strong>
 Offsides
</div>



In [196]:
stats_def.text

' GP: Games Played, GS: Games Started, G: Goals, MIN: Minutes Played, A: Assists, SHT: Shots, SOG: Shots on Goal, FC: Fouls Committed, FS: Fouls Suffered, Y: Yellow Cards, R: Red Cards, GF: Goals For, GA: Goals Against, SO: Shutouts, SV: Saves, CK: Corner Kicks, PKA: Penalty Kick Attempts, PKG: Penalty Kick Goals, PKS: Penalty Kick Saves, OFF: Offsides'

In [197]:
# Split each term from the html into a list of List items
glossary = [item.split(': ') for item in stats_def.text.split(', ')]

In [285]:
# Store glossary terms in a dataframe. Export the glossary to csv
glossary_df = pd.DataFrame(glossary,columns=['ID','Term'])
glossary_df.to_csv('data/stats_glossary.csv')
glossary_df

Unnamed: 0,ID,Term
0,GP,Games Played
1,GS,Games Started
2,G,Goals
3,MIN,Minutes Played
4,A,Assists
5,SHT,Shots
6,SOG,Shots on Goal
7,FC,Fouls Committed
8,FS,Fouls Suffered
9,Y,Yellow Cards


In [283]:
#rows = soup.find_all('tr', class_='odd')
df = pd.DataFrame()
index = -1
year_index = 0
teams=[]



In [284]:
for year in soup:
    rows = year.find_all('td')
    df_year = pd.DataFrame()
    for row in rows:
        if row['data-title']=='club':
            index = index+1
        df_year.at[index,row['data-title']] = row.text
        df_year.at[index,'year'] = years[year_index]
        df_year.to_csv('data/team_season' + str(years[year_index]) + '.csv')

    print("______NEXT YEAR_____")
    print(df_year)
    print(df)
    df = df.append(df_year,ignore_index=True)
    index=0
    year_index = year_index+1

______NEXT YEAR_____
                     club    year  gp   g   a shts  sog   fc   fs  off   ck  \
0              MetroStars  1996.0  32  45  43  419  203  430  452  118  172   
1     Kansas City Wizards  1996.0  32  61  59  514  250  412  404   94  144   
2         Colorado Rapids  1996.0  32  44  41  435  201  394  389  105  192   
3           Columbus Crew  1996.0  32  59  53  406  202  415  520   99  170   
4  New England Revolution  1996.0  32  43  42  383  174  514  436  117  130   
5          San Jose Clash  1996.0  32  50  54  420  209  515  485  137  180   
6      Los Angeles Galaxy  1996.0  32  59  46  442  221  461  415   94  187   
7             D.C. United  1996.0  32  62  65  436  223  417  403   89  194   
8        Tampa Bay Mutiny  1996.0  32  66  60  377  185  449  410  147  140   
9             Dallas Burn  1996.0  32  50  42  411  224  425  472   98  152   

  pkg pka  
0   2   4  
1   7   8  
2   3   3  
3   6   7  
4   6   7  
5   3   3  
6   4   6  
7   4   4  
8

______NEXT YEAR_____
                      club    year  gp   g   a shts  sog   fc   fs  off   ck  \
1               MetroStars  2000.0  32  64  72  437  207  593  520  108  137   
2      Kansas City Wizards  2000.0  32  47  67  452  175  503  541   96  151   
3          Colorado Rapids  2000.0  32  43  57  482  196  581  526  116  173   
4            Columbus Crew  2000.0  32  48  57  429  184  498  543   78  176   
5   New England Revolution  2000.0  32  47  50  441  209  569  526  109  173   
6     San Jose Earthquakes  2000.0  32  35  41  412  192  527  473  113  170   
7             Chicago Fire  2000.0  32  67  85  488  230  545  564   75  182   
8       Los Angeles Galaxy  2000.0  32  47  47  472  202  516  504  113  186   
9              D.C. United  2000.0  32  44  42  472  214  618  531  111  263   
10        Tampa Bay Mutiny  2000.0  32  62  94  457  202  434  446  130  147   
11             Dallas Burn  2000.0  32  54  79  392  199  588  521  103  143   
12            Miami

______NEXT YEAR_____
                      club    year  gp   g   a shts  sog   fc   fs off   ck  \
1               MetroStars  2002.0  28  41  52  383  174  445  515  87  151   
2      Kansas City Wizards  2002.0  28  37  49  403  176  362  442  92  152   
3          Colorado Rapids  2002.0  28  43  63  369  161  469  390  87  129   
4            Columbus Crew  2002.0  28  44  60  375  155  482  470  93  126   
5   New England Revolution  2002.0  28  49  62  368  184  463  425  64  145   
6     San Jose Earthquakes  2002.0  28  45  59  372  177  448  383  75  158   
7             Chicago Fire  2002.0  28  43  54  380  165  472  479  73  147   
8       Los Angeles Galaxy  2002.0  28  44  68  401  198  456  407  68  161   
9              D.C. United  2002.0  28  31  40  334  164  492  442  99  185   
10             Dallas Burn  2002.0  28  44  58  403  182  470  411  91  171   

   pkg pka  
1    6   9  
2    4   6  
3    6   6  
4    3   3  
5    3   4  
6    2   3  
7    2   4  
8    

______NEXT YEAR_____
                      club    year  gp   g   a shts  sog   fc   fs  off   ck  \
1               MetroStars  2004.0  30  47  40  456  193  430  500   48  151   
2      Kansas City Wizards  2004.0  30  38  34  368  162  405  429   67  107   
3          Colorado Rapids  2004.0  30  29  35  396  165  463  385  102  155   
4            Columbus Crew  2004.0  30  40  32  335  147  464  398   75  134   
5   New England Revolution  2004.0  30  42  47  364  177  406  477   72  175   
6     San Jose Earthquakes  2004.0  30  41  45  373  158  460  499   72  177   
7             Chicago Fire  2004.0  30  36  40  381  150  420  504   79  147   
8       Los Angeles Galaxy  2004.0  30  42  40  347  156  484  469   75  166   
9              D.C. United  2004.0  30  43  48  365  187  542  410   75  180   
10             Dallas Burn  2004.0  30  34  38  407  160  499  373   71  162   

   pkg pka  
1    8   8  
2    3   5  
3    2   3  
4    8   8  
5    4   6  
6    3   5  
7    4 

______NEXT YEAR_____
                      club    year  gp   g   a shts  sog   fc   fs  off   ck  \
1       New York Red Bulls  2006.0  32  41  38  392  170  540  450  109  130   
2      Kansas City Wizards  2006.0  32  43  35  408  180  399  460   86  168   
3          Colorado Rapids  2006.0  32  36  38  364  149  420  402  100  173   
4            Columbus Crew  2006.0  32  30  24  358  145  512  468  117  158   
5   New England Revolution  2006.0  32  39  42  382  167  422  515   94  156   
6             Chicago Fire  2006.0  32  43  41  345  155  457  466   84  142   
7       Los Angeles Galaxy  2006.0  32  37  39  335  144  532  501   86  142   
8              D.C. United  2006.0  32  52  49  379  194  454  433   91  144   
9           Houston Dynamo  2006.0  32  44  41  348  158  502  530  101  154   
10          Real Salt Lake  2006.0  32  45  42  355  147  422  416  123  125   
11               FC Dallas  2006.0  32  48  51  398  168  476  419  105  112   
12              Chi

______NEXT YEAR_____
                      club    year  gp   g   a shts  sog   fc   fs  off   ck  \
1       New York Red Bulls  2008.0  30  42  38  356  158  413  341   82  145   
2      Kansas City Wizards  2008.0  30  37  38  355  137  321  379  103  144   
3          Colorado Rapids  2008.0  30  44  38  397  162  346  376   68  155   
4            Columbus Crew  2008.0  30  50  45  388  161  326  401   73  173   
5   New England Revolution  2008.0  30  40  29  323  157  358  417   49  144   
6     San Jose Earthquakes  2008.0  30  32  33  317  132  382  353   78  154   
7             Chicago Fire  2008.0  30  44  43  335  162  333  390   72  123   
8       Los Angeles Galaxy  2008.0  30  55  55  353  161  381  350   87  110   
9              D.C. United  2008.0  30  43  47  352  152  379  377   78  166   
10          Houston Dynamo  2008.0  30  45  44  367  156  375  385  110  160   
11          Real Salt Lake  2008.0  30  40  35  429  178  388  355   67  148   
12               FC

______NEXT YEAR_____
                      club    year  gp   g   a shts  sog   fc   fs  off   ck  \
1       New York Red Bulls  2010.0  30  38  31  365  137  392  278  107  143   
2      Kansas City Wizards  2010.0  30  36  32  393  169  341  374   69  150   
3          Colorado Rapids  2010.0  30  44  35  354  141  293  321   67  128   
4            Columbus Crew  2010.0  30  40  26  364  137  314  364  103  141   
5   New England Revolution  2010.0  30  32  26  326  132  328  336   78  128   
6     San Jose Earthquakes  2010.0  30  34  35  329  140  286  305   83  126   
7             Chicago Fire  2010.0  30  37  37  295  126  296  348   84  129   
8       Los Angeles Galaxy  2010.0  30  44  46  361  146  356  295   68  140   
9              D.C. United  2010.0  30  21   9  337  134  322  353   71  144   
10          Houston Dynamo  2010.0  30  40  44  326  122  299  330  107  161   
11          Real Salt Lake  2010.0  30  45  39  340  135  335  348   95  133   
12               FC

______NEXT YEAR_____
                      club    year  gp   g   a shts  sog   fc   fs  off   ck  \
1       New York Red Bulls  2012.0  34  57  56  394  153  410  351   77  148   
2     Sporting Kansas City  2012.0  34  42  47  542  158  456  414   54  199   
3          Colorado Rapids  2012.0  34  44  41  459  162  415  426   62  145   
4            Columbus Crew  2012.0  34  44  39  466  151  412  393  111  149   
5   New England Revolution  2012.0  34  39  31  455  178  360  483   68  154   
6     San Jose Earthquakes  2012.0  34  72  73  517  192  440  410   91  185   
7             Chicago Fire  2012.0  34  46  39  460  163  349  421   75  158   
8                LA Galaxy  2012.0  34  59  61  480  179  421  391   67  188   
9              D.C. United  2012.0  34  53  52  451  163  390  390  106  157   
10        Portland Timbers  2012.0  34  34  28  421  133  388  403   86  172   
11         Montreal Impact  2012.0  34  45  43  443  156  427  417  114  143   
12  Vancouver White

______NEXT YEAR_____
                      club    year  gp   g   a shts  sog   fc   fs off   ck  \
1       New York Red Bulls  2014.0  34  55  54  402  158  400  382  87  166   
2     Sporting Kansas City  2014.0  34  48  37  468  160  499  492  79  180   
3          Colorado Rapids  2014.0  34  43  28  476  166  392  379  88  173   
4            Columbus Crew  2014.0  34  52  46  474  161  411  391  80  177   
5   New England Revolution  2014.0  34  51  49  468  180  427  442  54  209   
6     San Jose Earthquakes  2014.0  34  35  26  390  126  413  450  53  118   
7             Chicago Fire  2014.0  34  41  29  423  148  448  448  66  173   
8                LA Galaxy  2014.0  34  69  75  543  201  394  415  55  174   
9              D.C. United  2014.0  34  52  49  366  144  429  397  85  136   
10        Portland Timbers  2014.0  34  61  50  488  183  451  526  96  176   
11         Montreal Impact  2014.0  34  38  38  393  141  457  383  89  146   
12  Vancouver Whitecaps FC  201

______NEXT YEAR_____
                      club    year  gp   g   a shts  sog   fc   fs  off   ck  \
1       New York Red Bulls  2016.0  34  61  64  441  174  450  449   78  185   
2     Sporting Kansas City  2016.0  34  42  46  478  146  420  415   62  198   
3          Colorado Rapids  2016.0  34  39  44  435  151  458  341   74  131   
4         Columbus Crew SC  2016.0  34  50  52  463  167  312  376  104  200   
5   New England Revolution  2016.0  34  44  45  474  163  426  428   72  174   
6     San Jose Earthquakes  2016.0  34  32  29  409  128  413  482   69  169   
7             Chicago Fire  2016.0  34  42  41  394  125  412  398   88  157   
8                LA Galaxy  2016.0  34  54  59  358  142  361  394   59  138   
9              D.C. United  2016.0  34  53  55  471  163  507  419   95  154   
10        Portland Timbers  2016.0  34  48  32  449  154  438  466   77  162   
11         Montreal Impact  2016.0  34  49  44  440  161  431  425   66  160   
12  Vancouver White

______NEXT YEAR_____
                      club    year  gp   g   a shts  sog   fc   fs off   ck  \
1             NY Red Bulls  2018.0  34  62  72  456  186  398  406  88  183   
2     Sporting Kansas City  2018.0  34  65  69  583  207  376  377  38  206   
3          Colorado Rapids  2018.0  34  36  32  380  124  465  409  69  154   
4         Columbus Crew SC  2018.0  34  43  43  497  155  365  362  82  215   
5   New England Revolution  2018.0  34  49  44  486  179  513  385  44  224   
6     San Jose Earthquakes  2018.0  34  49  34  439  149  378  445  62  174   
7             Chicago Fire  2018.0  34  48  48  341  129  413  324  85  159   
8                LA Galaxy  2018.0  34  66  69  425  160  423  351  81  176   
9              D.C. United  2018.0  34  60  61  396  149  377  421  49  147   
10        Portland Timbers  2018.0  34  54  44  466  171  369  392  67  162   
11         Montreal Impact  2018.0  34  47  44  424  138  375  397  86  151   
12  Vancouver Whitecaps FC  201

In [282]:
# Export Overall Data for all seasons combined
df.to_csv('data/Team_Season_Data_combined.csv')

In [278]:
# Display by each year
df.set_index(['year','club'])

Unnamed: 0_level_0,Unnamed: 1_level_0,gp,g,a,shts,sog,fc,fs,off,ck,pkg,pka
year,club,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1996.0,MetroStars,32,45,43,419,203,430,452,118,172,2,4
1996.0,Kansas City Wizards,32,61,59,514,250,412,404,94,144,7,8
1996.0,Colorado Rapids,32,44,41,435,201,394,389,105,192,3,3
1996.0,Columbus Crew,32,59,53,406,202,415,520,99,170,6,7
1996.0,New England Revolution,32,43,42,383,174,514,436,117,130,6,7
1996.0,San Jose Clash,32,50,54,420,209,515,485,137,180,3,3
1996.0,Los Angeles Galaxy,32,59,46,442,221,461,415,94,187,4,6
1996.0,D.C. United,32,62,65,436,223,417,403,89,194,4,4
1996.0,Tampa Bay Mutiny,32,66,60,377,185,449,410,147,140,6,6
1996.0,Dallas Burn,32,50,42,411,224,425,472,98,152,3,3
