<a href="https://colab.research.google.com/github/rldrobinson/wtcs_python_project/blob/main/WeTeach_Python_Day_3_Lesson_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WeTeach_Python Day 3 Lesson 1: More Fun with APIs and Data Analysis


In [None]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress

We'll begin by using the same API as in Day 2 Lesson 2:

In [None]:
# Base address of the MLB lookup service; every request in this notebook
# appends its own endpoint path and query string to this value.
url = "http://lookup-service-prod.mlb.com/"

In [None]:
# Build the full address for the team listing first (the query string asks for
# sport_code 'mlb', season 2019, sorted by name ascending), then send the request.
teams_endpoint = url + "json/named.team_all_season.bam?sport_code='mlb'&all_star_sw='N'&sort_order='name_asc'&season=2019"
teams = requests.get(teams_endpoint)

# Leaving the response as the last expression makes the notebook print its status.
teams

<Response [200]>

In [None]:
# Decode the JSON body, then walk down to the sequence of per-team records.
team_data = teams.json()
query_results = team_data['team_all_season']['queryResults']
team_list = query_results['row']

# Peek at the field names carried by the first team record.
team_list[0].keys()

dict_keys(['phone_number', 'venue_name', 'franchise_code', 'all_star_sw', 'sport_code', 'address_city', 'city', 'name_display_full', 'spring_league_abbrev', 'time_zone_alt', 'sport_id', 'venue_id', 'mlb_org_id', 'time_zone_generic', 'mlb_org', 'last_year_of_play', 'league_full', 'home_opener_time', 'address_province', 'league_id', 'name_abbrev', 'bis_team_code', 'league', 'spring_league', 'base_url', 'address_zip', 'sport_code_display', 'mlb_org_short', 'time_zone', 'address_line1', 'mlb_org_brief', 'address_line2', 'season', 'address_line3', 'division_abbrev', 'name_display_short', 'team_id', 'active_sw', 'address_intl', 'state', 'address_country', 'mlb_org_abbrev', 'division', 'team_code', 'name', 'website_url', 'sport_code_name', 'first_year_of_play', 'league_abbrev', 'name_display_long', 'store_url', 'time_zone_text', 'name_short', 'home_opener', 'address_state', 'division_full', 'time_zone_num', 'spring_league_full', 'address', 'name_display_brief', 'file_code', 'division_id', 'sp

In this cell, we pull every player from the Padres' 1998 roster and add each player's career hitting statistics dictionary to a list.

In [None]:
# Collect one career-hitting dictionary per player on the requested rosters.
batting_stats = []
team_id = [135]  # team ids to fetch; the lesson text identifies this one as the Padres

for teamid in team_id:
  # Roster query limited to the 1998 season (start_season and end_season are both 1998).
  players = requests.get(url + "json/named.roster_team_alltime.bam?start_season=1998&end_season=1998&team_id={}".format(teamid))
  player_list = players.json()['roster_team_alltime']['queryResults']['row']

  for player in player_list:
    # Career hitting totals for this player (sport_career_hitting endpoint).
    hitting_stats = requests.get(url + "json/named.sport_career_hitting.bam?league_list_id='mlb'&game_type='R'&player_id={}".format(player['player_id']))
    try:
      # These dictionary lookups are the statements that can raise KeyError
      # (for example when the response carries no 'row' entry), so they are
      # the ones that belong inside the try block.
      hitting_dict = hitting_stats.json()['sport_career_hitting']['queryResults']['row']
      # Tag the stats with the player's name so each row can be identified later.
      hitting_dict['name'] = player['name_last_first']
    except KeyError:
      # No usable hitting record for this player: skip them and keep going
      # instead of aborting the whole download.
      continue
    batting_stats.append(hitting_dict)

Since we now have a list of batting statistics in which every dictionary has the same keys, we can create a pandas DataFrame directly from it.

In [None]:
# One row per player, one column per dictionary key.
batting_df = pd.DataFrame(batting_stats)

# Replace empty strings with NaN (Not a Number) so that missing stats are
# treated as missing values. This is an exact-match replacement: only cells
# that are exactly "" are changed. No regular expression is needed here; an
# empty regex pattern would match every string.
batting_df_no_empty = batting_df.replace("", np.nan)

# Work on a copy so the NaN-filled frame stays available unchanged.
df_converted = batting_df_no_empty.copy()
for column in batting_df_no_empty.columns:
  try:
    # Convert the column to a numeric type when every value can be parsed as a number.
    df_converted[column] = pd.to_numeric(batting_df_no_empty[column])
  except ValueError:
    # The column holds non-numeric text (such as player names): keep it as it is.
    pass

# Last expression in the cell, so the notebook renders the converted table.
df_converted

Unnamed: 0,hr,gidp,np,sac,team_count,sport_code,hgnd,rbi,lob,babip,gidp_opp,tb,xbh,sport_id,bb,avg,slg,ops,hbp,g,d,hfly,so,wo,sport,sf,hpop,tpa,h,cs,obp,hldr,t,ao,r,go_ao,sb,ppa,player_id,ab,ibb,roe,go,name
0,14,14,,6,2,mlb,16.0,55,77.0,0.29,28.0,178,34,1,25,0.238,0.371,0.649,2,173,18,12.0,121,0.0,MLB,0,0.0,513,114,0,0.278,12.0,2,35,46,0.97,2,0.0,110313,480,2,0.0,34.0,"Arias, George"
1,1,5,280.0,83,5,mlb,11.0,26,103.0,0.228,43.0,86,14,1,16,0.134,0.165,0.325,0,306,13,3.0,218,0.0,MLB,1,0.0,621,70,0,0.16,10.0,0,33,26,2.39,1,0.45,110351,521,0,3.0,79.0,"Ashby, Andy"
2,0,2,19.0,3,4,mlb,0.0,2,11.0,0.133,5.0,3,1,1,3,0.065,0.097,0.244,0,323,1,1.0,16,0.0,MLB,0,0.0,37,2,0,0.147,1.0,0,0,0,13.0,0,0.51,111145,31,0,0.0,13.0,"Boehringer, Brian"
3,2,2,430.0,54,6,mlb,7.0,29,128.0,0.201,54.0,80,12,1,19,0.129,0.162,0.331,5,452,10,6.0,187,0.0,MLB,2,1.0,575,64,0,0.169,15.0,0,65,20,1.62,0,0.75,111554,495,0,4.0,105.0,"Brown, Kevin"
4,0,0,,0,4,mlb,0.0,0,0.0,0.125,1.0,2,1,1,0,0.1,0.2,0.3,0,106,1,0.0,2,0.0,MLB,0,0.0,10,1,0,0.1,0.0,0,0,0,1.0,0,0.0,111636,10,0,0.0,1.0,"Bruske, Jim"
5,239,152,1532.0,14,4,mlb,72.0,983,382.0,0.297,150.0,2809,604,1,727,0.272,0.447,0.794,29,1760,348,57.0,1163,0.0,MLB,67,2.0,7125,1710,39,0.347,91.0,17,221,894,1.03,88,0.22,111908,6288,112,8.0,228.0,"Caminiti, Ken"
6,34,34,,7,2,mlb,,185,,0.306,,483,100,1,80,0.241,0.379,0.67,16,500,59,,360,,MLB,13,,1392,308,3,0.292,,7,0,136,,16,0.0,112283,1276,3,,,"Cianfrocco, Archi"
7,0,2,895.0,43,4,mlb,20.0,12,196.0,0.189,53.0,40,6,1,14,0.095,0.115,0.254,4,188,5,1.0,174,0.0,MLB,1,0.0,410,33,0,0.139,12.0,1,31,22,4.9,0,2.18,136725,348,0,10.0,152.0,"Clement, Matt"
8,0,0,39.0,1,4,mlb,0.0,4,10.0,0.269,2.0,10,2,1,3,0.2,0.286,0.549,0,183,1,0.0,9,0.0,MLB,0,0.0,39,7,0,0.263,2.0,1,5,6,1.8,0,1.0,112940,35,0,0.0,9.0,"Cunnane, Will"
9,38,40,4621.0,7,3,mlb,112.0,204,705.0,0.289,238.0,553,117,1,153,0.237,0.366,0.672,7,486,77,72.0,385,1.0,MLB,19,4.0,1698,358,8,0.306,170.0,2,386,168,1.06,9,2.72,150433,1512,12,19.0,408.0,"Davis, Ben"


We can look at the data types by using the `dtypes` attribute:

In [None]:
# Show the data type pandas assigned to each column after the numeric conversion.
display(df_converted.dtypes)

hr              int64
gidp            int64
np            float64
sac             int64
team_count      int64
sport_code     object
hgnd          float64
rbi             int64
lob           float64
babip         float64
gidp_opp      float64
tb              int64
xbh             int64
sport_id        int64
bb              int64
avg           float64
slg           float64
ops           float64
hbp             int64
g               int64
d               int64
hfly          float64
so              int64
wo            float64
sport          object
sf              int64
hpop          float64
tpa             int64
h               int64
cs              int64
obp           float64
hldr          float64
t               int64
ao              int64
r               int64
go_ao         float64
sb              int64
ppa           float64
player_id       int64
ab              int64
ibb             int64
roe           float64
go            float64
name           object
dtype: object

One column, `np`, has a lot of `NaN` values, so let's drop that column to preserve most of our data.

In [None]:
# Discard the sparsely populated 'np' column, then show the resulting table.
df_converted = df_converted.drop(columns=['np'])
df_converted

Unnamed: 0,hr,gidp,sac,team_count,sport_code,hgnd,rbi,lob,babip,gidp_opp,tb,xbh,sport_id,bb,avg,slg,ops,hbp,g,d,hfly,so,wo,sport,sf,hpop,tpa,h,cs,obp,hldr,t,ao,r,go_ao,sb,ppa,player_id,ab,ibb,roe,go,name
0,14,14,6,2,mlb,16.0,55,77.0,0.29,28.0,178,34,1,25,0.238,0.371,0.649,2,173,18,12.0,121,0.0,MLB,0,0.0,513,114,0,0.278,12.0,2,35,46,0.97,2,0.0,110313,480,2,0.0,34.0,"Arias, George"
1,1,5,83,5,mlb,11.0,26,103.0,0.228,43.0,86,14,1,16,0.134,0.165,0.325,0,306,13,3.0,218,0.0,MLB,1,0.0,621,70,0,0.16,10.0,0,33,26,2.39,1,0.45,110351,521,0,3.0,79.0,"Ashby, Andy"
2,0,2,3,4,mlb,0.0,2,11.0,0.133,5.0,3,1,1,3,0.065,0.097,0.244,0,323,1,1.0,16,0.0,MLB,0,0.0,37,2,0,0.147,1.0,0,0,0,13.0,0,0.51,111145,31,0,0.0,13.0,"Boehringer, Brian"
3,2,2,54,6,mlb,7.0,29,128.0,0.201,54.0,80,12,1,19,0.129,0.162,0.331,5,452,10,6.0,187,0.0,MLB,2,1.0,575,64,0,0.169,15.0,0,65,20,1.62,0,0.75,111554,495,0,4.0,105.0,"Brown, Kevin"
4,0,0,0,4,mlb,0.0,0,0.0,0.125,1.0,2,1,1,0,0.1,0.2,0.3,0,106,1,0.0,2,0.0,MLB,0,0.0,10,1,0,0.1,0.0,0,0,0,1.0,0,0.0,111636,10,0,0.0,1.0,"Bruske, Jim"
5,239,152,14,4,mlb,72.0,983,382.0,0.297,150.0,2809,604,1,727,0.272,0.447,0.794,29,1760,348,57.0,1163,0.0,MLB,67,2.0,7125,1710,39,0.347,91.0,17,221,894,1.03,88,0.22,111908,6288,112,8.0,228.0,"Caminiti, Ken"
6,34,34,7,2,mlb,,185,,0.306,,483,100,1,80,0.241,0.379,0.67,16,500,59,,360,,MLB,13,,1392,308,3,0.292,,7,0,136,,16,0.0,112283,1276,3,,,"Cianfrocco, Archi"
7,0,2,43,4,mlb,20.0,12,196.0,0.189,53.0,40,6,1,14,0.095,0.115,0.254,4,188,5,1.0,174,0.0,MLB,1,0.0,410,33,0,0.139,12.0,1,31,22,4.9,0,2.18,136725,348,0,10.0,152.0,"Clement, Matt"
8,0,0,1,4,mlb,0.0,4,10.0,0.269,2.0,10,2,1,3,0.2,0.286,0.549,0,183,1,0.0,9,0.0,MLB,0,0.0,39,7,0,0.263,2.0,1,5,6,1.8,0,1.0,112940,35,0,0.0,9.0,"Cunnane, Will"
9,38,40,7,3,mlb,112.0,204,705.0,0.289,238.0,553,117,1,153,0.237,0.366,0.672,7,486,77,72.0,385,1.0,MLB,19,4.0,1698,358,8,0.306,170.0,2,386,168,1.06,9,2.72,150433,1512,12,19.0,408.0,"Davis, Ben"


Some of the players have a lot of `NaN` values. Let's remove them so that we only deal with the players who have complete data.

In [None]:
# Keep only the rows (players) in which every column has a value.
df_no_na = df_converted.dropna(axis='index', how='any')

In [None]:
# Render the filtered table; rows that contained any NaN are no longer present.
df_no_na

Unnamed: 0,hr,gidp,sac,team_count,sport_code,hgnd,rbi,lob,babip,gidp_opp,tb,xbh,sport_id,bb,avg,slg,ops,hbp,g,d,hfly,so,wo,sport,sf,hpop,tpa,h,cs,obp,hldr,t,ao,r,go_ao,sb,ppa,player_id,ab,ibb,roe,go,name
0,14,14,6,2,mlb,16.0,55,77.0,0.29,28.0,178,34,1,25,0.238,0.371,0.649,2,173,18,12.0,121,0.0,MLB,0,0.0,513,114,0,0.278,12.0,2,35,46,0.97,2,0.0,110313,480,2,0.0,34.0,"Arias, George"
1,1,5,83,5,mlb,11.0,26,103.0,0.228,43.0,86,14,1,16,0.134,0.165,0.325,0,306,13,3.0,218,0.0,MLB,1,0.0,621,70,0,0.16,10.0,0,33,26,2.39,1,0.45,110351,521,0,3.0,79.0,"Ashby, Andy"
2,0,2,3,4,mlb,0.0,2,11.0,0.133,5.0,3,1,1,3,0.065,0.097,0.244,0,323,1,1.0,16,0.0,MLB,0,0.0,37,2,0,0.147,1.0,0,0,0,13.0,0,0.51,111145,31,0,0.0,13.0,"Boehringer, Brian"
3,2,2,54,6,mlb,7.0,29,128.0,0.201,54.0,80,12,1,19,0.129,0.162,0.331,5,452,10,6.0,187,0.0,MLB,2,1.0,575,64,0,0.169,15.0,0,65,20,1.62,0,0.75,111554,495,0,4.0,105.0,"Brown, Kevin"
4,0,0,0,4,mlb,0.0,0,0.0,0.125,1.0,2,1,1,0,0.1,0.2,0.3,0,106,1,0.0,2,0.0,MLB,0,0.0,10,1,0,0.1,0.0,0,0,0,1.0,0,0.0,111636,10,0,0.0,1.0,"Bruske, Jim"
5,239,152,14,4,mlb,72.0,983,382.0,0.297,150.0,2809,604,1,727,0.272,0.447,0.794,29,1760,348,57.0,1163,0.0,MLB,67,2.0,7125,1710,39,0.347,91.0,17,221,894,1.03,88,0.22,111908,6288,112,8.0,228.0,"Caminiti, Ken"
7,0,2,43,4,mlb,20.0,12,196.0,0.189,53.0,40,6,1,14,0.095,0.115,0.254,4,188,5,1.0,174,0.0,MLB,1,0.0,410,33,0,0.139,12.0,1,31,22,4.9,0,2.18,136725,348,0,10.0,152.0,"Clement, Matt"
8,0,0,1,4,mlb,0.0,4,10.0,0.269,2.0,10,2,1,3,0.2,0.286,0.549,0,183,1,0.0,9,0.0,MLB,0,0.0,39,7,0,0.263,2.0,1,5,6,1.8,0,1.0,112940,35,0,0.0,9.0,"Cunnane, Will"
9,38,40,7,3,mlb,112.0,204,705.0,0.289,238.0,553,117,1,153,0.237,0.366,0.672,7,486,77,72.0,385,1.0,MLB,19,4.0,1698,358,8,0.306,170.0,2,386,168,1.06,9,2.72,150433,1512,12,19.0,408.0,"Davis, Ben"
10,304,152,91,8,mlb,364.0,1167,1864.0,0.285,737.0,4157,877,1,844,0.271,0.442,0.775,53,2583,449,271.0,1299,6.0,MLB,75,4.0,10460,2548,118,0.332,479.0,124,1321,1443,0.9,320,1.25,114135,9397,69,30.0,1194.0,"Finley, Steve"


**Your Turn!**

Choose two stats to plot in a scatter plot. Then, run a linear regression on them using `linregress`.