In [1]:
import urllib
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET

In [None]:
# Crawl the per-day miniscoreboard.xml pages and collect the
# `game_data_directory` attribute of every child element found on them.
all_game_directories = []

# The ranges deliberately over-generate dates (every month gets days 1-31);
# dates whose page request fails with an HTTP error are simply skipped below.
days = range(1, 32)
months = range(4, 12)
years = range(2016, 2019)

for year in years:
    for month in months:
        for day in days:
            games_for_day_string = "http://gd2.mlb.com/components/game/mlb/year_{}/month_{:02d}/day_{:02d}/miniscoreboard.xml".format(year, month, day)
            try:
                # `with` guarantees the HTTP response is closed even when
                # reading or parsing raises.
                with urllib.request.urlopen(games_for_day_string) as response:
                    games = ET.fromstring(response.read())
            except urllib.error.HTTPError:
                # Best-effort crawl: no scoreboard page for this date.
                continue

            for game in games:
                game_directory = game.get('game_data_directory')
                # Elements without the attribute would otherwise be stored as
                # None and later be formatted into an invalid URL.
                if game_directory is not None:
                    all_game_directories.append(game_directory)


In [None]:
# For every collected game directory, download its miniscoreboard.xml and keep
# the <winning_pitcher> / <losing_pitcher> elements found under <post_game>.
game_scoreboards = []
# (game_dir, exception) pairs for downloads/parses that failed, so failures
# are inspectable instead of being silently discarded.
failed_game_dirs = []

for game_dir in all_game_directories:
    game_dir_str = "http://gd2.mlb.com{}/miniscoreboard.xml".format(game_dir)
    try:
        # `with` closes the HTTP response even if reading or parsing raises.
        with urllib.request.urlopen(game_dir_str) as response:
            game = ET.fromstring(response.read())
    except Exception as exc:
        # Best-effort scrape: one bad game must not abort the whole run.
        # `Exception` (unlike a bare `except:`) still lets KeyboardInterrupt
        # stop this long-running loop.
        failed_game_dirs.append((game_dir, exc))
        continue

    # Only the winning / losing pitcher info is needed.
    # Compare against None explicitly: the truth value of an Element reflects
    # its number of children, not whether `find` located it.
    post_game = game.find('post_game')
    if post_game is None:
        continue

    winning_pitcher = post_game.find('winning_pitcher')
    losing_pitcher = post_game.find('losing_pitcher')
    # Skip games missing either pitcher; a None entry would break the later
    # `.get('er')` / `.get('ip')` lookups.
    if winning_pitcher is None or losing_pitcher is None:
        continue

    game_scoreboards.append((winning_pitcher, losing_pitcher))


In [None]:
import numpy as np
def era(ers, ip):
    """Return the earned-run average: earned runs per inning, scaled to 9 innings.

    Parameters
    ----------
    ers : number
        Earned runs allowed.
    ip : number
        Innings pitched.

    Returns
    -------
    number
        ``ers / ip * 9`` in the general case. With zero innings pitched the
        result is ``np.inf`` when runs were allowed and ``0`` when none were.
    """
    if ip == 0:
        # No innings recorded: the ratio is undefined, so pick the limit value.
        if ers > 0:
            return np.inf
        if ers == 0:
            return 0
    return (ers / ip) * 9

In [None]:
# Build parallel lists of (earned runs, innings pitched) tuples, one entry per
# game, for the winning and the losing pitcher respectively.
winning_pitcher_stats = []
losing_pitcher_stats = []


def _earned_runs_and_innings(pitcher):
    """Read the 'er' and 'ip' attributes of a pitcher element as floats (default 0)."""
    earned_runs = float(pitcher.get('er', 0))
    innings = float(pitcher.get('ip', 0))
    return earned_runs, innings


for winning_pitcher, losing_pitcher in game_scoreboards:
    # Parse both pitchers before appending so the two lists stay aligned even
    # if one of the conversions raises.
    winner_line = _earned_runs_and_innings(winning_pitcher)
    loser_line = _earned_runs_and_innings(losing_pitcher)

    winning_pitcher_stats.append(winner_line)
    losing_pitcher_stats.append(loser_line)

In [None]:
import pandas as pd

# Turn the lists of (er, ip) tuples into DataFrames with named columns.
# Note: this rebinds the same names from lists to DataFrames.
stat_columns = ['er', 'ip']
winning_pitcher_stats = pd.DataFrame(data=winning_pitcher_stats, columns=stat_columns)
losing_pitcher_stats = pd.DataFrame(data=losing_pitcher_stats, columns=stat_columns)

In [None]:
def _era_per_row(stats):
    """Return a list with era(er, ip) evaluated for every row of `stats`."""
    return [era(record['er'], record['ip']) for _, record in stats.iterrows()]


# Add the per-game ERA as a new 'era' column on both frames.
winning_pitcher_stats['era'] = _era_per_row(winning_pitcher_stats)
losing_pitcher_stats['era'] = _era_per_row(losing_pitcher_stats)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# 3x3 grid of overlaid ERA histograms (winning vs. losing pitchers). Panel k
# only keeps rows with more than k innings pitched, k = 0..8.
fig, axes = plt.subplots(3, 3)

# Distinct names for the Axes array and the per-panel Axes avoid rebinding the
# array variable while iterating over it.
for min_ip, panel in enumerate(axes.flatten()):

    winning_data = winning_pitcher_stats['era'][winning_pitcher_stats['ip'] > min_ip]
    losing_data = losing_pitcher_stats['era'][losing_pitcher_stats['ip'] > min_ip]

    # Use one set of bin edges for both groups: with independently computed
    # edges the bar widths differ and the overlaid counts are not comparable.
    bin_edges = np.histogram_bin_edges(
        np.concatenate([winning_data, losing_data]), bins=100
    )

    panel.hist(winning_data, bins=bin_edges, alpha=0.5, label="winning", color='blue')
    panel.hist(losing_data, bins=bin_edges, alpha=0.5, label="losing", color='red')
    # Without a title the nine panels cannot be told apart.
    panel.set_title("IP > {}".format(min_ip))

# The `label=` values are only shown once a legend is drawn; one legend is
# enough because every panel uses the same colours.
axes.flat[0].legend()
fig.tight_layout()

plt.show()

In [None]:
# Show the losing pitchers' ERA values in ascending order.
sorted_losing_era = losing_pitcher_stats['era'].sort_values()
print(sorted_losing_era)

In [None]:
import math

In [None]:
# Single overlaid ERA histogram (winning vs. losing pitchers), restricted to
# rows with more than 3 innings pitched.
fig, ax = plt.subplots(1, 1)

winning_data = winning_pitcher_stats['era'][winning_pitcher_stats['ip'] > 3]
losing_data = losing_pitcher_stats['era'][losing_pitcher_stats['ip'] > 3]

# Use one set of bin edges for both groups: with independently computed edges
# the bar widths differ and the overlaid counts are not comparable.
bin_edges = np.histogram_bin_edges(
    np.concatenate([winning_data, losing_data]), bins=20
)

ax.hist(winning_data, bins=bin_edges, alpha=0.8, label="winning", color='blue')
ax.hist(losing_data, bins=bin_edges, alpha=0.5, label="losing", color='red')

ax.set_title("ERA of winning vs. losing pitchers (IP > 3)")
ax.set_xlabel("ERA")
ax.set_ylabel("Number of games")
# The `label=` values are only shown once a legend is drawn.
ax.legend()

plt.show()