In [1]:
%matplotlib inline
import numpy as np
import random
import pandas as pd
import seaborn as sbn
sbn.set()
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV



In [2]:
#!/usr/bin/python3
"""
This class is meant to take a .evn/a file and parse it into usable tables representing player's stats per game.
"""
#import sys


class Parser:

    def __init__(self, file):
        self.raw_file = open(file, 'r')
        self.games = {}

    def parse(self):
        game_id = ""
        old_id = ""

        for line in self.raw_file:

            items = line.split(",")

            if items[0] == "id":
                if old_id != "":
                    hruns = 0
                    aruns = 0

                    for player, playerdata in self.games[game_id].items():
                        if player != "info":
                            if playerdata["team"] == "home":
                                hruns += playerdata["rbi"]
                            else:
                                aruns += playerdata["rbi"]
                    self.games[old_id]["info"]["winner"] = "home" if hruns > aruns else "away"

                old_id = game_id
                game_id = items[1].rstrip()
                data = {}
                self.games[game_id] = data

            elif items[0] == "info":
                if len(self.games[game_id]) == 0:
                    self.games[game_id]["info"] = ({items[1]: items[2].rstrip()})
                else:
                    self.games[game_id]["info"][items[1]] = items[2].rstrip()

            elif items[0] == "start" or items[0] == "sub":
                position = "pitcher" if items[5] == "1" else "batter"
                team = "home" if items[3] == "1" else "away"
                playerdict = self.games[game_id][items[1]] = {}
                playerdict["pos"] = position
                playerdict["team"] = team
                playerdict["ab"] = 0
                playerdict["h"] = 0
                playerdict["w"] = 0
                playerdict["w"] = 0
                playerdict["rbi"] = 0
                playerdict["er"] = 0

            elif items[0] == "play":

                player = items[3]
                playerdict = self.games[game_id][player]
                play = items[6].rstrip()

                #S, D, and T coded this way to allow future calculation of SLG
                if play.startswith("S"):
                    playerdict["ab"] += 1
                    playerdict["h"] += 1
                elif play.startswith("D"):
                    playerdict["ab"] += 1
                    playerdict["h"] += 1
                elif play.startswith("T"):
                    playerdict["ab"] += 1
                    playerdict["h"] += 1
                elif play.startswith("HR"):
                    playerdict["ab"] += 1
                    playerdict["h"] += 1
                    playerdict["rbi"] += 1
                elif play.startswith("W"):
                    playerdict["w"] += 1
                else:
                    playerdict["ab"] += 1

                runner_data = play.split(";")
                for runner in runner_data:
                    if "-H" in runner:
                        playerdict["rbi"] += 1

            elif items[0] == "data":
                if items[1] == "er":
                    playerdict = self.games[game_id][items[2]]
                    if playerdict["er"] is None:
                        playerdict["er"] = 0
                    playerdict["er"] += int(items[3].rstrip())
            else:  # ignore
                pass

        return self.games

    def make_into_tables(self):
        features = []
        classifs = []
        #data format:
        #[AVGh|RBIh|OBPh|AVGa|RBIa|OBPa] -aggregate sums of numbers for both teams

        for game_id, game in self.games.items():

            homestats = [0] * 3
            awaystats = [0] * 3

            for player, playerdata in self.games[game_id].items():
                if player != "info":  # ignore the info entry
                    if playerdata["pos"] == "pitcher":
                        pass  # for now, later possibly add some pitcher stats like ERA, H, W, etc.
                    elif playerdata["pos"] == "batter":
                        avg = 0
                        if int(playerdata["ab"]) != 0:
                            avg = int(playerdata["h"]) / int(playerdata["ab"])
                        rbi = int(playerdata["rbi"])
                        obp = 0
                        if int(playerdata["ab"]) != 0:
                            obp = (int(playerdata["w"]) + int(playerdata["h"])) / int(playerdata["ab"])
                        if playerdata["team"] == "home":
                            homestats[0] += avg
                            homestats[1] += rbi
                            homestats[2] += obp
                        else:
                            awaystats[0] += avg
                            awaystats[1] += rbi
                            awaystats[2] += obp

            features.append(homestats + awaystats)
            try:
                classifs.append(1 if game["info"]["winner"] == "home" else 0)
            except KeyError:
                classifs.append(0)

        return features, classifs

In [5]:
parser1 = Parser("2013MIN.EVA")
parser1.parse()
trainx, trainy = parser1.make_into_tables()

parser2 = Parser("2014MIN.EVA")
parser2.parse()
testx, testy = parser2.make_into_tables()

In [7]:
clf = LinearSVC()
clf.fit(trainx, trainy)
pred = clf.predict(testx)
print(classification_report(testy,pred))

             precision    recall  f1-score   support

          0       0.56      0.79      0.65        47
          1       0.33      0.15      0.20        34

avg / total       0.47      0.52      0.47        81

