In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm_notebook
from time import sleep
import pandas as pd
import numpy as np
import re
import math
import datetime

In [2]:
driver_path = "drivers/chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)

# STATIZ에서 공격지표 크롤링

In [3]:
team_o = []
run_o = []
year_o = []

for year in tqdm_notebook(range(1982, 2020)):
    url = "http://www.statiz.co.kr/stat.php?opt=0&sopt=0&re=0&ys={year}&ye={year}&se=0&te=&tm=&ty=0&qu=auto&po=0&as=&ae=&hi=&un=&pl=&da=1&o1=WAR_ALL_ADJ&o2=TPA&de=1&lr=5&tr=&cv=&ml=1&sn=30&si=&cn="
    driver.get(url.format(year=year))
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    soup_rcd = soup.find_all("tbody")[1].find_all("td")
    for i in range(1,len(soup_rcd)//31):
        count_t = i*31+1
        count_r = i*31+7
        team_o.append(soup_rcd[count_t].get_text())
        run_o.append(int(soup_rcd[count_r].find("span").get_text()))
        year_o.append(year)

HBox(children=(IntProgress(value=0, max=38), HTML(value='')))




# DATAFRAME 만들기

In [4]:
df_o = pd.DataFrame({'year':year_o, 'team':team_o, 'run_o':run_o})

# STATIZ에서 수비지표 크롤링

In [5]:
year_d = []
team_d = []
run_d = []

for year in tqdm_notebook(range(1982, 2020)):
    url = "http://www.statiz.co.kr/stat.php?opt=0&sopt=0&re=1&ys={year}&ye={year}&se=0&te=&tm=&ty=0&qu=auto&po=0&as=&ae=&hi=&un=&pl=&da=1&o1=WAR&o2=OutCount&de=1&lr=5&tr=&cv=&ml=1&sn=30&si=&cn="
    driver.get(url.format(year=year))
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    soup_rcd = soup.find_all("tbody")[1].find_all("td")
    for i in range(1, len(soup_rcd)//33):
        count_t = i*33+1
        count_r = i*33+13
        year_d.append(year)
        team_d.append(soup_rcd[count_t].get_text())
        run_d.append(int(soup_rcd[count_r].find("span").get_text()))

HBox(children=(IntProgress(value=0, max=38), HTML(value='')))




# DATAFRAME 만들기

In [6]:
df_d = pd.DataFrame({"year":year_d, "team": team_d, "run_d":run_d})

# KBO 홈페이지에서 승패기록 크롤링

In [7]:
year_list = [1980, 1990, 2000, 2010]
year_wl = []
team_wl = []
win = []
lose = []

for year in tqdm_notebook(year_list):
    url = "https://www.koreabaseball.com/Record/History/Team/Record.aspx?startYear={year}&halfSc=T"
    driver.get(url.format(year=year))
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    soup_rcd = soup.find_all("table")
    for i in range(len(soup_rcd)):
        for j in range(len(soup_rcd[i].find_all("th", scope="row"))):
            year_wl.append(int(soup_rcd[i].find_all("th", scope="col")[0].get_text()))
            team_wl.append(soup_rcd[i].find_all("th", scope="row")[j].get_text())
            win.append(int(soup_rcd[i].find_all("td")[j*7+1].get_text()))
            lose.append(int(soup_rcd[i].find_all("td")[j*7+2].get_text()))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




# DATAFRAME 만들기

In [8]:
df_wl = pd.DataFrame({"year":year_wl, "team":team_wl, "win":win, "lose":lose})

# 데이터 전처리

In [9]:
df_wl["team"] = df_wl["team"].apply(lambda e: e.split(" ")[0])
df_o = df_o.replace({"삼미/청보":"청보", "해태/KIA":"KIA", "kt":"KT"})
df_d = df_d.replace({"삼미/청보":"청보", "해태/KIA":"KIA", "kt":"KT"})
df_wl = df_wl.replace({"":"히어로즈", "우리":"히어로즈"})

# DATAFRAME 합치기

In [10]:
df_1 = pd.merge(df_o, df_d)
df_last = pd.merge(df_1, df_wl)

# 승률 및 피타고리안 승률 계산/추가

In [11]:
df_last["winning_rate"] = df_last["win"]/(df_last["win"]+df_last["lose"])
df_last["Pythagorean"] = (df_last["run_o"]^2)/((df_last["run_o"]^2) + (df_last["run_d"]^2))

In [12]:
df_last

Unnamed: 0,year,team,run_o,run_d,win,lose,winning_rate,Pythagorean
0,1982,MBC,419,350,46,34,0.575000,0.545098
1,1982,삼성,429,257,54,26,0.675000,0.624638
2,1982,OB,399,318,56,24,0.700000,0.556802
3,1982,해태,374,388,38,42,0.475000,0.488189
4,1982,롯데,353,385,31,49,0.387500,0.478437
5,1982,삼미,302,574,15,65,0.187500,0.344037
6,1983,삼성,448,418,46,50,0.479167,0.519630
7,1983,해태,423,390,55,44,0.555556,0.520396
8,1983,MBC,405,335,55,43,0.561224,0.550000
9,1983,OB,418,432,44,55,0.444444,0.489412


# CSV로 저장

In [13]:
df_last.to_csv("winning_rate_KBO.csv", index = False)