In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm_notebook
from time import sleep
import pandas as pd
import numpy as np
import re
import math
import datetime

In [2]:
# Location of the ChromeDriver binary (a relative path, so it is resolved
# against the current working directory).
driver_path = "drivers/chromedriver.exe"
# Start the Chrome session that the scraping cells drive via `driver.get(...)`.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# Service object — confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=driver_path)

# STATIZ에서 공격지표 크롤링

In [3]:
# Scrape one season page per year from STATIZ and collect team batting
# counting stats. Each parsed table row appends exactly one value to every
# list below, so the lists stay index-aligned.
team_list, year_list = [], []
run_list, bbox_list, atbat_list = [], [], []
hit_list, twoh_list, thrh_list, hr_list = [], [], [], []
bb_list, hp_list, ibb_list = [], [], []
sf_list, tb_list = [], []

# Every table row is flattened into this many consecutive <td> cells.
batting_cells_per_row = 31

# (cell offset inside a row, destination list) for the integer columns,
# listed in the order the values are appended.
batting_int_columns = [
    (7, run_list),
    (5, bbox_list),
    (6, atbat_list),
    (8, hit_list),
    (9, twoh_list),
    (10, thrh_list),
    (11, hr_list),
    (16, bb_list),
    (17, hp_list),
    (18, ibb_list),
    (22, sf_list),
    (12, tb_list),
]

batting_url = "http://www.statiz.co.kr/stat.php?opt=0&sopt=0&re=0&ys={year}&ye={year}&se=0&te=&tm=&ty=0&qu=auto&po=0&as=&ae=&hi=&un=&pl=&da=1&o1=WAR_ALL_ADJ&o2=TPA&de=1&lr=5&tr=&cv=&ml=1&sn=30&si=&cn="

for year in tqdm_notebook(range(1982, 2019)):
    driver.get(batting_url.format(year=year))
    page = BeautifulSoup(driver.page_source, "lxml")
    # The stats live in the second <tbody> of the page.
    cells = page.find_all("tbody")[1].find_all("td")
    row_count = len(cells) // batting_cells_per_row
    # Row 0 is skipped — presumably a header/summary row; TODO confirm.
    for row in range(1, row_count):
        base = row * batting_cells_per_row
        # The team name is read straight from the cell text; the numeric
        # cells wrap their value in a <span>.
        team_list.append(cells[base + 1].get_text())
        year_list.append(year)
        for offset, target in batting_int_columns:
            target.append(int(cells[base + offset].find("span").get_text()))
        

HBox(children=(IntProgress(value=0, max=37), HTML(value='')))




# DATAFRAME 만들기

In [4]:
# Assemble the scraped batting lists into one table. Column order is the
# order of `batting_column_names`.
batting_column_names = [
    'year', 'team', 'bbox', 'at_bat', 'total_hit', '2b_hit', '3b_hit',
    'hr', 'bb', 'hp', 'ibb', 'sf', 'tb', 'run',
]
batting_column_values = [
    year_list, team_list, bbox_list, atbat_list, hit_list, twoh_list, thrh_list,
    hr_list, bb_list, hp_list, ibb_list, sf_list, tb_list, run_list,
]
df = pd.DataFrame(dict(zip(batting_column_names, batting_column_values)))

In [5]:
# Preview the first rows of the scraped batting table as a sanity check.
df.head()

Unnamed: 0,year,team,bbox,at_bat,total_hit,2b_hit,3b_hit,hr,bb,hp,ibb,sf,tb,run
0,1982,MBC,3061,2686,757,124,12,65,268,47,20,27,1100,419
1,1982,삼성,3043,2647,705,126,18,57,307,30,2,18,1038,429
2,1982,OB,3098,2745,778,137,23,57,247,41,22,18,1132,399
3,1982,해태,2990,2665,696,110,14,84,235,41,12,21,1086,374
4,1982,롯데,3062,2628,674,112,8,59,326,40,8,27,979,353


# 1루타, 타율, 출루율, 장타율, OPS 계산

In [6]:
# Derive singles and the rate stats from the raw counting columns; each
# result is stored as a new column on `df`.
hits = df['total_hit']
at_bats = df['at_bat']
walks = df['bb']
hit_by_pitch = df['hp']

# Singles: all hits minus doubles, triples and home runs.
df['1b_hit'] = hits - df['2b_hit'] - df['3b_hit'] - df['hr']

# Batting average: hits per at-bat.
df['average'] = hits / at_bats

# On-base percentage: (H + BB + HBP) / (AB + BB + HBP + SF).
times_on_base = hits + walks + hit_by_pitch
on_base_chances = at_bats + walks + hit_by_pitch + df['sf']
df['obp'] = times_on_base / on_base_chances

# Slugging: total bases per at-bat; OPS is the sum of the two rates.
df['slg'] = df['tb'] / at_bats
df['ops'] = df['obp'] + df['slg']

In [7]:
# Preview again to check the derived columns (1b_hit, average, obp, slg, ops).
df.head()

Unnamed: 0,year,team,bbox,at_bat,total_hit,2b_hit,3b_hit,hr,bb,hp,ibb,sf,tb,run,1b_hit,average,obp,slg,ops
0,1982,MBC,3061,2686,757,124,12,65,268,47,20,27,1100,419,556,0.281832,0.354029,0.409531,0.76356
1,1982,삼성,3043,2647,705,126,18,57,307,30,2,18,1038,429,504,0.266339,0.347102,0.392142,0.739244
2,1982,OB,3098,2745,778,137,23,57,247,41,22,18,1132,399,561,0.283424,0.349394,0.412386,0.76178
3,1982,해태,2990,2665,696,110,14,84,235,41,12,21,1086,374,488,0.261163,0.328157,0.407505,0.735661
4,1982,롯데,3062,2628,674,112,8,59,326,40,8,27,979,353,495,0.256469,0.344257,0.372527,0.716784


# STATIZ에서 wOBA, wRC 크롤링

In [8]:
# Scrape one season page per year from STATIZ and collect team wOBA / wRC
# values. Each parsed table row appends one value to every list below, so
# the lists stay index-aligned.
team_list2 = []
year_list2 = []
woba_list = []
wrc_list = []
wobaj_list = []
wrcj_list = []

# Every table row on this page is flattened into this many <td> cells.
woba_cells_per_row = 23

# (cell offset inside a row, destination list) for the float columns.
# NOTE(review): offsets 18/19 end up in the `woba_pf` / `wrc_pf` columns —
# presumably the park-factor-adjusted variants; confirm against the site.
woba_float_columns = [
    (14, woba_list),
    (15, wrc_list),
    (18, wobaj_list),
    (19, wrcj_list),
]

for year in tqdm_notebook(range(1982, 2019)):
    # FIX: `ye` was hard-coded to 1982 while `ys` followed the loop year, so
    # every season after 1982 requested a wrong range. Both bounds now track
    # `year`, matching the batting-stats request (`ys`/`ye` are presumably the
    # start/end season of the query).
    url = "http://www.statiz.co.kr/stat.php?opt=0&sopt=0&re=0&ys={year}&ye={year}&se=0&te=&tm=&ty=0&qu=auto&po=0&as=&ae=&hi=&un=&pl=&da=2&o1=WRCPLUS&de=1&o2=WAR_ALL&lr=5&tr=&cv=&ml=1&sn=30&si=&cn="
    driver.get(url.format(year=year))
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    # The stats live in the second <tbody> of the page.
    soup_rcd = soup.find_all("tbody")[1].find_all("td")
    # Row 0 is skipped — presumably a header/summary row; TODO confirm.
    for i in range(1, len(soup_rcd) // woba_cells_per_row):
        base = i * woba_cells_per_row
        # The team name is read straight from the cell text; the numeric
        # cells wrap their value in a <span>.
        team_list2.append(soup_rcd[base + 1].get_text())
        year_list2.append(year)
        for offset, target in woba_float_columns:
            target.append(float(soup_rcd[base + offset].find("span").get_text()))

HBox(children=(IntProgress(value=0, max=37), HTML(value='')))




# DATAFRAME 만들기

In [9]:
# Assemble the scraped wOBA / wRC lists into one table. Column order is the
# order of `woba_column_names`.
woba_column_names = ['team', 'year', 'woba', 'wrc', 'woba_pf', 'wrc_pf']
woba_column_values = [team_list2, year_list2, woba_list, wrc_list, wobaj_list, wrcj_list]
df2 = pd.DataFrame(dict(zip(woba_column_names, woba_column_values)))

In [10]:
# Preview the first rows of the wOBA / wRC table as a sanity check.
df2.head()

Unnamed: 0,team,year,woba,wrc,woba_pf,wrc_pf
0,MBC,1982,0.346,419.8,0.338,419.0
1,OB,1982,0.343,416.7,0.341,416.6
2,삼성,1982,0.338,397.2,0.33,396.8
3,해태,1982,0.332,376.1,0.334,376.2
4,롯데,1982,0.329,377.8,0.321,378.0


# DATAFRAME 합치기

In [11]:
# Join the batting table with the wOBA / wRC table on the two columns they
# share, keeping only (year, team) pairs present in both.
df_last = df.merge(df2, how="inner", on=["year", "team"])

In [12]:
# Preview the merged table to check that both sets of columns are present.
df_last.head()

Unnamed: 0,year,team,bbox,at_bat,total_hit,2b_hit,3b_hit,hr,bb,hp,...,run,1b_hit,average,obp,slg,ops,woba,wrc,woba_pf,wrc_pf
0,1982,MBC,3061,2686,757,124,12,65,268,47,...,419,556,0.281832,0.354029,0.409531,0.76356,0.346,419.8,0.338,419.0
1,1982,삼성,3043,2647,705,126,18,57,307,30,...,429,504,0.266339,0.347102,0.392142,0.739244,0.338,397.2,0.33,396.8
2,1982,OB,3098,2745,778,137,23,57,247,41,...,399,561,0.283424,0.349394,0.412386,0.76178,0.343,416.7,0.341,416.6
3,1982,해태,2990,2665,696,110,14,84,235,41,...,374,488,0.261163,0.328157,0.407505,0.735661,0.332,376.1,0.334,376.2
4,1982,롯데,3062,2628,674,112,8,59,326,40,...,353,495,0.256469,0.344257,0.372527,0.716784,0.329,377.8,0.321,378.0


# CSV로 저장

In [13]:
# Write the merged table to team_stat.csv in the current working directory.
# `index=False` is not passed, so the row index is written as an unnamed
# first column (pandas default).
df_last.to_csv("team_stat.csv")