In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm_notebook
from time import sleep
import pandas as pd
import numpy as np
import re
import math
import datetime

In [2]:
# Relative path to the ChromeDriver binary used to start the browser.
driver_path = "driver/chromedriver.exe"
# Start the Chrome session; the scraping cells drive it through `driver.get(...)`.
# NOTE(review): `executable_path` is the Selenium 3 style keyword; newer Selenium
# releases expect a Service object instead — confirm the installed version.
driver = webdriver.Chrome(executable_path=driver_path)

# 네이버 url 수집

In [3]:
# Collect the game-result links from the 2018 monthly schedule pages.
months = ['03', '04', '05', '06', '07', '08', '09', '10']
url_schedule = 'https://sports.news.naver.com/kbaseball/schedule/index.nhn?month={month}&year=2018'

schedule_links = []
for month in tqdm_notebook(months):
    driver.get(url_schedule.format(month=month))
    html_sch = driver.page_source
    soup_sch = BeautifulSoup(html_sch, "lxml")
    # href=True skips anchors that carry no href attribute, which would
    # otherwise raise KeyError when the attribute is read.
    game_url = soup_sch.find("div", "tb_wrap").find_all("a", href=True)
    schedule_links.extend(anchor['href'] for anchor in game_url)

# Keep only result pages whose URL sorts strictly between the two bounds below.
# The comparison is plain lexicographic string ordering, exactly as before.
# NOTE(review): the lower bound presumably excludes games up to 2018-03-21
# (pre-season?) and the upper bound looks like an "anything later" sentinel — confirm.
lower_bound = '/gameCenter/gameResult.nhn?category=kbo&gameId=20180321LGWO02018'
upper_bound = '/gameCenter/gameResult.nhn?category=kbo&gameId=33331019WOHH02018'
url_list = [
    href for href in schedule_links
    if 'gameResult' in href and lower_bound < href < upper_bound
]

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




# 네이버 기록 수집

In [4]:
# Parallel lists: one entry per batted-ball marker found on the result pages.
table = []      # text of each marker's inner <div>
loc = []        # raw inline style of each marker (parsed into X / Y later)
teams_H = []    # batting team code for the marker
teams_P = []    # pitching team code for the marker
date_list = []  # eight-digit date taken from the game URL

url_page = 'https://sports.news.naver.com{url}'

for url in tqdm_notebook(url_list):
    driver.get(url_page.format(url=url))
    html = driver.page_source  # HTML of the page the driver is currently showing
    soup = BeautifulSoup(html, "lxml")
    view = soup.find("div", "ground_area")
    hit = view.find_all('div', 'ico hit')
    out = view.find_all('div', 'ico out')
    hrun = view.find_all('div', 'ico run')
    # Two-letter upper-case codes found in the URL, in order of appearance.
    # NOTE(review): presumably [away, home] from the gameId — confirm.
    ha = re.findall('[A-Z][A-Z]', url)

    # Same ordering as before: all hits, then all outs, then all runs.
    events = hit + out + hrun
    for event in events:
        table.append(event.find('div').get_text())
        loc.append(event.attrs['style'])
        # An id starting with "home_" marks ha[1] as batting and ha[0] as pitching;
        # any other prefix swaps the two.
        if event.attrs['id'].split('_')[0] == 'home':
            teams_H.append(ha[1])
            teams_P.append(ha[0])
        else:
            teams_H.append(ha[0])
            teams_P.append(ha[1])

    if events:
        # Raw string: '\d' inside a normal literal is an invalid escape sequence.
        # Only evaluated when the page has markers, as in the original loop.
        game_date = re.search(r'\d\d\d\d\d\d\d\d', url).group()
        date_list.extend([game_date] * len(events))

    sleep(5/10)  # short pause between page loads
    

HBox(children=(IntProgress(value=0, max=720), HTML(value='')))




# 수집 데이터 전처리

In [5]:
# Assemble the scraped lists into one frame, one row per marker.
df = pd.DataFrame({
    'date': date_list,
    'batter_team': teams_H,
    'pitcher_team': teams_P,
    'table': table,
    'loc': loc,
})


def _field(series, sep, index):
    """Split every string in `series` on `sep` and keep the piece at `index`."""
    return series.apply(lambda text: text.split(sep)[index])


def _pixels(series, prefix):
    """Remove `prefix`, 'px' and spaces from each string, then cast to float64."""
    cleaned = series.str.replace(prefix, '').str.replace('px', '').str.replace(' ', '')
    return cleaned.astype(np.float64)


# The style string is split on ';': first piece holds 'left:', second holds 'top:'.
df['X'] = _pixels(_field(df['loc'], ';', 0), 'left:')
df['Y'] = _pixels(_field(df['loc'], ';', 1), 'top:')

# The marker text is split on '(': the part before it is space-separated into
# inning / batter / result, the part after it is split on ')' into cause / pitcher.
before_paren = _field(df['table'], '(', 0)
after_paren = _field(df['table'], '(', 1)
df['inning'] = _field(before_paren, ' ', 0)
df['batter'] = _field(before_paren, ' ', 1)
df['result'] = _field(before_paren, ' ', 2)
df['cause'] = _field(after_paren, ')', 0)
df['pitcher'] = (
    _field(after_paren, ')', 1)
    .str.replace('상대투수-', '')
    .str.replace(' ', '')
)
df = df.drop(columns=['loc', 'table'])

# Final column order, with the listed team codes swapped for the team names.
record_columns = ['date', 'inning', 'batter_team', 'batter', 'pitcher_team', 'pitcher', 'result', 'cause', 'X', 'Y']
team_names = {'WO':'넥센', 'HH':'한화', 'HT':'KIA', 'OB':'두산', 'LT':'롯데', 'SS':'삼성', 'KT':'kt'}
record = df[record_columns].replace(team_names)

# 필요없는 데이터 정제

In [6]:
# Drop rows whose cause mentions a strikeout (삼진), a dropped third strike (낫 아웃)
# or an intentional walk (고의4구). `.eq(False)` keeps the original `== False`
# semantics: a missing cause compares as not-False and is dropped as well.
excluded_cause = record.cause.str.contains('삼진|낫 아웃|고의4구')
record = record[excluded_cause.eq(False)].reset_index(drop=True)

# Relabel rows whose cause mentions reaching base (출루) as '범타'.
# Use .loc on the freshly re-indexed frame instead of chained assignment
# (`record.result[mask] = ...`), which triggers SettingWithCopyWarning and
# does not write through when pandas copy-on-write is active.
reached_base = record.cause.str.contains('출루').eq(True)
record.loc[reached_base, 'result'] = '범타'

# 타구 방향 데이터 추가

In [7]:
def makeAngle(x, y, origin_x=216.5, origin_y=311):
    """Return the direction of point (x, y) seen from a reference point, in degrees.

    The vertical difference is negated before the arctangent is taken, so a
    smaller `y` counts as "up". The arctangent (in degrees, between -180 and
    180) is then subtracted from 180, which puts the result in the range 0..360.

    Parameters
    ----------
    x, y : number
        Coordinates of the point.
    origin_x, origin_y : number, optional
        Reference point. The defaults (216.5, 311) are the constants that were
        previously hard-coded.
        NOTE(review): presumably the home-plate pixel position — confirm.

    Returns
    -------
    float
        Angle in degrees.
    """
    atan = math.atan2(-(y - origin_y), (x - origin_x))
    return 180 - math.degrees(atan)

In [8]:
# One angle per row, in row order. Iterating the columns positionally (instead of
# looking up `record.X[i]` by label) does not depend on the index being 0..n-1,
# and matches how the list is later attached to the frame: by position.
angle_list = [makeAngle(x, y) for x, y in zip(record.X, record.Y)]

In [9]:
# Attach the computed angles as a new column and pin the final column order.
final_columns = [
    'date', 'inning',
    'batter_team', 'batter',
    'pitcher_team', 'pitcher',
    'result', 'cause',
    'X', 'Y', 'angle',
]
record = record.assign(angle=angle_list)[final_columns]

In [10]:
# Preview the first rows of `record`, now including the `angle` column.
record.head()

Unnamed: 0,date,inning,batter_team,batter,pitcher_team,pitcher,result,cause,X,Y,angle
0,20180324,2회말,넥센,김하성,한화,샘슨,안타,좌중간 1루타,199.36,159.5,83.54527
1,20180324,3회말,넥센,박병호,한화,샘슨,안타,좌익수 앞 1루타,150.37,181.25,62.993314
2,20180324,4회말,넥센,고종욱,한화,샘슨,안타,우익수 앞 1루타,300.18,174.5,121.509975
3,20180324,4회말,넥센,이정후,한화,샘슨,안타,좌중간 2루타,148.95,100.25,72.228205
4,20180324,4회말,넥센,박동원,한화,샘슨,안타,중견수 왼쪽 1루타,207.17,131.0,87.032824


# CSV로 저장

In [11]:
# Write `record` to disk; index=False leaves the row index out of the file.
record.to_csv('2018naver.csv', index=False)

# 경기일 리스트 생성

In [12]:
# Distinct game dates in order of first appearance, re-indexed from 0.
date_u = pd.Series(date_list).drop_duplicates().reset_index(drop=True)

def changedate(value):
    """Convert a 'YYYYMMDD' date string to 'YYYY-MM-DD'.

    `strptime` raises ValueError when `value` does not match '%Y%m%d'.
    """
    parsed = datetime.datetime.strptime(value, '%Y%m%d')
    return parsed.strftime('%Y-%m-%d')

# Game dates rewritten as 'YYYY-MM-DD', one per distinct date.
date_ulist = date_u.map(changedate)

# 스탯티즈 url 수집

In [13]:
# Visit the Statiz box-score page of every game date and collect one link per box.
url_list_statiz = []
url_url = 'http://www.statiz.co.kr/boxscore.php?opt=1&date={date}'

for date in tqdm_notebook(date_ulist):
    driver.get(url_url.format(date=date))
    html_url = driver.page_source
    soup_url = BeautifulSoup(html_url, "lxml")
    soup_urls = soup_url.find_all('div', 'box')
    # The first two 'box' divs are skipped; from each remaining one take the
    # href of its second-to-last anchor.
    for box in soup_urls[2:]:
        anchors = box.find_all('a')
        url_list_statiz.append(anchors[-2].attrs['href'])

# Keep only the links that contain '=5'.
url_listS = pd.Series(url_list_statiz)
has_five = url_listS.str.contains('=5')
url_list_statiz = list(url_listS[has_five])

HBox(children=(IntProgress(value=0, max=155), HTML(value='')))




# 스탯티즈 경기 로그 수집

In [14]:
# Parallel lists: one entry per log row collected from the Statiz game pages.
inning_list = []
pitcher = []
batter = []
ballcnt = []
result = []
out = []
runner = []
score = []
batter_team = []
pitcher_team = []
gamedate_list = []

url_page = 'http://www.statiz.co.kr/{url}'
team_name_pattern = '[ㄱ-ㅎ|ㅏ-ㅣ|가-힣]+|[a-zA-Z]+'

for url in tqdm_notebook(url_list_statiz):
    inning = []  # inning labels of this game only; needed to fill blank cells
    driver.get(url_page.format(url=url))
    html = driver.page_source  # HTML of the page the driver is currently showing
    soup = BeautifulSoup(html, "lxml")

    # Third 'table table-striped' table on the page. Rows are gathered as all
    # 'oddrow_stz' rows followed by all 'evenrow_stz' rows (not page order).
    log_table = soup.find_all('table', 'table table-striped')[2]
    lists = log_table.find_all('tr', 'oddrow_stz') + log_table.find_all('tr', 'evenrow_stz')

    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    day = re.search(r'\d\d\d\d-\d\d-\d\d', url)[0]
    gamedate = day.replace("-", "")

    # Team names are the first Hangul/Latin run in the 2nd and 3rd <h3> headings.
    headings = soup.find_all('h3')
    away = headings[1].get_text()
    awayteam = re.findall(team_name_pattern, away)[0]
    home = headings[2].get_text()
    hometeam = re.findall(team_name_pattern, home)[0]

    # Single pass per row instead of one pass per column.
    for row in lists:
        cells = row.find_all('td')

        # A blank inning cell inherits the label of the previous collected row.
        inning_label = cells[0].get_text()
        if inning_label == "":
            inning_label = inning[-1]
        inning.append(inning_label)

        pitcher.append(cells[1].get_text())
        batter.append(cells[2].find('a').get_text())
        ballcnt.append(cells[3].get_text())
        result.append(cells[4].get_text())

        # Situation cell, whitespace separated: the first token goes to `out`,
        # the last to `score`, and the second to `runner` when there are more
        # than two tokens.
        situation = cells[5].get_text().split()
        out.append(situation[0] if situation else '')
        runner.append(situation[1] if len(situation) > 2 else '')
        score.append(situation[-1] if situation else '')

        gamedate_list.append(gamedate)

        # First Hangul character of the inning label: '초' means the away team
        # bats; anything else means the home team bats.
        if re.search('[ㄱ-ㅎ|ㅏ-ㅣ|가-힣]', inning_label).group() == '초':
            batter_team.append(awayteam)
            pitcher_team.append(hometeam)
        else:
            batter_team.append(hometeam)
            pitcher_team.append(awayteam)

    inning_list.extend(inning)  # in place; avoids rebuilding the list per game

    sleep(2/10)  # short pause between page loads

HBox(children=(IntProgress(value=0, max=720), HTML(value='')))




# 수집 데이터 전처리

In [15]:
# Assemble the scraped Statiz lists into one frame, one row per log row.
record_statiz = pd.DataFrame({'date':gamedate_list,
                              'inning':inning_list,
                              'batter_team': batter_team,
                              'batter':batter,
                              'pitcher_team':pitcher_team,
                              'pitcher':pitcher,
                              'ball_count':ballcnt,
                              'score':score,
                              'out':out,
                              'runner':runner,
                              'results':result})

# Drop rows with an empty ball count. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not act on a slice.
record_statiz = record_statiz[record_statiz.ball_count != ''].copy()

# Rewrite inning labels by replacing '초' with '회초' and '말' with '회말'.
for old, new in (('초', '회초'), ('말', '회말')):
    record_statiz['inning'] = record_statiz.inning.str.replace(old, new)

# Replace 'KT' with 'kt' in both team columns.
for team_column in ('batter_team', 'pitcher_team'):
    record_statiz[team_column] = record_statiz[team_column].str.replace('KT', 'kt')

# Player-name substitutions, applied in the listed order.
# NOTE(review): presumably players listed under a former name here — confirm.
pitcher_renames = (
    ('김용성', '김건국'),
    ('민성기', '민태호'),
    ('김정훈', '김건태'),
    ('최동현', '최원준'),
    ('현기형', '현도훈'),
    ('이영재', '이우찬'),
)
batter_renames = (
    ('오승택', '오태곤'),
    ('윤여운', '윤수강'),
    ('배병옥', '배정대'),
    ('김사연', '김지열'),
    ('김동명', '김동욱'),
)
for old, new in pitcher_renames:
    record_statiz['pitcher'] = record_statiz.pitcher.str.replace(old, new)
for old, new in batter_renames:
    record_statiz['batter'] = record_statiz.batter.str.replace(old, new)

# Fill an empty runner field with '주자없음' (no runners). Uses .loc instead of
# chained assignment (`record_statiz.runner[mask] = ...`), which raises
# SettingWithCopyWarning and does not write through under copy-on-write.
record_statiz.loc[record_statiz.runner == "", 'runner'] = '주자없음'

# 필요없는 데이터 삭제

In [16]:
# Keep only the rows whose `results` text matches none of the listed outcomes.
excluded_results = '삼진|볼넷|몸에 맞는 볼|고의4구|스트라이크낫아웃 출루|쓰리번트'
is_excluded = record_statiz.results.str.contains(excluded_results)
record_statiz = record_statiz[is_excluded.eq(False)]

In [17]:
# Preview the first rows of the filtered Statiz log.
record_statiz.head()

Unnamed: 0,date,inning,batter_team,batter,pitcher_team,pitcher,ball_count,score,out,runner,results
1,20180324,1회초,삼성,김상수,두산,린드블럼,3(1-1),0:0,1사,주자없음,2루수 땅볼
3,20180324,1회초,삼성,러프,두산,린드블럼,2(1-0),0:0,2사,1루,좌익수 2루타
4,20180324,1회초,삼성,강민호,두산,린드블럼,1(0-0),1:0,2사,2루,포수 파울 뜬공
5,20180324,2회초,삼성,이원석,두산,린드블럼,3(1-1),1:0,무사,주자없음,중견수 뜬공
7,20180324,2회초,삼성,김헌곤,두산,린드블럼,8(2-3),1:0,2사,주자없음,2루수 안타


# CSV로 저장

In [18]:
# Write `record_statiz` to disk; index=False leaves the row index out of the file.
record_statiz.to_csv('2018statiz.csv', index=False)

# CSV 불러오기

In [19]:
# Reload both tables from the CSV files written by the earlier cells, so the
# join below works from the saved data rather than the in-memory frames.
naver = pd.read_csv("2018naver.csv")
statiz = pd.read_csv("2018statiz.csv")

# 데이터 조인

In [20]:
# Outer join of the two tables. The keys are spelled out instead of relying on
# pandas' default of "every column name the frames share": these six are the
# columns both CSVs have in common, so the result is unchanged, but a column
# later added to both files can no longer silently change the join.
merge_keys = ['date', 'inning', 'batter_team', 'batter', 'pitcher_team', 'pitcher']
combine = pd.merge(naver, statiz, how='outer', on=merge_keys)

# 컬럼 정렬

In [21]:
# Select and order the columns of the merged table; any column not listed
# here (e.g. `results`) is left out.
combined_columns = [
    'date', 'inning', 'score', 'out', 'runner',
    'batter_team', 'batter', 'pitcher_team', 'pitcher',
    'ball_count', 'result', 'cause',
    'X', 'Y', 'angle',
]
combine = combine[combined_columns]

In [22]:
# Preview the first rows of the merged, re-ordered table.
combine.head()

Unnamed: 0,date,inning,score,out,runner,batter_team,batter,pitcher_team,pitcher,ball_count,result,cause,X,Y,angle
0,20180324,2회말,2:0,1사,주자없음,넥센,김하성,한화,샘슨,7(2-3),안타,좌중간 1루타,199.36,159.5,83.54527
1,20180324,3회말,2:1,2사,주자없음,넥센,박병호,한화,샘슨,4(1-2),안타,좌익수 앞 1루타,150.37,181.25,62.993314
2,20180324,4회말,2:1,무사,주자없음,넥센,고종욱,한화,샘슨,1(0-0),안타,우익수 앞 1루타,300.18,174.5,121.509975
3,20180324,4회말,2:1,1사,2루,넥센,이정후,한화,샘슨,2(0-1),안타,좌중간 2루타,148.95,100.25,72.228205
4,20180324,4회말,2:2,1사,3루,넥센,박동원,한화,샘슨,6(2-3),안타,중견수 왼쪽 1루타,207.17,131.0,87.032824


# 필요없는 데이터 삭제

In [23]:
# Keep rows whose cause matches none of the listed patterns. Comparing with
# False (rather than negating) also drops rows where `cause` is missing,
# because a missing value does not compare equal to False.
excluded_causes = '번트|삼진|포수 파울플라이|낫 아웃'
cause_excluded = combine.cause.str.contains(excluded_causes)
combine = combine[cause_excluded.eq(False)]

In [24]:
# Preview the merged table again after the cause filter.
combine.head()

Unnamed: 0,date,inning,score,out,runner,batter_team,batter,pitcher_team,pitcher,ball_count,result,cause,X,Y,angle
0,20180324,2회말,2:0,1사,주자없음,넥센,김하성,한화,샘슨,7(2-3),안타,좌중간 1루타,199.36,159.5,83.54527
1,20180324,3회말,2:1,2사,주자없음,넥센,박병호,한화,샘슨,4(1-2),안타,좌익수 앞 1루타,150.37,181.25,62.993314
2,20180324,4회말,2:1,무사,주자없음,넥센,고종욱,한화,샘슨,1(0-0),안타,우익수 앞 1루타,300.18,174.5,121.509975
3,20180324,4회말,2:1,1사,2루,넥센,이정후,한화,샘슨,2(0-1),안타,좌중간 2루타,148.95,100.25,72.228205
4,20180324,4회말,2:2,1사,3루,넥센,박동원,한화,샘슨,6(2-3),안타,중견수 왼쪽 1루타,207.17,131.0,87.032824


# CSV로 저장

In [25]:
# Write the merged table to disk; index=False leaves the row index out of the file.
combine.to_csv('2018kbo.csv', index=False)