In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta

# Season window to scrape, inclusive on both ends.
start_date = datetime(2021, 4, 1)  # date of the first 2021-season game
end_date = datetime(2024, 7, 5)  # July 5, 2024

# Output schema: "Date" followed by one label per <td> cell of a scraped row.
# NOTE(review): "AVG" is listed twice (4th and 26th table column). The duplicate
# label is kept so the resulting frame has the same columns as before; consider
# renaming one of them once downstream consumers are checked.
columns = ["Date", "Rank", "Team", "Year", "AVG", "G", "PA", "AB", "R", "H", "2B", "3B", "HR", "TB", "RBI", "SB", "CS", "GO", "FO", "BB", "HP", "IB", "SO", "GDP", "SH", "SF", "AVG", "OBP", "SLG", "OPS"]

# One frame per day is collected here and concatenated once after the loop;
# calling pd.concat inside the loop re-copies every previously gathered row on
# each iteration.
daily_frames = []

# Walk the window one calendar day at a time.
current_date = start_date
while current_date <= end_date:
    day_label = current_date.strftime('%Y-%m-%d')  # used for log lines and the Date column
    date_str = current_date.strftime("%m-%d")  # 'MM-DD', as used by the ds/de query parameters
    year_str = current_date.year  # year/sy/ey query parameters follow the current date
    # Advance up front so every early `continue` below still moves to the next day.
    current_date += timedelta(days=1)

    # Build the URL (the year changes dynamically with the date).
    url = f"https://statiz.sporki.com/stats/?m=team&m2=batting&m3=situation1&so=&ob=&year={year_str}&sy={year_str}&ey={year_str}&te=&po=&lt=10100&reg=&pe=I&ds={date_str}&de={date_str}&we=&hr=&ha=&ct=&st=&vp=&bo=&pt=&pp=&ii=&vc=&um=&oo=&rr=&sc=&bc=&ba=&li=&as=&ae=&pl=&gc=&lr=&pr=50&ph=&hs=&us=&na=&ls=&sf1=&sk1=&sv1=&sf2=&sk2=&sv2="

    # A timeout keeps one stalled request from hanging the whole crawl; a failed
    # request is reported and that day is skipped instead of aborting the run.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"{day_label} 요청 실패: {exc}")
        continue
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # The stats are read from the second <table> on the page. Fewer than two
    # tables, or a second table without <tbody>, is treated as "no data".
    tables = soup.find_all('table')
    tbody = tables[1].find('tbody') if len(tables) >= 2 else None
    if tbody is None:
        print(f"{day_label} 데이터 없음")
        continue

    # One list of stripped cell texts per table row.
    data = [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in tbody.find_all('tr')
    ]

    # Label the cells with every column except "Date", then prepend the date.
    df = pd.DataFrame(data, columns=columns[1:])
    df.insert(0, "Date", day_label)

    daily_frames.append(df)
    print(f"{day_label} 데이터 수집 완료")

# Single concatenation; with no scraped days, fall back to an empty frame that
# still carries the expected columns.
if daily_frames:
    team_batting_data = pd.concat(daily_frames, ignore_index=True)
else:
    team_batting_data = pd.DataFrame(columns=columns)

print("모든 데이터 수집 완료")
# Display the final DataFrame.
team_batting_data


2021-04-01 데이터 수집 완료
2021-04-02 데이터 수집 완료
2021-04-03 데이터 수집 완료
2021-04-04 데이터 수집 완료
2021-04-05 데이터 수집 완료
2021-04-06 데이터 수집 완료
2021-04-07 데이터 수집 완료
2021-04-08 데이터 수집 완료
2021-04-09 데이터 수집 완료
2021-04-10 데이터 수집 완료
2021-04-11 데이터 수집 완료
2021-04-12 데이터 수집 완료
2021-04-13 데이터 수집 완료
2021-04-14 데이터 수집 완료
2021-04-15 데이터 수집 완료
2021-04-16 데이터 수집 완료
2021-04-17 데이터 수집 완료
2021-04-18 데이터 수집 완료
2021-04-19 데이터 수집 완료
2021-04-20 데이터 수집 완료
2021-04-21 데이터 수집 완료
2021-04-22 데이터 수집 완료
2021-04-23 데이터 수집 완료
2021-04-24 데이터 수집 완료
2021-04-25 데이터 수집 완료
2021-04-26 데이터 수집 완료
2021-04-27 데이터 수집 완료
2021-04-28 데이터 수집 완료
2021-04-29 데이터 수집 완료
2021-04-30 데이터 수집 완료
2021-05-01 데이터 수집 완료
2021-05-02 데이터 수집 완료
2021-05-03 데이터 수집 완료
2021-05-04 데이터 수집 완료
2021-05-05 데이터 수집 완료
2021-05-06 데이터 수집 완료
2021-05-07 데이터 수집 완료
2021-05-08 데이터 수집 완료
2021-05-09 데이터 수집 완료
2021-05-10 데이터 수집 완료
2021-05-11 데이터 수집 완료
2021-05-12 데이터 수집 완료
2021-05-13 데이터 수집 완료
2021-05-14 데이터 수집 완료
2021-05-15 데이터 수집 완료
2021-05-16 데이터 수집 완료
2021-05-17 데이터 수집 완료
2021-05-18 데이

Unnamed: 0,Date,Rank,Team,Year,AVG,G,PA,AB,R,H,...,HP,IB,SO,GDP,SH,SF,AVG.1,OBP,SLG,OPS
0,2021-04-03,1,키움,21,0.250,13,36,32,6,8,...,0,1,8,0,0,0,0.250,0.333,0.375,0.708
1,2021-04-03,2,삼성,21,0.194,9,32,31,1,6,...,0,0,3,1,0,0,0.194,0.219,0.194,0.413
2,2021-04-04,1,키움,21,0.344,13,40,32,7,11,...,2,0,6,1,0,0,0.344,0.475,0.563,1.038
3,2021-04-04,2,롯데,21,0.324,13,38,37,3,12,...,0,0,5,1,0,0,0.324,0.342,0.514,0.856
4,2021-04-04,3,SSG,21,0.290,10,34,31,5,9,...,0,0,8,2,0,0,0.290,0.353,0.710,1.063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5043,2024-07-04,6,SSG,24,0.175,15,44,40,0,7,...,0,0,11,0,2,0,0.175,0.214,0.225,0.439
5044,2024-07-04,7,삼성,24,0.161,13,33,31,3,5,...,0,0,8,0,0,0,0.161,0.212,0.355,0.567
5045,2024-07-04,8,키움,24,0.129,12,36,31,2,4,...,1,0,7,0,0,0,0.129,0.250,0.161,0.411
5046,2024-07-04,9,롯데,24,0.121,14,42,33,3,4,...,2,0,13,0,0,0,0.121,0.310,0.182,0.492


In [2]:
# Display the collected team batting DataFrame as the cell output.
team_batting_data

Unnamed: 0,Date,Rank,Team,Year,AVG,G,PA,AB,R,H,...,HP,IB,SO,GDP,SH,SF,AVG.1,OBP,SLG,OPS
0,2021-04-03,1,키움,21,0.250,13,36,32,6,8,...,0,1,8,0,0,0,0.250,0.333,0.375,0.708
1,2021-04-03,2,삼성,21,0.194,9,32,31,1,6,...,0,0,3,1,0,0,0.194,0.219,0.194,0.413
2,2021-04-04,1,키움,21,0.344,13,40,32,7,11,...,2,0,6,1,0,0,0.344,0.475,0.563,1.038
3,2021-04-04,2,롯데,21,0.324,13,38,37,3,12,...,0,0,5,1,0,0,0.324,0.342,0.514,0.856
4,2021-04-04,3,SSG,21,0.290,10,34,31,5,9,...,0,0,8,2,0,0,0.290,0.353,0.710,1.063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5043,2024-07-04,6,SSG,24,0.175,15,44,40,0,7,...,0,0,11,0,2,0,0.175,0.214,0.225,0.439
5044,2024-07-04,7,삼성,24,0.161,13,33,31,3,5,...,0,0,8,0,0,0,0.161,0.212,0.355,0.567
5045,2024-07-04,8,키움,24,0.129,12,36,31,2,4,...,1,0,7,0,0,0,0.129,0.250,0.161,0.411
5046,2024-07-04,9,롯데,24,0.121,14,42,33,3,4,...,2,0,13,0,0,0,0.121,0.310,0.182,0.492


In [3]:
# Remove the "Rank" and "Year" columns. errors="ignore" makes the cell safe to
# re-run: once the columns are gone, a second execution would otherwise raise
# KeyError because the result is bound back to the same name.
team_batting_data = team_batting_data.drop(columns=["Rank", "Year"], errors="ignore")
team_batting_data

Unnamed: 0,Date,Team,AVG,G,PA,AB,R,H,2B,3B,...,HP,IB,SO,GDP,SH,SF,AVG.1,OBP,SLG,OPS
0,2021-04-03,키움,0.250,13,36,32,6,8,4,0,...,0,1,8,0,0,0,0.250,0.333,0.375,0.708
1,2021-04-03,삼성,0.194,9,32,31,1,6,0,0,...,0,0,3,1,0,0,0.194,0.219,0.194,0.413
2,2021-04-04,키움,0.344,13,40,32,7,11,1,0,...,2,0,6,1,0,0,0.344,0.475,0.563,1.038
3,2021-04-04,롯데,0.324,13,38,37,3,12,1,0,...,0,0,5,1,0,0,0.324,0.342,0.514,0.856
4,2021-04-04,SSG,0.290,10,34,31,5,9,1,0,...,0,0,8,2,0,0,0.290,0.353,0.710,1.063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5043,2024-07-04,SSG,0.175,15,44,40,0,7,2,0,...,0,0,11,0,2,0,0.175,0.214,0.225,0.439
5044,2024-07-04,삼성,0.161,13,33,31,3,5,0,0,...,0,0,8,0,0,0,0.161,0.212,0.355,0.567
5045,2024-07-04,키움,0.129,12,36,31,2,4,1,0,...,1,0,7,0,0,0,0.129,0.250,0.161,0.411
5046,2024-07-04,롯데,0.121,14,42,33,3,4,2,0,...,2,0,13,0,0,0,0.121,0.310,0.182,0.492


In [6]:
# Write the frame to 'team_batting_final.csv' without the index column.
# The 'utf-8-sig' codec writes UTF-8 with a leading byte-order mark.
team_batting_data.to_csv('team_batting_final.csv', index=False, encoding='utf-8-sig')

In [None]:
# 21시즌 완성본
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta

# Season window to scrape, inclusive on both ends.
start_date = datetime(2021, 4, 3)  # season start
end_date = datetime(2021, 10, 31)  # season end

# Output schema: "Date" followed by one label per <td> cell of a scraped row.
# NOTE(review): "AVG" is listed twice (4th and 26th table column). The duplicate
# label is kept so the saved CSV has the same columns as before.
columns = ["Date", "Rank", "Team", "Year", "AVG", "G", "PA", "AB", "R", "H", "2B", "3B", "HR", "TB", "RBI", "SB", "CS", "GO", "FO", "BB", "HP", "IB", "SO", "GDP", "SH", "SF", "AVG", "OBP", "SLG", "OPS"]

# One frame per day is collected here and concatenated once after the loop;
# calling pd.concat inside the loop re-copies every previously gathered row on
# each iteration.
daily_frames = []

# Walk the window one calendar day at a time.
current_date = start_date
while current_date <= end_date:
    day_label = current_date.strftime('%Y-%m-%d')  # used for log lines and the Date column
    date_str = current_date.strftime("%m-%d")  # 'MM-DD', as used by the ds/de query parameters
    year_str = current_date.year  # keeps year/sy/ey in step with the window (2021 here)
    # Advance up front so every early `continue` below still moves to the next day.
    current_date += timedelta(days=1)

    url = f"https://statiz.sporki.com/stats/?m=team&m2=batting&m3=situation1&so=&ob=&year={year_str}&sy={year_str}&ey={year_str}&te=&po=&lt=10100&reg=&pe=I&ds={date_str}&de={date_str}&we=&hr=&ha=&ct=&st=&vp=&bo=&pt=&pp=&ii=&vc=&um=&oo=&rr=&sc=&bc=&ba=&li=&as=&ae=&pl=&gc=&lr=&pr=50&ph=&hs=&us=&na=&ls=&sf1=&sk1=&sv1=&sf2=&sk2=&sv2="

    # A timeout keeps one stalled request from hanging the whole crawl; a failed
    # request is reported and that day is skipped instead of aborting the run.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"{day_label} 요청 실패: {exc}")
        continue
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # The stats are read from the second <table> on the page. Fewer than two
    # tables, or a second table without <tbody>, is treated as "no data".
    tables = soup.find_all('table')
    tbody = tables[1].find('tbody') if len(tables) >= 2 else None
    if tbody is None:
        print(f"{day_label} 데이터 없음")
        continue

    # One list of stripped cell texts per table row.
    data = [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in tbody.find_all('tr')
    ]

    # Label the cells with every column except "Date", then prepend the date.
    df = pd.DataFrame(data, columns=columns[1:])
    df.insert(0, "Date", day_label)
    daily_frames.append(df)

# Single concatenation; with no scraped days, fall back to an empty frame that
# still carries the expected columns.
if daily_frames:
    all_data = pd.concat(daily_frames, ignore_index=True)
else:
    all_data = pd.DataFrame(columns=columns)

print("모든 데이터 수집 완료")

# Save first, then leave the frame as the cell's last expression so the notebook
# actually renders it (a bare name in the middle of a cell displays nothing).
all_data.to_csv('statiz_team_batting_records.csv', index=False)
all_data

모든 데이터 수집 완료


Unnamed: 0,Date,Rank,Team,Year,AVG,G,PA,AB,R,H,...,HP,IB,SO,GDP,SH,SF,AVG.1,OBP,SLG,OPS
0,2021-04-03,1,키움,21,0.250,13,36,32,6,8,...,0,1,8,0,0,0,0.250,0.333,0.375,0.708
1,2021-04-03,2,삼성,21,0.194,9,32,31,1,6,...,0,0,3,1,0,0,0.194,0.219,0.194,0.413
2,2021-04-04,1,키움,21,0.344,13,40,32,7,11,...,2,0,6,1,0,0,0.344,0.475,0.563,1.038
3,2021-04-04,2,롯데,21,0.324,13,38,37,3,12,...,0,0,5,1,0,0,0.324,0.342,0.514,0.856
4,2021-04-04,3,SSG,21,0.290,10,34,31,5,9,...,0,0,8,2,0,0,0.290,0.353,0.710,1.063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371,2021-10-30,6,NC,21,0.278,12,40,36,5,10,...,0,0,7,2,0,0,0.278,0.350,0.472,0.822
1372,2021-10-30,7,두산,21,0.242,15,39,33,5,8,...,0,1,5,0,1,1,0.242,0.316,0.364,0.680
1373,2021-10-30,8,한화,21,0.206,13,35,34,3,7,...,0,0,8,0,0,0,0.206,0.229,0.265,0.494
1374,2021-10-30,9,KIA,21,0.161,10,31,31,1,5,...,0,0,7,1,0,0,0.161,0.161,0.258,0.419


In [None]:
# Display the all_data DataFrame as the cell output.
all_data

Unnamed: 0,Date,Rank,Team,Year,AVG,G,PA,AB,R,H,...,HP,IB,SO,GDP,SH,SF,AVG.1,OBP,SLG,OPS
0,2021-04-03,1,키움,21,0.250,13,36,32,6,8,...,0,1,8,0,0,0,0.250,0.333,0.375,0.708
1,2021-04-03,2,삼성,21,0.194,9,32,31,1,6,...,0,0,3,1,0,0,0.194,0.219,0.194,0.413
2,2021-04-04,1,키움,21,0.344,13,40,32,7,11,...,2,0,6,1,0,0,0.344,0.475,0.563,1.038
3,2021-04-04,2,롯데,21,0.324,13,38,37,3,12,...,0,0,5,1,0,0,0.324,0.342,0.514,0.856
4,2021-04-04,3,SSG,21,0.290,10,34,31,5,9,...,0,0,8,2,0,0,0.290,0.353,0.710,1.063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371,2021-10-30,6,NC,21,0.278,12,40,36,5,10,...,0,0,7,2,0,0,0.278,0.350,0.472,0.822
1372,2021-10-30,7,두산,21,0.242,15,39,33,5,8,...,0,1,5,0,1,1,0.242,0.316,0.364,0.680
1373,2021-10-30,8,한화,21,0.206,13,35,34,3,7,...,0,0,8,0,0,0,0.206,0.229,0.265,0.494
1374,2021-10-30,9,KIA,21,0.161,10,31,31,1,5,...,0,0,7,1,0,0,0.161,0.161,0.258,0.419


In [None]:
#이게 진짜임
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Fetch the team batting page for a single day (the ds/de parameters in the URL).
url = "https://statiz.sporki.com/stats/?m=team&m2=batting&m3=situation1&so=&ob=&year=2021&sy=2021&ey=2021&te=&po=&lt=10100&reg=&pe=I&ds=04-03&de=04-03&we=&hr=&ha=&ct=&st=&vp=&bo=&pt=&pp=&ii=&vc=&um=&oo=&rr=&sc=&bc=&ba=&li=&as=&ae=&pl=&gc=&lr=&pr=50&ph=&hs=&us=&na=&ls=&sf1=&sk1=&sv1=&sf2=&sk2=&sv2="
# The timeout prevents a stalled connection from blocking the cell indefinitely.
response = requests.get(url, timeout=30)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')

# Rebuild the full date from the URL itself: month-day from `ds`, year from
# `year`. Reading the year from the URL (instead of hard-coding it) keeps the
# Date label correct when the URL is edited to another season.
date_match = re.search(r'ds=(\d{2}-\d{2})&de=', url)
year_match = re.search(r'[?&]year=(\d{4})', url)
if date_match and year_match:
    date = f"{year_match.group(1)}-{date_match.group(1)}"
else:
    date = "Unknown"  # the date could not be extracted from the URL

# Collect every <table> on the page and report how many were found.
tables = soup.find_all('table')
print(f"총 {len(tables)}개의 테이블이 발견되었습니다.")

# Take the second table (index 1); adjust the index if the page layout changes.
table = tables[1]

# One list of stripped cell texts per row of the table body.
rows = table.find('tbody').find_all('tr')
data = [[cell.text.strip() for cell in row.find_all('td')] for row in rows]

# Convert to a pandas DataFrame.
# NOTE(review): "AVG" is listed twice; the duplicate label is kept unchanged.
columns = ["Rank", "Team", "Year", "AVG", "G", "PA", "AB", "R", "H", "2B", "3B", "HR", "TB", "RBI", "SB", "CS", "GO", "FO", "BB", "HP", "IB", "SO", "GDP", "SH", "SF", "AVG", "OBP", "SLG", "OPS"]
df = pd.DataFrame(data, columns=columns)

# Add the date as the first column of the frame.
df.insert(0, "Date", date)

df


총 2개의 테이블이 발견되었습니다.


Unnamed: 0,Date,Rank,Team,Year,AVG,G,PA,AB,R,H,...,HP,IB,SO,GDP,SH,SF,AVG.1,OBP,SLG,OPS
0,2021-04-03,1,키움,21,0.25,13,36,32,6,8,...,0,1,8,0,0,0,0.25,0.333,0.375,0.708
1,2021-04-03,2,삼성,21,0.194,9,32,31,1,6,...,0,0,3,1,0,0,0.194,0.219,0.194,0.413


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Labels for the cells of one table row, in page order ("AVG" occurs twice).
columns = (
    "Rank Team Year AVG G PA AB R H 2B 3B HR TB RBI SB CS "
    "GO FO BB HP IB SO GDP SH SF AVG OBP SLG OPS"
).split()

# Download the single-day team batting page and parse it as UTF-8 HTML.
url = "https://statiz.sporki.com/stats/?m=team&m2=batting&m3=situation1&so=&ob=&year=2021&sy=2021&ey=2021&te=&po=&lt=10100&reg=&pe=I&ds=04-03&de=04-03&we=&hr=&ha=&ct=&st=&vp=&bo=&pt=&pp=&ii=&vc=&um=&oo=&rr=&sc=&bc=&ba=&li=&as=&ae=&pl=&gc=&lr=&pr=50&ph=&hs=&us=&na=&ls=&sf1=&sk1=&sv1=&sf2=&sk2=&sv2="
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')

# Report how many <table> elements the page contains.
tables = soup.find_all('table')
print(f"총 {len(tables)}개의 테이블이 발견되었습니다.")

# The stats live in the second table (index 1); change the index if needed.
table = tables[1]
rows = table.find('tbody').find_all('tr')

# Turn each row into a list of its stripped cell texts.
data = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in rows
]

# Build the frame and show it as the cell output.
df = pd.DataFrame(data, columns=columns)
df

총 2개의 테이블이 발견되었습니다.


Unnamed: 0,Rank,Team,Year,AVG,G,PA,AB,R,H,2B,...,HP,IB,SO,GDP,SH,SF,AVG.1,OBP,SLG,OPS
0,1,키움,21,0.25,13,36,32,6,8,4,...,0,1,8,0,0,0,0.25,0.333,0.375,0.708
1,2,삼성,21,0.194,9,32,31,1,6,0,...,0,0,3,1,0,0,0.194,0.219,0.194,0.413




In [None]:
# Read 'statiz_team_batting_records.csv' into df using pandas' default parsing options.
df = pd.read_csv('statiz_team_batting_records.csv')

In [None]:
# Display df as the cell output.
df

Unnamed: 0,2B,3B,AB,BB,CS,Date,FO,G,GDP,GO,...,SB,SF,SH,SO,Season,TB,Team,Year,비율,정렬▼
0,4,0,63,5,0,2021-04-03,10,22,1,27,...,0,0,0,11,리그,18,Unknown,21,0.222,0.222
1,9,1,326,37,3,2021-04-04,81,119,10,86,...,11,2,3,72,리그,127,Unknown,21,0.255,0.255
2,11,1,336,25,4,2021-04-06,76,121,5,95,...,7,4,2,88,리그,113,Unknown,21,0.217,0.217
3,20,0,365,60,2,2021-04-07,84,138,7,111,...,7,1,4,65,리그,127,Unknown,21,0.268,0.268
4,9,2,323,54,3,2021-04-08,89,126,10,95,...,7,3,4,65,리그,111,Unknown,21,0.229,0.229
5,21,2,352,39,1,2021-04-09,75,128,8,98,...,5,5,2,76,리그,140,Unknown,21,0.293,0.293
6,26,0,352,42,4,2021-04-10,74,125,5,92,...,8,2,1,77,리그,163,Unknown,21,0.287,0.287
7,14,2,334,42,4,2021-04-11,88,118,4,85,...,6,2,6,83,리그,110,Unknown,21,0.231,0.231
8,8,1,335,46,1,2021-04-13,85,117,9,91,...,6,4,2,73,리그,108,Unknown,21,0.248,0.248
9,15,0,333,34,3,2021-04-14,87,123,9,101,...,10,3,4,73,리그,94,Unknown,21,0.201,0.201
