# 데이터 입출력
### - csv, excel 파일에서 데이터 읽고 쓰기

In [1]:
import pandas as pd
from pandas import Series, DataFrame

## 1. csv 파일 읽기 - read_csv()

In [3]:
# 1) Reading a basic CSV file
### Read data/ex1.csv (a CSV file whose first line holds the column names)
pd.read_csv('data/ex1.csv')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
# 2) Using the sep argument
### data/ex2.txt uses a different delimiter, so name it explicitly with sep
pd.read_csv(
    'data/ex2.txt',
    sep=' ',
)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [7]:
# 3) Using the encoding argument
### Read data/2020KBO야구.csv, decoding it as cp949
pd.read_csv(
    'data/2020KBO야구.csv',
    encoding='cp949',
)

Unnamed: 0,선수명,팀명,타율,안타,홈런,연봉
0,페르난데스,두산,0.340,199,21,40000.0
1,허경민,두산,0.332,145,7,48000.0
2,오재일,두산,0.312,147,16,47000.0
3,최주환,두산,0.306,156,16,27000.0
4,박건우,두산,0.304,148,14,45000.0
...,...,...,...,...,...,...
254,김재현,SK,0.143,2,0,5600.0
255,화이트,SK,0.136,3,1,
256,채현우,SK,0.130,3,0,3000.0
257,류효승,SK,0.125,1,1,2700.0


In [9]:
# 4) Using the header and names arguments (the file contains no column names)
# header=None treats the first line as data and numbers the columns 0, 1, 2, ...
pd.read_csv(
    'data/2020KBO야구_컬럼명미포함.csv',
    encoding='cp949',
    header=None,
)

Unnamed: 0,0,1,2,3,4,5
0,페르난데스,두산,0.340,199,21,40000.0
1,허경민,두산,0.332,145,7,48000.0
2,오재일,두산,0.312,147,16,47000.0
3,최주환,두산,0.306,156,16,27000.0
4,박건우,두산,0.304,148,14,45000.0
...,...,...,...,...,...,...
254,김재현,SK,0.143,2,0,5600.0
255,화이트,SK,0.136,3,1,
256,채현우,SK,0.130,3,0,3000.0
257,류효승,SK,0.125,1,1,2700.0


In [11]:
# names= supplies the column labels for a file that has no header line.
pd.read_csv(
    'data/2020KBO야구_컬럼명미포함.csv',
    encoding='cp949',
    names=['선수명', '팀명', '타율', '안타', '홈런', '연봉'],
)

Unnamed: 0,선수명,팀명,타율,안타,홈런,연봉
0,페르난데스,두산,0.340,199,21,40000.0
1,허경민,두산,0.332,145,7,48000.0
2,오재일,두산,0.312,147,16,47000.0
3,최주환,두산,0.306,156,16,27000.0
4,박건우,두산,0.304,148,14,45000.0
...,...,...,...,...,...,...
254,김재현,SK,0.143,2,0,5600.0
255,화이트,SK,0.136,3,1,
256,채현우,SK,0.130,3,0,3000.0
257,류효승,SK,0.125,1,1,2700.0


In [13]:
# 5) Reading data while leaving out unnecessary lines
# 5-1) Use skiprows so that specific lines (here the first three) are not read.
pd.read_csv(
    'data/2020KBO야구_주석포함.csv',
    encoding='cp949',
    skiprows=[0, 1, 2],
    names=['선수명', '팀명', '타율', '안타', '홈런', '연봉'],
)

Unnamed: 0,선수명,팀명,타율,안타,홈런,연봉
0,페르난데스,두산,0.340,199,21,40000.0
1,허경민,두산,0.332,145,7,48000.0
2,오재일,두산,0.312,147,16,47000.0
3,최주환,두산,0.306,156,16,27000.0
4,박건우,두산,0.304,148,14,45000.0
...,...,...,...,...,...,...
254,김재현,SK,0.143,2,0,5600.0
255,화이트,SK,0.136,3,1,
256,채현우,SK,0.130,3,0,3000.0
257,류효승,SK,0.125,1,1,2700.0


In [17]:
# 5-2) comment 인자를 이용하여, 주석은 데이터로 읽지 않음.

In [15]:
# comment='*' drops everything from a '*' to the end of the line.
# skip_blank_lines=False keeps blank lines as all-NaN rows; presumably that is
# the source of the empty first row and of the integer columns shown as floats.
pd.read_csv(
    'data/2020KBO야구_주석포함.csv',
    encoding='cp949',
    comment='*',
    names=['선수명', '팀명', '타율', '안타', '홈런', '연봉'],
    skip_blank_lines=False,
)

Unnamed: 0,선수명,팀명,타율,안타,홈런,연봉
0,,,,,,
1,페르난데스,두산,0.340,199.0,21.0,40000.0
2,허경민,두산,0.332,145.0,7.0,48000.0
3,오재일,두산,0.312,147.0,16.0,47000.0
4,최주환,두산,0.306,156.0,16.0,27000.0
...,...,...,...,...,...,...
255,김재현,SK,0.143,2.0,0.0,5600.0
256,화이트,SK,0.136,3.0,1.0,
257,채현우,SK,0.130,3.0,0.0,3000.0
258,류효승,SK,0.125,1.0,1.0,2700.0


In [19]:
# 6) 용량이 매우 큰 파일 읽기 

In [18]:
# nrows limits how many data rows are read from the top of the file.
pd.read_csv(
    'data/2020KBO야구.csv',
    encoding='cp949',
    nrows=10,
)

Unnamed: 0,선수명,팀명,타율,안타,홈런,연봉
0,페르난데스,두산,0.34,199,21,40000
1,허경민,두산,0.332,145,7,48000
2,오재일,두산,0.312,147,16,47000
3,최주환,두산,0.306,156,16,27000
4,박건우,두산,0.304,148,14,45000
5,정수빈,두산,0.298,146,5,34000
6,최용제,두산,0.295,13,0,2800
7,김재호,두산,0.289,116,2,65000
8,안권수,두산,0.27,10,0,2700
9,박세혁,두산,0.269,97,4,23200


In [21]:
# With chunksize, read_csv returns an iterator that yields the file 10 rows at
# a time instead of loading it into a single DataFrame.
커서 = pd.read_csv(
    'data/2020KBO야구.csv',
    encoding='cp949',
    chunksize=10,
)

In [27]:
# Each call returns the next 10-row chunk and advances the reader.
next(커서)

Unnamed: 0,선수명,팀명,타율,안타,홈런,연봉
50,최영진,삼성,0.297,30,1,8000
51,박해민,삼성,0.29,142,11,30000
52,강민호,삼성,0.287,102,19,125000
53,김성윤,삼성,0.286,2,0,2900
54,이성곤,삼성,0.281,39,5,3500
55,살라디노,삼성,0.28,37,6,70000
56,송준석,삼성,0.277,13,1,4000
57,이원석,삼성,0.268,108,13,30000
58,박승규,삼성,0.258,47,1,3200
59,김헌곤,삼성,0.248,63,3,19000


In [28]:
# Collect every chunk still left in the reader into a list. Chunks already
# consumed by earlier next() calls are not included.
데이터목록 = list(커서)

In [29]:
# Show the third collected chunk (index 2).
데이터목록[2]

Unnamed: 0,선수명,팀명,타율,안타,홈런,연봉
80,변상권,키움,0.274,17,1,3700
81,임병욱,키움,0.27,10,0,8800
82,허정협,키움,0.268,84,10,3500
83,러셀,키움,0.254,62,2,54000
84,박동원,키움,0.25,81,12,22500
85,박주홍,키움,0.25,6,0,2700
86,박준태,키움,0.245,85,5,4500
87,전병우,키움,0.237,85,8,2900
88,김주형,키움,0.233,10,1,2700
89,박병호,키움,0.223,69,21,200000


In [11]:
# 7) Saving results to a CSV file - to_csv()


In [30]:
# Load the full data set, then total the hits (안타) for each team (팀명).
data = pd.read_csv('data/2020KBO야구.csv', encoding='cp949')
result = data.pivot_table(
    index='팀명',
    values='안타',
    aggfunc='sum',
)

In [32]:
# Save the per-team totals as CSV, written with the cp949 encoding.
result.to_csv(
    'data/팀별안타수.csv',
    encoding='cp949',
)

## 2. 엑셀 파일 읽기 (read_excel())

In [33]:
# Without sheet_name, read_excel loads only the first sheet of the workbook
# (sheet_name defaults to 0).
pd.read_excel('data/2020KBO야구.xlsx')

Unnamed: 0,선수명,팀명,타율,안타,홈런,연봉
0,페르난데스,두산,0.34,199,21,40000
1,허경민,두산,0.332,145,7,48000
2,오재일,두산,0.312,147,16,47000
3,최주환,두산,0.306,156,16,27000
4,박건우,두산,0.304,148,14,45000
5,정수빈,두산,0.298,146,5,34000
6,최용제,두산,0.295,13,0,2800
7,김재호,두산,0.289,116,2,65000
8,안권수,두산,0.27,10,0,2700
9,박세혁,두산,0.269,97,4,23200


In [37]:
# A list of sheet names loads just those sheets, returned as a dict keyed by
# sheet name.
data = pd.read_excel(
    'data/2020KBO야구.xlsx',
    sheet_name=['한화', 'LG'],
)

In [39]:
# Check the return type when several sheets are requested at once.
type(data)

dict

In [40]:
# The dict keys are the requested sheet names.
data.keys()

dict_keys(['한화', 'LG'])

In [42]:
# Look up one sheet's DataFrame by its sheet name.
data['LG']

Unnamed: 0,선수명,팀명,타율,안타,홈런,연봉
0,손호영,LG,0.367,11,0,2700
1,김현수,LG,0.331,181,22,130000
2,신민재,LG,0.308,8,0,5000
3,오지환,LG,0.3,158,10,60000
4,박용택,LG,0.3,65,2,80000
5,이형종,LG,0.296,85,17,20000
6,채은성,LG,0.293,122,15,32000
7,홍창기,LG,0.279,114,5,3800
8,라모스,LG,0.278,120,38,30000
9,김용의,LG,0.271,19,1,10500


In [44]:
# sheet_name=None loads every sheet into a dict of DataFrames keyed by sheet
# name.
pd.read_excel(
    'data/2020KBO야구.xlsx',
    sheet_name=None,
)

{'두산':       선수명  팀명     타율   안타  홈런     연봉
 0   페르난데스  두산  0.340  199  21  40000
 1     허경민  두산  0.332  145   7  48000
 2     오재일  두산  0.312  147  16  47000
 3     최주환  두산  0.306  156  16  27000
 4     박건우  두산  0.304  148  14  45000
 5     정수빈  두산  0.298  146   5  34000
 6     최용제  두산  0.295   13   0   2800
 7     김재호  두산  0.289  116   2  65000
 8     안권수  두산  0.270   10   0   2700
 9     박세혁  두산  0.269   97   4  23200
 10    김재환  두산  0.266  137  30  65000
 11    조수행  두산  0.263   10   0   4500
 12    권민석  두산  0.260   13   0   2700
 13    이유찬  두산  0.258   23   0   3200
 14    신성현  두산  0.250    1   0   4800
 15    장승현  두산  0.250    5   0   3500
 16    서예일  두산  0.240    6   0   3200
 17    국해성  두산  0.233   20   3   4500
 18    오재원  두산  0.232   36   5  30000
 19    양찬열  두산  0.227    5   0   2700
 20    김인태  두산  0.202   17   1   5000
 21    백동훈  두산  0.188    3   0   3600
 22    정상호  두산  0.163   14   0   7000,
 'SK':     선수명  팀명     타율   안타  홈런        연봉
 0   김경호  SK  0.286   30   0    2900

In [45]:
# Keep the dict of all sheets (sheet name -> DataFrame) for later use.
야구데이터 = pd.read_excel(
    'data/2020KBO야구.xlsx',
    sheet_name=None,
)

In [46]:
# Unpack the per-sheet DataFrames positionally. This relies on the workbook
# holding exactly ten sheets in this order (presumably one per team - confirm
# against the file); a different sheet count raises ValueError.
(
    두산, SK, NC, LG, KT,
    KIA, 한화, 키움, 롯데, 삼성,
) = 야구데이터.values()

In [47]:
# Display the DataFrame bound to 키움 by the positional unpacking.
키움

Unnamed: 0,선수명,팀명,타율,안타,홈런,연봉
0,임지열,키움,1.0,1,0,2700
1,김은성,키움,0.5,2,0,2900
2,이정후,키움,0.333,181,15,39000
3,이지영,키움,0.309,81,0,30000
4,김하성,키움,0.306,163,30,55000
5,김혜성,키움,0.285,142,7,10000
6,서건창,키움,0.277,134,5,35000
7,김웅빈,키움,0.275,57,8,3700
8,변상권,키움,0.274,17,1,3700
9,임병욱,키움,0.27,10,0,8800


In [48]:
# Write the 두산 DataFrame to an Excel file; the row index is written as well,
# since to_excel's index argument defaults to True.
두산.to_excel('data/두산데이터.xlsx')