In [1]:
import numpy as np
import pandas as pd

In [11]:
# Small sample table with deliberately messy cells: column c1 holds a text
# marker ("누락") in its last row and column c2 holds an empty string.
data = dict(
    c1=[1, 2, "누락"],
    c2=[1.11, "", 3.33],
    c3=["one", "two", "three"],
)

df_csv = pd.DataFrame(data)
df_csv

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,,two
2,누락,3.33,three


In [37]:
# DataFrame.to_csv() writes the frame out as a CSV file.
# The first argument is the destination path.
# The row index is written by default (index=True); index=False leaves it out.

df_csv.to_csv(path_or_buf="./../datas/sample1.csv", index=False)

In [20]:
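# Relative paths are resolved from the current working directory
# (by default the folder that contains the notebook)
# ./  : the current directory
# ../ : the parent directory (one level up)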

In [40]:
%pwd    # 경로 확인

'C:\\python\\datas'

In [35]:
%cd C:/python/datas    # 경로 바꾸기

[WinError 2] 지정된 파일을 찾을 수 없습니다: 'C:/python/datas # 경로 바꾸기'
C:\python\datas


In [39]:
# pd.read_csv() loads a CSV file into a DataFrame.

df_read = pd.read_csv(filepath_or_buffer="sample1.csv")
df_read

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,,two
2,누락,3.33,three


In [41]:
# Save with neither the row index nor the column header line.

df_csv.to_csv(path_or_buf="sample2.csv", header=False, index=False)

In [44]:
# The file carries no header row, so the column labels are supplied
# through the names keyword argument of read_csv().

pd.read_csv(filepath_or_buffer="sample2.csv", names=["c1", "c2", "c3"])

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,,two
2,누락,3.33,three


In [48]:
%%writefile sample3.txt
c1 c2 c3 c4
0.179181 -1.538472 1.347553 0.43381
1.024209 0.087307 -1.281997 0.49265
0.417899 -2.002308 0.255245 -1.10515

Overwriting sample3.txt


In [49]:
# Fields separated by a variable amount of whitespace:
# pass the regular expression \s+ as the separator.
# It is written as a raw string so the backslash reaches the regex engine
# unchanged; in a plain string literal '\s' is an invalid escape sequence,
# which newer Python versions warn about.

pd.read_table('sample3.txt', sep=r'\s+')

Unnamed: 0,c1,c2,c3,c4
0,0.179181,-1.538472,1.347553,0.43381
1,1.024209,0.087307,-1.281997,0.49265
2,0.417899,-2.002308,0.255245,-1.10515


In [50]:
%%writefile sample4.txt
파일 제목: sample4.txt
데이터 포맷의 설명:
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing sample4.txt


In [54]:
# Use skiprows to skip leading lines that are not part of the table
# (here rows 0 and 1, the two description lines at the top of sample4.txt).
# The file separates fields with ", ", so skipinitialspace=True drops the
# blank after each comma; without it the labels and string values keep a
# leading space (' c2', ' one').

pd.read_csv('sample4.txt', skiprows=[0, 1], skipinitialspace=True)

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [55]:
# Treat specific marker values as missing: each value listed in na_values
# is read as NaN.

df_na_val = pd.read_csv(filepath_or_buffer='sample1.csv', na_values=['누락'])
df_na_val

Unnamed: 0,c1,c2,c3
0,1.0,1.11,one
1,2.0,,two
2,,3.33,three


In [59]:
# Change the field delimiter of the written file with the sep argument.

df_na_val.to_csv(path_or_buf='sample5.txt', sep='|')

In [57]:
# Display the frame that was read with '누락' treated as NaN.
df_na_val

Unnamed: 0,c1,c2,c3
0,1.0,1.11,one
1,2.0,,two
2,,3.33,three


In [60]:
# Choose the text written in place of NaN with the na_rep keyword argument.

df_na_val.to_csv(path_or_buf='sample6.csv', na_rep='누락')

In [61]:
# Read the Titanic training CSV directly from its URL.
titanic = pd.read_csv(
    filepath_or_buffer="https://storage.googleapis.com/tf-datasets/titanic/train.csv"
)

In [62]:
# Display the loaded frame (the rendered output is abbreviated with a "..." row).
titanic

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.2500,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.9250,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1000,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
...,...,...,...,...,...,...,...,...,...,...
622,0,male,28.0,0,0,10.5000,Second,unknown,Southampton,y
623,0,male,25.0,0,0,7.0500,Third,unknown,Southampton,y
624,1,female,19.0,0,0,30.0000,First,B,Southampton,y
625,0,female,28.0,1,2,23.4500,Third,unknown,Southampton,n


In [63]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
titanic.describe()

Unnamed: 0,survived,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0,627.0
mean,0.38756,29.631308,0.545455,0.379585,34.385399
std,0.487582,12.511818,1.15109,0.792999,54.59773
min,0.0,0.75,0.0,0.0,0.0
25%,0.0,23.0,0.0,0.0,7.8958
50%,0.0,28.0,0.0,0.0,15.0458
75%,1.0,35.0,1.0,0.0,31.3875
max,1.0,80.0,8.0,5.0,512.3292


In [64]:
# Show the first 20 rows.
titanic.head(n=20)

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
5,0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
6,1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
7,1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
8,1,female,4.0,1,1,16.7,Third,G,Southampton,n
9,0,male,20.0,0,0,8.05,Third,unknown,Southampton,y
