# CSV 파일 읽고 쓰기

CSV는 Comma Separated Values의 약자로, 말 그대로 콤마(,)로 구분된 데이터를 말합니다. [위키피디아](https://en.wikipedia.org/wiki/Comma-separated_values)에서 예제를 확인할 수 있습니다. 엄청 심플한 버전의 엑셀 포맷이라고 보셔도 무방합니다! 실제로 *.csv 파일은 엑셀(윈도우)이나 넘버스(맥)로 열 수 있습니다.

In [1]:
# Let's write some data out as a CSV file.
import csv

# Reference: https://docs.python.org/2/library/csv.html#csv.reader
# Experiment with the delimiter, quotechar and quoting options to see what
# each one does.
with open('sample.csv', 'w') as csvfile:
    # The csv defaults are delimiter=',' and quotechar='"'; the quote
    # character is overridden with '|' here.
    writer = csv.writer(
        csvfile,
        delimiter=',',
        quotechar='|',
        quoting=csv.QUOTE_MINIMAL,
    )

    writer.writerow(['검사외전', '범죄', '126분'])
    # The next title contains a comma, which would be mistaken for the
    # delimiter, so the writer wraps that field in the quotechar ('|').
    # Open sample.csv afterwards to check.
    writer.writerow(['쿵푸팬더3 (Kung Fu Panda 3, 2016)', '애니메이션', '95분'])

In [2]:
# Read the CSV file back in: one list of fields per row.
with open('sample.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    # Consume the reader while the file is still open.
    data = list(reader)

# A single parenthesised argument prints the same as the Python 2 print
# statement.
print(data)

[['\xea\xb2\x80\xec\x82\xac\xec\x99\xb8\xec\xa0\x84', '\xeb\xb2\x94\xec\xa3\x84', '126\xeb\xb6\x84'], ['\xec\xbf\xb5\xed\x91\xb8\xed\x8c\xac\xeb\x8d\x943 (Kung Fu Panda 3, 2016)', '\xec\x95\xa0\xeb\x8b\x88\xeb\xa9\x94\xec\x9d\xb4\xec\x85\x98', '95\xeb\xb6\x84']]


## 해보기

이번에는 delimiter를 세미콜론(';')으로 하고, quotechar는 '/'로 해봅시다. 여러 줄을 한 번에 쓸 때는 `writer.writerows`를 사용하면 됩니다.

In [40]:
# Sample rows whose fields contain ';' (and, in one title, ','), for trying
# out different delimiter / quotechar settings.
data = [
    ['검;사;외;전', '범죄', '126분'],
    ['쿵;푸;팬;더;3 (Kung Fu Panda 3, 2016)', '애;니;메;이;션', '95분'],
]

def write_csv(data, filename, delimiter=',', quotechar='"'):
    """Write the rows in `data` to `filename` as CSV and return `filename`.

    `data` is an iterable of rows, each row an iterable of fields.
    `delimiter` and `quotechar` are passed straight to `csv.writer`.
    The file is opened in 'w' mode, so an existing file is overwritten.
    """
    # NOTE(review): the Python 2 csv docs recommend binary mode ('wb') on
    # platforms where it makes a difference (e.g. Windows) -- confirm
    # before changing.
    with open(filename, 'w') as out:
        csv.writer(out, delimiter=delimiter, quotechar=quotechar).writerows(data)

    return filename

In [41]:
# Write the sample rows using write_csv's default delimiter (',') and
# quotechar ('"').
write_csv(data,'movies_sample.csv')

'movies_sample.csv'

## 해보기

In [15]:
def read_csv(filename, delimiter=',', quotechar='"'):
    """Parse the CSV file `filename` and return its rows as a list of lists.

    `delimiter` and `quotechar` are passed straight to `csv.reader`.
    """
    with open(filename, 'r') as src:
        parsed = csv.reader(src, delimiter=delimiter, quotechar=quotechar)
        # Materialise the rows before the file is closed: the reader pulls
        # lines from the open file as it is iterated.
        return list(parsed)

In [16]:
# Read movies_sample.csv back with read_csv's default delimiter and quotechar.
data = read_csv('movies_sample.csv')

In [17]:
# Print every field of every row, one per line.
for row in data:
    for item in row:
        # A single parenthesised argument prints the same as the Python 2
        # print statement.
        print(item)

검;사;외;전
범죄
126분
쿵;푸;팬;더;3 (Kung Fu Panda 3, 2016)
애;니;메;이;션
95분


# 지난주에 배운 movie_total을 csv로 저장해봅시다.

In [42]:
#-*- coding: utf-8 -*-

import csv
from bs4 import BeautifulSoup as bs
import requests

def movie_crawler(url):
    """Scrape movie records from the page at `url`.

    Fetches `url` and extracts one row per `div` element marked up with
    itemtype="http://schema.org/Movie". Each row is
    [title, score, genre, running_time, actor]; multiple genres and actors
    are joined with '/', and a field that is not found becomes ''.

    Prints the number of movie elements found. Returns the list of rows,
    which is empty when the response status is not 200.
    """
    res = requests.get(url)

    if res.status_code != 200:
        return []

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so the parse tree (and the rows
    # extracted from it) can differ between machines.
    soup = bs(res.text, 'html.parser')

    movies = soup.findAll('div', {'itemtype': 'http://schema.org/Movie'})

    # A single parenthesised argument prints the same as the Python 2
    # print statement.
    print(len(movies))

    table = []
    for movie in movies:
        title = movie.findAll('h4', {'itemprop': 'name'})
        score = movie.select('div.rating_txt > div > strong')
        genres = movie.findAll('span', {'itemprop': 'genre'})
        running_time = movie.findAll('time', {'itemprop': 'duration'})
        actors = movie.findAll('span', {'itemprop': 'actors'})

        # Single-valued fields: take the first match, or '' when absent.
        title = title[0].text.strip() if title else ""
        score = score[0].text.strip() if score else ""
        running_time = running_time[0].text.strip() if running_time else ""
        # Multi-valued fields: join every match with '/'.
        genre = "/".join(tag.text.strip() for tag in genres)
        actor = "/".join(tag.text.strip() for tag in actors)

        table.append([title, score, genre, running_time, actor])

    return table


# Crawl the coming-soon page for January 2015.
data = movie_crawler('http://www.imdb.com/movies-coming-soon/2015-01')

32


'movie_info.csv'

In [43]:
# Now crawl the movie info for every month from 2015-01 through 2015-12.
target_url = 'http://www.imdb.com/movies-coming-soon/{0}'
movie_total = []
for i in range(1, 13):
    # str.zfill(2) left-pads with zeros, so 1 becomes '01'.
    date = "2015-" + str(i).zfill(2)
    url = target_url.format(date)
    print(url + " crawling..")
    movie_total.extend(movie_crawler(url))

http://www.imdb.com/movies-coming-soon/2015-01 crawling..
32
http://www.imdb.com/movies-coming-soon/2015-02 crawling..
23
http://www.imdb.com/movies-coming-soon/2015-03 crawling..
27
http://www.imdb.com/movies-coming-soon/2015-04 crawling..
29
http://www.imdb.com/movies-coming-soon/2015-05 crawling..
39
http://www.imdb.com/movies-coming-soon/2015-06 crawling..
31
http://www.imdb.com/movies-coming-soon/2015-07 crawling..
38
http://www.imdb.com/movies-coming-soon/2015-08 crawling..
32
http://www.imdb.com/movies-coming-soon/2015-09 crawling..
32
http://www.imdb.com/movies-coming-soon/2015-10 crawling..
43
http://www.imdb.com/movies-coming-soon/2015-11 crawling..
30
http://www.imdb.com/movies-coming-soon/2015-12 crawling..
33


In [45]:
def write_csv(data, filename, delimiter=",", quotechar='"'):
    """Write `data` to `filename` as UTF-8 encoded CSV and return `filename`.

    `data` is an iterable of rows, each row an iterable of fields.
    Unicode text fields are encoded to UTF-8 before writing, because the
    Python 2 csv module writes byte strings. Byte strings and non-string
    values are passed through unchanged, so already-encoded rows and
    numeric fields no longer raise when `.encode` is applied to them.

    `delimiter` and `quotechar` are passed straight to `csv.writer`.
    """
    # `unicode` on Python 2, the version this notebook is written for.
    text_type = type(u'')

    # Encode before opening the file so a bad row cannot leave a truncated
    # file behind.
    encoded_rows = [
        [item.encode('utf-8') if isinstance(item, text_type) else item
         for item in row]
        for row in data
    ]

    # NOTE(review): the Python 2 csv docs recommend binary mode ('wb') on
    # platforms where it makes a difference (e.g. Windows) -- confirm
    # before changing.
    with open(filename, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=delimiter, quotechar=quotechar)
        writer.writerows(encoded_rows)

    return filename

In [44]:
# Save the crawled rows in movie_total to movie_info.csv.
write_csv(movie_total, "movie_info.csv")

'movie_info.csv'

In [48]:
# 자, 이제 파이썬 데이터 분석툴 pandas 소개합니다.
from pandas import DataFrame

In [50]:
# Build a DataFrame from the crawled rows. The column names follow the
# field order of each row: title, score, genre, running time, actors.
movie_df = DataFrame(
    movie_total,
    columns=['title', 'score', 'genre', 'running time', 'actors'],
)

In [51]:
# Number of rows in the DataFrame.
len(movie_df)

389

In [52]:
# Save in CSV format (UTF-8 encoded), without the index column.
movie_df.to_csv('./movie_from_df.csv', encoding='utf-8', index=False) # pass header=False as well to omit the header row

In [53]:
import pandas as pd
# Load ./movie_from_df.csv back into a DataFrame and preview the first rows.
movie_df = pd.read_csv('./movie_from_df.csv')
movie_df.head()