In [40]:
import pandas as pd

# 예제 2-24 데이터프레임 칼럼 추가

In [41]:
# 특정 열을 지정하여 데이터 프레임 생성

In [42]:
# Read the movies CSV, keeping only the two columns needed for this example.
movies = pd.read_csv(
    "../data/movies.csv",
    usecols=['Film', 'Year'],
)

In [43]:
movies.head()

Unnamed: 0,Film,Year
0,Zack and Miri Make a Porno,2008
1,Youth in Revolt,2010
2,You Will Meet a Tall Dark Stranger,2010
3,When in Rome,2010
4,What Happens in Vegas,2008


In [44]:
# 기본모드는 전체 열(칼럼)을 가져온다. 위에서는 칼럼을 두개 지정했다.

In [45]:
movies_add = pd.read_csv("../data/movies.csv")

In [46]:
movies_add.head()

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008


In [47]:
# has_seen의 열을 추가하여 모든 초기 값을 0으로 한다.

In [48]:
# df[new_column_name] = initial_value
# Creates a new column and sets every row of that column to the initial value.
movies['has_seen'] = 0

In [49]:
movies.head()

Unnamed: 0,Film,Year,has_seen
0,Zack and Miri Make a Porno,2008,0
1,Youth in Revolt,2010,0
2,You Will Meet a Tall Dark Stranger,2010,0
3,When in Rome,2010,0
4,What Happens in Vegas,2008,0


In [50]:
movies_add.columns

Index(['Film', 'Genre', 'Lead Studio', 'Audience score %', 'Profitability',
       'Rotten Tomatoes %', 'Worldwide Gross', 'Year'],
      dtype='object')

In [51]:
movies_add.head()

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008


In [52]:
# Overwrite has_seen with the floor-divided mean of the two score columns
# taken from the full frame (rows are matched by index label).
audience_score = movies_add['Audience score %']
tomato_score = movies_add['Rotten Tomatoes %']
movies['has_seen'] = (audience_score + tomato_score) // 2

In [53]:
# 첫번째 행의 결과를 보면 (70+64)//2 => 67

In [54]:
movies.head()

Unnamed: 0,Film,Year,has_seen
0,Zack and Miri Make a Porno,2008,67
1,Youth in Revolt,2010,60
2,You Will Meet a Tall Dark Stranger,2010,39
3,When in Rome,2010,29
4,What Happens in Vegas,2008,50


In [55]:
movies['has_seen'].isnull().sum()

0

In [56]:
movies_add['Worldwide Gross'].dtype # object dtype

dtype('O')

In [57]:
# Worldwide Gross 열의 모든 $값을 없앤다.
# (정규식은 사용하지 않는다.) 그리고 타입을 astype 함수를 사용해 float64로 바꾼다.

In [58]:
# To remove a specific character, replace it with '' (the empty string).
# When a column holds strings (object dtype), the .str.replace method can be used.
# The dtype can then be converted with astype.
# The regex flag controls regular-expression matching; regex=False here makes
# '$' a literal character rather than a pattern.
d = movies_add['Worldwide Gross'].str.replace('$','',regex = False).astype('float64')

In [59]:
d

0      41.94
1      19.62
2      26.66
3      43.04
4     219.37
       ...  
72     29.37
73     30.68
74      8.97
75    160.31
76     60.72
Name: Worldwide Gross, Length: 77, dtype: float64

In [60]:
movies.head()

Unnamed: 0,Film,Year,has_seen
0,Zack and Miri Make a Porno,2008,67
1,Youth in Revolt,2010,60
2,You Will Meet a Tall Dark Stranger,2010,39
3,When in Rome,2010,29
4,What Happens in Vegas,2008,50


# 예제 2-25 시리즈와 데이터프레임 내의 특정 원소 변경

In [61]:
movies.loc[74,'has_seen']

84

In [62]:
import numpy as np

In [63]:
# Cast the integer column to float64 before inserting the missing value:
# NaN is a float, and writing it directly into an int64 column relies on an
# implicit upcast that recent pandas versions deprecate with a FutureWarning.
movies['has_seen'] = movies['has_seen'].astype('float64')
movies.loc[74, 'has_seen'] = np.nan

In [64]:
movies.loc[74,'has_seen']

nan

In [65]:
movies['has_seen'].isnull().sum()

1

In [66]:
movies.columns

Index(['Film', 'Year', 'has_seen'], dtype='object')

In [67]:
movies_ser_at = movies['Year']

In [68]:
movies_ser_at.head()

0    2008
1    2010
2    2010
3    2010
4    2008
Name: Year, dtype: int64

In [69]:
movies_ser_at.get(0)

2008

In [70]:
movies_ser_at.head()

0    2008
1    2010
2    2010
3    2010
4    2008
Name: Year, dtype: int64

# 예제 2-26 시리즈와 데이터프레임의 행과 열 삭제

In [71]:
movies_del =  pd.read_csv("../data/movies.csv")

In [72]:
movies_del.shape

(77, 8)

In [73]:
movies_del.head()

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008


In [74]:
# 'Rotten Tomatoes %' 열을 삭제 axis = 1 <= 삭제 대상이 열

In [76]:
# Drop a single column by name; the result is bound to a new variable.
movies_del1 = movies_del.drop(columns='Rotten Tomatoes %')

In [77]:
movies_del1.head()

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,$219.37,2008


In [78]:
movies_del1.shape

(77, 7)

In [79]:
# 1,2,3,4행 삭제, axis=0 <= 삭제 대상이 행

In [80]:
# Drop the rows whose index labels are 1 through 4.
movies_del2 = movies_del.drop(index=[1, 2, 3, 4])

In [81]:
movies_del2.shape

(73, 8)

In [82]:
movies_del2.head()

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
5,Water For Elephants,Drama,20th Century Fox,72,3.081421,60,$117.09,2011
6,WALL-E,Animation,Disney,89,2.896019,96,$521.28,2008
7,Waitress,Romance,Independent,67,11.089742,89,$22.18,2007
8,Waiting For Forever,Romance,Independent,53,0.005,6,$0.03,2011


In [83]:
movies_del.shape

(77, 8)

In [84]:
# Drop two columns at once; the result is bound to a new variable.
movies_del3 = movies_del.drop(columns=['Lead Studio', 'Worldwide Gross'])

In [85]:
movies_del3.shape

(77, 6)

In [86]:
movies_del3.head()

Unnamed: 0,Film,Genre,Audience score %,Profitability,Rotten Tomatoes %,Year
0,Zack and Miri Make a Porno,Romance,70,1.747542,64,2008
1,Youth in Revolt,Comedy,52,1.09,68,2010
2,You Will Meet a Tall Dark Stranger,Comedy,35,1.211818,43,2010
3,When in Rome,Comedy,44,0.0,15,2010
4,What Happens in Vegas,Comedy,72,6.267647,28,2008


In [87]:
# Same two columns as above, but inplace=True removes them from movies_del
# itself instead of producing a new frame.
movies_del.drop(columns=['Lead Studio', 'Worldwide Gross'], inplace=True)

In [88]:
movies_del.head()

Unnamed: 0,Film,Genre,Audience score %,Profitability,Rotten Tomatoes %,Year
0,Zack and Miri Make a Porno,Romance,70,1.747542,64,2008
1,Youth in Revolt,Comedy,52,1.09,68,2010
2,You Will Meet a Tall Dark Stranger,Comedy,35,1.211818,43,2010
3,When in Rome,Comedy,44,0.0,15,2010
4,What Happens in Vegas,Comedy,72,6.267647,28,2008


In [89]:
movies_ser = movies_del['Genre']

In [90]:
movies_ser.head()

0    Romance
1     Comedy
2     Comedy
3     Comedy
4     Comedy
Name: Genre, dtype: object

In [91]:
# Build the filtered Series from the source column rather than dropping in
# place: the cell can be re-run without a KeyError (labels 1-4 would already
# be gone on a second in-place drop), and no Series is mutated.
movies_ser = movies_del['Genre'].drop([1, 2, 3, 4])

In [92]:
movies_ser.head()

0      Romance
5        Drama
6    Animation
7      Romance
8      Romance
Name: Genre, dtype: object

# 예제 2-27 문자열 조회

In [93]:
# Load the Korean movie list. The file is decoded with the 'ms949' codec
# (a Python alias for cp949); presumably it was saved in a Korean Windows
# encoding — confirm against the data source.
movies_str = pd.read_csv("../data/korea_movie_list.csv", encoding='ms949')

In [94]:
movies_str.columns

Index(['movie_code', 'title', 'title_Eng', 'show_time', 'produce_year',
       'open_date', 'produce_state', 'type', 'nation', 'genre', 'director',
       'actor', 'show_type', 'watch_grade'],
      dtype='object')

In [95]:
movies_str.shape

(2827, 14)

In [96]:
movies_str.head(2)

Unnamed: 0,movie_code,title,title_Eng,show_time,produce_year,open_date,produce_state,type,nation,genre,director,actor,show_type,watch_grade
0,20185801,할로우 차일드,The Hollow Child,88.0,2017,20180802,개봉예정,장편,캐나다,공포(호러)/판타지,제레미 루터,,,15세이상관람가
1,20187649,죽음의 리무진,Glass Coffin,75.0,2016,20180816,개봉예정,장편,스페인,스릴러/공포(호러),하리츠 쥬빌라가,파울라 본템피,,


In [97]:
movies_str_ser = movies_str['title']

In [98]:
movies_str_ser.head()

0              할로우 차일드
1              죽음의 리무진
2    극장판 도라에몽: 진구의 보물섬
3     명탐정 코난 : 제로의 집행인
4              살아남은 아이
Name: title, dtype: object

In [102]:
movies_str_eng = movies_str['title_Eng']

In [103]:
movies_str_eng.head()

0                      The Hollow Child
1                          Glass Coffin
2                                   NaN
3    Detective Conan: Zero the Enforcer
4                            Last Child
Name: title_Eng, dtype: object

In [100]:
# hasnans => has nans의 의미임. 누락된 값이 있으면 True 아니면 False

In [101]:
movies_str_ser.hasnans

False

In [104]:
movies_str_eng.hasnans

True

In [105]:
movies_str_eng.shape

(2827,)

In [106]:
# Keep only the non-missing English titles; dropna returns a new Series,
# which is rebound to the same name.
movies_str_eng = movies_str_eng.dropna()

In [107]:
movies_str_eng.shape

(2226,)

In [108]:
# Plain Python string used to compare built-in str methods with the
# Series .str accessor.
s = '가을'

In [109]:
len(s)

2

In [110]:
# 시리즈의 str 속성의 len 함수를 통해 전체 요소의 길이값 반환

In [111]:
movies_str_ser.str.len().head()

0     7
1     7
2    17
3    16
4     7
Name: title, dtype: int64

In [112]:
s.find('을')

1

In [113]:
movies_str_ser.str.find('할로우').head()

0    0
1   -1
2   -1
3   -1
4   -1
Name: title, dtype: int64

In [114]:
s.count('을')

1

In [115]:
movies_str_ser.str.count('의').head()

0    0
1    1
2    1
3    1
4    0
Name: title, dtype: int64

# 예제 2-28 문자열 변경

In [116]:
# String padded with one space on each side, used for the strip example.
si = ' 찬혁 '

In [117]:
si

' 찬혁 '

In [118]:
len(si)

4

In [119]:
si = si.strip()

In [120]:
len(si)

2

In [122]:
# Labels with stray leading and/or trailing spaces, used for the
# strip / lstrip / rstrip examples.
idx = pd.Index(
    [' 찬혁', '런던 ', ' 소셜 ', '겨울']
)

In [123]:
idx.str.strip()

Index(['찬혁', '런던', '소셜', '겨울'], dtype='object')

In [125]:
idx.str.lstrip()

Index(['찬혁', '런던 ', '소셜 ', '겨울'], dtype='object')

In [126]:
idx.str.rstrip()

Index([' 찬혁', '런던', ' 소셜', '겨울'], dtype='object')

In [127]:
import numpy as np

In [128]:
# Underscore-delimited strings with one missing value in the third position,
# used for the split / replace / cat examples.
ss = pd.Series(
    ['가_나_다', '라_마_바', np.nan, '사_아_자']
)

In [129]:
ss.str.split('_')

0    [가, 나, 다]
1    [라, 마, 바]
2          NaN
3    [사, 아, 자]
dtype: object

In [130]:
ss.str.split('_').str.get(1)

0      나
1      마
2    NaN
3      아
dtype: object

In [131]:
ss.str.split('_').str[1]

0      나
1      마
2    NaN
3      아
dtype: object

In [132]:
# expand = True를 지정하면 분리된 것을 기준으로 데이터프레임을 만들어 준다.

In [133]:
ss.str.split('_', expand = True)

Unnamed: 0,0,1,2
0,가,나,다
1,라,마,바
2,,,
3,사,아,자


In [134]:
ss.str.split('_', expand = True, n = 1)

Unnamed: 0,0,1
0,가,나_다
1,라,마_바
2,,
3,사,아_자


In [135]:
ss.str.rsplit('_', expand = True, n = 1)

Unnamed: 0,0,1
0,가_나,다
1,라_마,바
2,,
3,사_아,자


In [136]:
# Replace every underscore with '&'. regex=False states the literal-match
# intent explicitly, so the result does not depend on the pandas version's
# default for the `regex` parameter.
ss.str.replace('_', '&', regex=False)

0    가&나&다
1    라&마&바
2      NaN
3    사&아&자
dtype: object

In [137]:
ss.str.cat(sep = ',')

'가_나_다,라_마_바,사_아_자'

In [138]:
ss.str.cat()

'가_나_다라_마_바사_아_자'

In [139]:
ss.str.cat(['A','B','C','D'])

0    가_나_다A
1    라_마_바B
2       NaN
3    사_아_자D
dtype: object

In [140]:
# One single-letter suffix per row, used as the `others` argument of str.cat.
su = pd.Series(list('ABCD'))

In [141]:
ss.str.cat(su)

0    가_나_다A
1    라_마_바B
2       NaN
3    사_아_자D
dtype: object

In [142]:
ss.str[0]

0      가
1      라
2    NaN
3      사
dtype: object

In [143]:
ss.str[1]

0      _
1      _
2    NaN
3      _
dtype: object

In [144]:
ss.str[2]

0      나
1      마
2    NaN
3      아
dtype: object