# Numpy

## 리스트
 - 여러개의 요소를 담는 가변적인 표준컨테이너
 - 서로 다른 데이터 타입의 요소를 담는 리스트 생성 가능
 - 단점: 행렬연산등 처리의 어려움

In [23]:
L = list(range(10))
L

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [24]:
type(L[0])

int

In [25]:
L2 = [True, '2', 3.0, 4]
L2

[True, '2', 3.0, 4]

In [26]:
[type(item) for item in L2]

[bool, str, float, int]

In [27]:
for item in L2:
    print(type(item))

<class 'bool'>
<class 'str'>
<class 'float'>
<class 'int'>


### NumPy 패키지를 사용하여 행렬 연산 및 처리가 가능한 배열(Array) 생성
 - NumPy는 배열의 모든 요소가 같은 타입
 - 타입이 일치하지 않으면 NumPy는 가능한 경우 상위타입을 취함
 - dypte키워드를 사용하여 데이터 타입 설정도 가능
 - 리스트의 중첩을 통해 다차원배열 생성 가능

In [28]:
import numpy as np

In [29]:
np.array([1,4,5,3])

array([1, 4, 5, 3])

In [30]:
np.array([3.14,4,2,3])

array([3.14, 4.  , 2.  , 3.  ])

In [31]:
np.array([1,2,3,4], dtype='float32')  # 타입을 임의로 변경시킨 경우

array([1., 2., 3., 4.], dtype=float32)

In [32]:
np.array([[2,3,4],[5,6,7],[8,9,'10']])  # 타입이 일치하지 않은 경우

array([['2', '3', '4'],
       ['5', '6', '7'],
       ['8', '9', '10']], dtype='<U21')

In [33]:
np.zeros(10, dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [34]:
np.zeros((3,3))  # default 값은 실수로 적용

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [35]:
np.ones((3,5), dtype=int)

array([[1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [36]:
np.full((3,5), 3.14)

array([[3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14]])

In [37]:
np.arange(0, 20, 2)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [38]:
np.linspace(0, 1, 5)

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [39]:
np.random.random((10))

array([0.56097624, 0.93067846, 0.62577174, 0.88956108, 0.06544052,
       0.65834818, 0.49018486, 0.58299226, 0.87614619, 0.06233049])

In [40]:
np.random.random((3,3))

array([[0.07259404, 0.74707474, 0.27158441],
       [0.61673387, 0.47574257, 0.33165424],
       [0.15724527, 0.83609251, 0.36577762]])

In [41]:
# 정규분포(평균=0, 표준편차=1)의 난수로 구성된 3*3배열)
np.random.normal(0,1,(3,3))

array([[-1.56418587, -0.26233579,  0.97362653],
       [-0.56296579,  1.91899719, -1.19988653],
       [-1.56136357, -0.63636113,  1.39476111]])

In [42]:
np.random.randint(0,10,(3,3))

array([[7, 8, 2],
       [5, 3, 5],
       [3, 5, 0]])

### NumPy 내장 루틴 : 사전 정의된 배열 사용

In [43]:
# 3*3 단위행렬
np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [44]:
np.empty(3)

array([1., 1., 1.])

In [45]:
# NumPy 데이터 타입
# bool : 1바이트로 저장된 bool 값 (참 또는 거짓)
# int : int64 또는 int32
# Int8 : 바이트(-128~127)
# int16 : 정수(-32768~32767)
# int32 : 정수(-2147483648~2147483647)
# int64 : 정수(-9223372036854775808~9223372036854775807)
# unit8 : 부호없는정수(0~255)
# unit16 : 부호없는정수(0~65535)
# unit32 : 부호없는정수(0~4294967295)
# unit64 : 부호없는정수(0~18446744073709551615)
# float : float64
# float16 : 반정밀부동소수점,5비트지수,10비트가수
# float32 : 단정밀부동소수점,8비트지수,32비트가수
# float64 : 배정밀부동소수점,11비트지수,53비트가수
# complex : complex128
# complex64 : 복소수,두개의32비트부동소수점으로표현
# complex128 : 복소수,두개의64비트부동소수점으로표현

In [46]:
np.random.seed(123)  # random 으로 뽑은 수를 고정시켜준다.
x1 = np.random.randint(10, size=6)
x2 = np.random.randint(10, size=(3,4))
x3 = np.random.randint(10, size=(3,4,5))
x3

array([[[4, 1, 7, 3, 2],
        [4, 7, 2, 4, 8],
        [0, 7, 9, 3, 4],
        [6, 1, 5, 6, 2]],

       [[1, 8, 3, 5, 0],
        [2, 6, 2, 4, 4],
        [6, 3, 0, 6, 4],
        [7, 6, 7, 1, 5]],

       [[7, 9, 2, 4, 8],
        [1, 2, 1, 1, 3],
        [5, 9, 0, 8, 1],
        [6, 3, 3, 5, 9]]])

In [47]:
print(x3.ndim)  # 수열의 차원의 수를 알려준다.
print(x3.shape)  # 설정한 각각의 차원에 들어간 값을 보여줌
print(x3.size)
print(x3.dtype)
print(x3.itemsize)
print(x3.nbytes)

3
(3, 4, 5)
60
int64
8
480


In [48]:
x1

array([2, 2, 6, 1, 3, 9])

In [49]:
x1[-1]

9

In [50]:
x2

array([[6, 1, 0, 1],
       [9, 0, 0, 9],
       [3, 4, 0, 0]])

In [51]:
x2[1,3]

9

In [52]:
x2[1,-1]

9

In [53]:
# 배열 슬라이싱
x = np.arange(10)

In [54]:
print(x)
print(x[5:])
print(x[4:7])
print(x[0:2])
print(x[::2])
print(x[1::2])
print(x[::-1])
print(x[5::-2])

[0 1 2 3 4 5 6 7 8 9]
[5 6 7 8 9]
[4 5 6]
[0 1]
[0 2 4 6 8]
[1 3 5 7 9]
[9 8 7 6 5 4 3 2 1 0]
[5 3 1]


In [55]:
x2

array([[6, 1, 0, 1],
       [9, 0, 0, 9],
       [3, 4, 0, 0]])

In [56]:
x2[:2, :3]

array([[6, 1, 0],
       [9, 0, 0]])

In [57]:
x2[1:, 2:]

array([[0, 9],
       [0, 0]])

In [58]:
x2[:, 0]

array([6, 9, 3])

In [59]:
x2_sub = x2[:2, :2]
x2_sub

array([[6, 1],
       [9, 0]])

In [60]:
x2_sub[0,0] = 99
x2_sub

array([[99,  1],
       [ 9,  0]])

In [61]:
x2

array([[99,  1,  0,  1],
       [ 9,  0,  0,  9],
       [ 3,  4,  0,  0]])

In [62]:
x2_sub[0,0] = 6
x2

array([[6, 1, 0, 1],
       [9, 0, 0, 9],
       [3, 4, 0, 0]])

In [63]:
x2

array([[6, 1, 0, 1],
       [9, 0, 0, 9],
       [3, 4, 0, 0]])

In [64]:
x2_sub = x2[:2, :2].copy()
x2_sub

array([[6, 1],
       [9, 0]])

In [65]:
x2_sub[0,0] = 99
x2_sub

array([[99,  1],
       [ 9,  0]])

In [66]:
x2

array([[6, 1, 0, 1],
       [9, 0, 0, 9],
       [3, 4, 0, 0]])

### 배열 재구조화(형상 변경)
 - reshape() 메소드를 사용
 - 1차원 배열을 2차원 행이나 열 매트릭스로 전환하는 경우에는 슬라이스 연산 내에 newaxis 키워드 사용

In [67]:
grid = np.arange(1, 10)
x = np.array([1,2,3])
print(grid)
print(x)

[1 2 3 4 5 6 7 8 9]
[1 2 3]


In [68]:
x.reshape(3,1)

array([[1],
       [2],
       [3]])

In [69]:
x[:, np.newaxis]

array([[1],
       [2],
       [3]])

In [70]:
grid.reshape(3,3)

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [71]:
grid[:, np.newaxis]

array([[1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]])

In [72]:
# 배열 결합
x = np.array([1,2,3])
y = np.array([3,2,1])
np.concatenate([x,y])

array([1, 2, 3, 3, 2, 1])

In [73]:
z = [99,99,99]
np.concatenate([x,y,z])

array([ 1,  2,  3,  3,  2,  1, 99, 99, 99])

In [74]:
grid = np.array([[1,2,3],[4,5,6]])
np.concatenate([grid,grid])

array([[1, 2, 3],
       [4, 5, 6],
       [1, 2, 3],
       [4, 5, 6]])

In [75]:
np.concatenate([grid,grid], axis=1)  # axis = 0 은 세로축, axis = 1 은 가로축

array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

In [76]:
# 수직 쌓기
x = np.array([1,2,3])
grid = np.array([[9,8,7],[6,5,4]])
np.vstack([x,grid])

array([[1, 2, 3],
       [9, 8, 7],
       [6, 5, 4]])

In [77]:
# 수평 쌓기
y = np.array([[99],[99]])
np.hstack([grid,y])

array([[ 9,  8,  7, 99],
       [ 6,  5,  4, 99]])

In [78]:
x = [1,2,3,99,99,3,2,1]
x1,x2,x3 = np.split(x, [3,5])
print(x1,x2,x3)

[1 2 3] [99 99] [3 2 1]


In [79]:
grid = np.arange(1,17).reshape((4,4))
grid

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16]])

In [80]:
Upp,Low = np.vsplit(grid,[2])
print(Upp)
print(Low)

[[1 2 3 4]
 [5 6 7 8]]
[[ 9 10 11 12]
 [13 14 15 16]]


In [81]:
Left,Right = np.hsplit(grid,[2])
print(Left)
print(Right)

[[ 1  2]
 [ 5  6]
 [ 9 10]
 [13 14]]
[[ 3  4]
 [ 7  8]
 [11 12]
 [15 16]]


## 출력 지정

In [82]:
x = np.arange(5)
y = np.empty(5)
np.multiply(x,10, out=y)

array([ 0., 10., 20., 30., 40.])

In [83]:
x

array([0, 1, 2, 3, 4])

In [84]:
y = np.zeros(5)
np.power(2,x, out=y)

array([ 1.,  2.,  4.,  8., 16.])

## 집계

In [85]:
x = np.arange(1,11)
np.add.reduce(x)

55

In [86]:
np.multiply.reduce(x)

3628800

In [87]:
np.add.accumulate(x)

array([ 1,  3,  6, 10, 15, 21, 28, 36, 45, 55])

In [88]:
np.multiply.accumulate(x)

array([      1,       2,       6,      24,     120,     720,    5040,
         40320,  362880, 3628800])

## Pandas
### 데이터 프레임의 이해

In [89]:
import pandas as pd

In [90]:
movie = pd.read_csv('data/movie.csv')
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [91]:
index = movie.index
index

RangeIndex(start=0, stop=4916, step=1)

In [92]:
columns = movie.columns
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [93]:
data = movie.values

In [94]:
index.values

array([   0,    1,    2, ..., 4913, 4914, 4915])

In [95]:
columns.values

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'], dtype=object)

In [96]:
movie.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
m

In [97]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4916 entries, 0 to 4915
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      4897 non-null   object 
 1   director_name              4814 non-null   object 
 2   num_critic_for_reviews     4867 non-null   float64
 3   duration                   4901 non-null   float64
 4   director_facebook_likes    4814 non-null   float64
 5   actor_3_facebook_likes     4893 non-null   float64
 6   actor_2_name               4903 non-null   object 
 7   actor_1_facebook_likes     4909 non-null   float64
 8   gross                      4054 non-null   float64
 9   genres                     4916 non-null   object 
 10  actor_1_name               4909 non-null   object 
 11  movie_title                4916 non-null   object 
 12  num_voted_users            4916 non-null   int64  
 13  cast_total_facebook_likes  4916 non-null   int64

## Series

In [98]:
# Series 메서드
"""
head() : 데이터일부를표시
value_counts() : 고유값의빈도수계산(Series만지원) / normalize=True로설정하면상대빈도
size, shape, len(x) : 원소개수계산
count():결측치를제외한개수
min(), max(), mean(), median(), std(), sum()
describe() : 요약통계량제공(개수, 평균, 표준편차, 최소값, 4분위수, 최대값)
quantile() : 정확한분위수계산
isnull():결측치체크<-> notnull()
fillna() : 결측치를한번에특정값으로변환
dropna() : 결측치제거
hasnans: 하나라도결측치가있으면True (Series만지원)
astype() : 데이터형식변경(결측치가있는경우오류)
"""

'\nhead() : 데이터일부를표시\nvalue_counts() : 고유값의빈도수계산(Series만지원) / normalize=True로설정하면상대빈도\uf0a7\nsize, shape, len(x) : 원소개수계산\uf0a7\ncount():결측치를제외한개수\nmin(), max(), mean(), median(), std(), sum()\ndescribe() : 요약통계량제공(개수, 평균, 표준편차, 최소값, 4분위수, 최대값)\nquantile() : 정확한분위수계산\nisnull():결측치체크<-> notnull()\nfillna() : 결측치를한번에특정값으로변환\ndropna() : 결측치제거\nhasnans: 하나라도결측치가있으면True (Series만지원)\nastype() : 데이터형식변경(결측치가있는경우오류)\n'

In [99]:
director = movie['director_name']
director.head(20)

0         James Cameron
1        Gore Verbinski
2            Sam Mendes
3     Christopher Nolan
4           Doug Walker
5        Andrew Stanton
6             Sam Raimi
7          Nathan Greno
8           Joss Whedon
9           David Yates
10          Zack Snyder
11         Bryan Singer
12         Marc Forster
13       Gore Verbinski
14       Gore Verbinski
15          Zack Snyder
16       Andrew Adamson
17          Joss Whedon
18         Rob Marshall
19     Barry Sonnenfeld
Name: director_name, dtype: object

In [100]:
score = movie['imdb_score']
score.head()

0    7.9
1    7.1
2    6.8
3    8.5
4    7.1
Name: imdb_score, dtype: float64

In [101]:
## 반드시 기억! 중요한 함수
director.value_counts()   # 반복되는 value 를 모아서 출력해 줌

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Martin Scorsese     20
Spike Lee           16
                    ..
Robert Stevenson     1
Måns Mårlind         1
Rebecca Miller       1
Nanette Burstein     1
Ritesh Batra         1
Name: director_name, Length: 2397, dtype: int64

In [102]:
director.value_counts(normalize=True)

Steven Spielberg    0.005401
Woody Allen         0.004570
Clint Eastwood      0.004155
Martin Scorsese     0.004155
Spike Lee           0.003324
                      ...   
Robert Stevenson    0.000208
Måns Mårlind        0.000208
Rebecca Miller      0.000208
Nanette Burstein    0.000208
Ritesh Batra        0.000208
Name: director_name, Length: 2397, dtype: float64

In [103]:
director.size

4916

In [104]:
director.shape

(4916,)

In [105]:
movie.shape

(4916, 28)

In [106]:
len(director)

4916

In [107]:
director.count()

4814

In [108]:
score.describe()

count    4916.000000
mean        6.437429
std         1.127802
min         1.600000
25%         5.800000
50%         6.600000
75%         7.200000
max         9.500000
Name: imdb_score, dtype: float64

In [109]:
# 분위수
score.quantile(.2)

5.6

In [110]:
s = pd.Series([1,2,3,4])
s.quantile(.5)

2.5

In [111]:
s.quantile([.25,.5,.75])

0.25    1.75
0.50    2.50
0.75    3.25
dtype: float64

In [112]:
director.isnull().sum()

102

In [113]:
director.count()  # null 이 아닌 값

4814

In [114]:
director.size  # null 값 포함

4916

In [115]:
director_fill = director.fillna('Unknown')
director_fill.count()

4916

In [151]:
director_drop = director.dropna()
director_drop.size

4814

In [117]:
# NaN 값 파악
director.hasnans

True

In [118]:
score.dtype

dtype('float64')

In [119]:
score.astype(int)  # 숫자의 형태를 바꾸는 함수

0       7
1       7
2       6
3       8
4       7
       ..
4911    7
4912    7
4913    6
4914    6
4915    6
Name: imdb_score, Length: 4916, dtype: int64

## 메서드 체인

In [120]:
director.value_counts().head()

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Martin Scorsese     20
Spike Lee           16
Name: director_name, dtype: int64

In [121]:
score.fillna(0).astype(int).head()

0    7
1    7
2    6
3    8
4    7
Name: imdb_score, dtype: int64

## 열과 행 다루기

In [122]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [123]:
movie.movie_title.head()

0                                        Avatar
1      Pirates of the Caribbean: At World's End
2                                       Spectre
3                         The Dark Knight Rises
4    Star Wars: Episode VII - The Force Awakens
Name: movie_title, dtype: object

In [124]:
movie2 = movie.set_index('movie_title')
movie2.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [125]:
movie2 = pd.read_csv('data/movie.csv', index_col='movie_title')
movie2.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [126]:
movie2 = movie2.reset_index()
movie2.head()

Unnamed: 0,movie_title,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,...,,,,,,,12.0,7.1,,0


In [155]:
import pandas as pd

In [128]:
movie2_rename = movie2.rename(index={0:'아바타'}, columns={'color':'색깔', 'director_name':'감독'})
movie2_rename.head()

Unnamed: 0,movie_title,색깔,감독,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
아바타,Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,...,,,,,,,12.0,7.1,,0


In [129]:
# 컬럼을 리스트로 생성
col = movie2.columns
col_list = col.tolist()
col_list

['movie_title',
 'color',
 'director_name',
 'num_critic_for_reviews',
 'duration',
 'director_facebook_likes',
 'actor_3_facebook_likes',
 'actor_2_name',
 'actor_1_facebook_likes',
 'gross',
 'genres',
 'actor_1_name',
 'num_voted_users',
 'cast_total_facebook_likes',
 'actor_3_name',
 'facenumber_in_poster',
 'plot_keywords',
 'movie_imdb_link',
 'num_user_for_reviews',
 'language',
 'country',
 'content_rating',
 'budget',
 'title_year',
 'actor_2_facebook_likes',
 'imdb_score',
 'aspect_ratio',
 'movie_facebook_likes']

In [130]:
col_list[0] = '색깔1'
col_list[1] = '감독1'

In [131]:
col_list

['색깔1',
 '감독1',
 'director_name',
 'num_critic_for_reviews',
 'duration',
 'director_facebook_likes',
 'actor_3_facebook_likes',
 'actor_2_name',
 'actor_1_facebook_likes',
 'gross',
 'genres',
 'actor_1_name',
 'num_voted_users',
 'cast_total_facebook_likes',
 'actor_3_name',
 'facenumber_in_poster',
 'plot_keywords',
 'movie_imdb_link',
 'num_user_for_reviews',
 'language',
 'country',
 'content_rating',
 'budget',
 'title_year',
 'actor_2_facebook_likes',
 'imdb_score',
 'aspect_ratio',
 'movie_facebook_likes']

In [132]:
movie.columns = col_list
movie.head()

Unnamed: 0,색깔1,감독1,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [133]:
movie.drop(range(10), axis='index')

Unnamed: 0,색깔1,감독1,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
10,Color,Zack Snyder,673.0,183.0,0.0,2000.0,Lauren Cohan,15000.0,330249062.0,Action|Adventure|Sci-Fi,...,3018.0,English,USA,PG-13,250000000.0,2016.0,4000.0,6.9,2.35,197000
11,Color,Bryan Singer,434.0,169.0,0.0,903.0,Marlon Brando,18000.0,200069408.0,Action|Adventure|Sci-Fi,...,2367.0,English,USA,PG-13,209000000.0,2006.0,10000.0,6.1,2.35,0
12,Color,Marc Forster,403.0,106.0,395.0,393.0,Mathieu Amalric,451.0,168368427.0,Action|Adventure,...,1243.0,English,UK,PG-13,200000000.0,2008.0,412.0,6.7,2.35,0
13,Color,Gore Verbinski,313.0,151.0,563.0,1000.0,Orlando Bloom,40000.0,423032628.0,Action|Adventure|Fantasy,...,1832.0,English,USA,PG-13,225000000.0,2006.0,5000.0,7.3,2.35,5000
14,Color,Gore Verbinski,450.0,150.0,563.0,1000.0,Ruth Wilson,40000.0,89289910.0,Action|Adventure|Western,...,711.0,English,USA,PG-13,215000000.0,2013.0,2000.0,6.5,2.35,48000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [141]:
movie.drop(['색깔1','감독1'], axis='columns')

Unnamed: 0,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,Rob Walker,131.0,,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,Eric Mabius,Signed Sealed Delivered,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
4912,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,Natalie Zea,The Following,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
4913,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,Eva Boehnke,A Plague So Pleasant,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,Alan Ruck,Shanghai Calling,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [143]:
# 컬럼 위치
avg_index = movie.columns.get_loc('duration') + 1

In [147]:
movie.insert(loc=avg_index, column='평균리뷰', value=movie['num_critic_for_reviews']/movie['duration'])
movie

Unnamed: 0,색깔1,감독1,director_name,num_critic_for_reviews,duration,평균리뷰,평균리뷰수,director_facebook_likes,actor_3_facebook_likes,actor_2_name,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,inf,inf,855.0,Joel David Moore,1000.0,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,0.300178,0.300178,1000.0,Orlando Bloom,40000.0,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,inf,inf,161.0,Rory Kinnear,11000.0,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,0.007455,0.007455,23000.0,Christian Bale,27000.0,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,,,Rob Walker,131.0,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,43.500000,43.500000,318.0,Daphne Zuniga,637.0,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
4912,Color,,43.0,43.0,,,,319.0,Valorie Curry,841.0,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,inf,inf,0.0,Maxwell Moody,0.0,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,inf,inf,489.0,Daniel Henney,946.0,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [148]:
movie.drop(['평균리뷰'], axis='columns')

Unnamed: 0,색깔1,감독1,director_name,num_critic_for_reviews,duration,평균리뷰수,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,inf,855.0,Joel David Moore,1000.0,760505847.0,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,0.300178,1000.0,Orlando Bloom,40000.0,309404152.0,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,inf,161.0,Rory Kinnear,11000.0,200074175.0,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,0.007455,23000.0,Christian Bale,27000.0,448130642.0,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,,Rob Walker,131.0,,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,43.500000,318.0,Daphne Zuniga,637.0,,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
4912,Color,,43.0,43.0,,,319.0,Valorie Curry,841.0,,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,inf,0.0,Maxwell Moody,0.0,,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,inf,489.0,Daniel Henney,946.0,10443.0,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [149]:
movie[['감독1']]

Unnamed: 0,감독1
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker
...,...
4911,Scott Smith
4912,
4913,Benjamin Roberds
4914,Daniel Hsia


In [152]:
# 과제 데이터 프레임의 이해
# series
# 메서드체인
# 열과행다루기
# flight, songs, breast_cancer

## 1. DataFrame 열 선택

### 1.1. 리스트를 사용한 열 선택

In [1]:
import pandas as pd

In [3]:
movie = pd.read_csv('data/movie.csv')
movie

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [162]:
movie_actor = movie[['actor_1_name','actor_2_name','actor_3_name']]
movie_actor.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name
0,CCH Pounder,Joel David Moore,Wes Studi
1,Johnny Depp,Orlando Bloom,Jack Davenport
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt
4,Doug Walker,Rob Walker,


In [164]:
movie[['director_name']].head()

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


In [165]:
cols = ['actor_1_name','actor_2_name','actor_3_name']
movie[cols].head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name
0,CCH Pounder,Joel David Moore,Wes Studi
1,Johnny Depp,Orlando Bloom,Jack Davenport
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt
4,Doug Walker,Rob Walker,


### 1.2. 메서드를 사용한 열 선택

In [167]:
movie.get_dtype_counts() # 각 특정 데이터 형식의 열 개수 출력

AttributeError: 'DataFrame' object has no attribute 'get_dtype_counts'

In [169]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4916 entries, 0 to 4915
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      4897 non-null   object 
 1   director_name              4814 non-null   object 
 2   num_critic_for_reviews     4867 non-null   float64
 3   duration                   4901 non-null   float64
 4   director_facebook_likes    4814 non-null   float64
 5   actor_3_facebook_likes     4893 non-null   float64
 6   actor_2_name               4903 non-null   object 
 7   actor_1_facebook_likes     4909 non-null   float64
 8   gross                      4054 non-null   float64
 9   genres                     4916 non-null   object 
 10  actor_1_name               4909 non-null   object 
 11  movie_title                4916 non-null   object 
 12  num_voted_users            4916 non-null   int64  
 13  cast_total_facebook_likes  4916 non-null   int64

In [168]:
movie.select_dtypes(include=['int64']).head()

Unnamed: 0,num_voted_users,cast_total_facebook_likes,movie_facebook_likes
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


In [170]:
movie.select_dtypes(include=['number']).head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


In [171]:
movie.filter(like='facebook').head()

Unnamed: 0,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,cast_total_facebook_likes,actor_2_facebook_likes,movie_facebook_likes
0,0.0,855.0,1000.0,4834,936.0,33000
1,563.0,1000.0,40000.0,48350,5000.0,0
2,0.0,161.0,11000.0,11700,393.0,85000
3,22000.0,23000.0,27000.0,106759,23000.0,164000
4,131.0,,131.0,143,12.0,0


In [172]:
movie.filter(regex='\d').head()

Unnamed: 0,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,actor_1_name,actor_3_name,actor_2_facebook_likes
0,855.0,Joel David Moore,1000.0,CCH Pounder,Wes Studi,936.0
1,1000.0,Orlando Bloom,40000.0,Johnny Depp,Jack Davenport,5000.0
2,161.0,Rory Kinnear,11000.0,Christoph Waltz,Stephanie Sigman,393.0
3,23000.0,Christian Bale,27000.0,Tom Hardy,Joseph Gordon-Levitt,23000.0
4,,Rob Walker,131.0,Doug Walker,,12.0


## 2. 전체 DataFrame 연산

### 2.1. Series 속성과 메서드 사용

In [175]:
movie.shape

(4916, 28)

In [176]:
movie.size

137648

In [177]:
movie.ndim

2

In [179]:
movie.count()

color                        4897
director_name                4814
num_critic_for_reviews       4867
duration                     4901
director_facebook_likes      4814
actor_3_facebook_likes       4893
actor_2_name                 4903
actor_1_facebook_likes       4909
gross                        4054
genres                       4916
actor_1_name                 4909
movie_title                  4916
num_voted_users              4916
cast_total_facebook_likes    4916
actor_3_name                 4893
facenumber_in_poster         4903
plot_keywords                4764
movie_imdb_link              4916
num_user_for_reviews         4895
language                     4904
country                      4911
content_rating               4616
budget                       4432
title_year                   4810
actor_2_facebook_likes       4903
imdb_score                   4916
aspect_ratio                 4590
movie_facebook_likes         4916
dtype: int64

In [180]:
movie.min()  # 최소값

num_critic_for_reviews                                                       1
duration                                                                     7
director_facebook_likes                                                      0
actor_3_facebook_likes                                                       0
actor_1_facebook_likes                                                       0
gross                                                                      162
genres                                                                  Action
movie_title                                                            #Horror
num_voted_users                                                              5
cast_total_facebook_likes                                                    0
facenumber_in_poster                                                         0
movie_imdb_link              http://www.imdb.com/title/tt0006864/?ref_=fn_t...
num_user_for_reviews                                

In [181]:
movie.max()  # 최대값

num_critic_for_reviews                                                     813
duration                                                                   511
director_facebook_likes                                                  23000
actor_3_facebook_likes                                                   23000
actor_1_facebook_likes                                                  640000
gross                                                              7.60506e+08
genres                                                                 Western
movie_title                                                           Æon Flux
num_voted_users                                                        1689764
cast_total_facebook_likes                                               656730
facenumber_in_poster                                                        43
movie_imdb_link              http://www.imdb.com/title/tt5574490/?ref_=fn_t...
num_user_for_reviews                                

In [182]:
movie.mean()  # 평균값(문자열 값들 제외)

num_critic_for_reviews       1.379889e+02
duration                     1.070908e+02
director_facebook_likes      6.910145e+02
actor_3_facebook_likes       6.312763e+02
actor_1_facebook_likes       6.494488e+03
gross                        4.764451e+07
num_voted_users              8.264492e+04
cast_total_facebook_likes    9.579816e+03
facenumber_in_poster         1.377320e+00
num_user_for_reviews         2.676688e+02
budget                       3.654749e+07
title_year                   2.002448e+03
actor_2_facebook_likes       1.621924e+03
imdb_score                   6.437429e+00
aspect_ratio                 2.222349e+00
movie_facebook_likes         7.348294e+03
dtype: float64

In [183]:
movie.median()  # 중간값

num_critic_for_reviews            108.00
duration                          103.00
director_facebook_likes            48.00
actor_3_facebook_likes            366.00
actor_1_facebook_likes            982.00
gross                        25043962.00
num_voted_users                 33132.50
cast_total_facebook_likes        3049.00
facenumber_in_poster                1.00
num_user_for_reviews              153.00
budget                       19850000.00
title_year                       2005.00
actor_2_facebook_likes            593.00
imdb_score                          6.60
aspect_ratio                        2.35
movie_facebook_likes              159.00
dtype: float64

In [184]:
movie.std()  # 표준편차

num_critic_for_reviews       1.202394e+02
duration                     2.528602e+01
director_facebook_likes      2.832954e+03
actor_3_facebook_likes       1.625875e+03
actor_1_facebook_likes       1.510699e+04
gross                        6.737255e+07
num_voted_users              1.383222e+05
cast_total_facebook_likes    1.816432e+04
facenumber_in_poster         2.023826e+00
num_user_for_reviews         3.729348e+02
budget                       1.002427e+08
title_year                   1.245398e+01
actor_2_facebook_likes       4.011300e+03
imdb_score                   1.127802e+00
aspect_ratio                 1.402940e+00
movie_facebook_likes         1.920602e+04
dtype: float64

In [185]:
movie.describe()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,4814.0,4893.0,4909.0,4054.0,4916.0,4916.0,4903.0,4895.0,4432.0,4810.0,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,6494.488491,47644510.0,82644.92,9579.815907,1.37732,267.668846,36547490.0,2002.447609,1621.923516,6.437429,2.222349,7348.294142
std,120.239379,25.286015,2832.954125,1625.874802,15106.986884,67372550.0,138322.2,18164.31699,2.023826,372.934839,100242700.0,12.453977,4011.299523,1.127802,1.40294,19206.016458
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,49.0,93.0,7.0,132.0,607.0,5019656.0,8361.75,1394.75,0.0,64.0,6000000.0,1999.0,277.0,5.8,1.85,0.0
50%,108.0,103.0,48.0,366.0,982.0,25043960.0,33132.5,3049.0,1.0,153.0,19850000.0,2005.0,593.0,6.6,2.35,159.0
75%,191.0,118.0,189.75,633.0,11000.0,61108410.0,93772.75,13616.75,2.0,320.5,43000000.0,2011.0,912.0,7.2,2.35,2000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,4200000000.0,2016.0,137000.0,9.5,16.0,349000.0


### 2.2. 메서드 체인 사용

In [191]:
movie.isnull().head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,True,True,False,True,False,False,True,False,...,True,True,True,True,True,True,False,False,True,False


In [190]:
movie.isnull().sum()  # 문서전체의 각 열 마다 null 값의 합

color                         19
director_name                102
num_critic_for_reviews        49
duration                      15
director_facebook_likes      102
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        862
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                152
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               300
budget                       484
title_year                   106
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 326
movie_facebook_likes           0
dtype: int64

In [192]:
movie.isnull().sum().sum()  # 문서 전체에 null 값의 합

2654

In [193]:
movie.isnull().any()  # 문서 전체의 각 열마다 null 값의 존재 유무

color                         True
director_name                 True
num_critic_for_reviews        True
duration                      True
director_facebook_likes       True
actor_3_facebook_likes        True
actor_2_name                  True
actor_1_facebook_likes        True
gross                         True
genres                       False
actor_1_name                  True
movie_title                  False
num_voted_users              False
cast_total_facebook_likes    False
actor_3_name                  True
facenumber_in_poster          True
plot_keywords                 True
movie_imdb_link              False
num_user_for_reviews          True
language                      True
country                       True
content_rating                True
budget                        True
title_year                    True
actor_2_facebook_likes        True
imdb_score                   False
aspect_ratio                  True
movie_facebook_likes         False
dtype: bool

In [194]:
movie.any().any()  # 문서 전체의 null 값 존재 여부

True

In [6]:
movie.select_dtypes(['object'])\
    .fillna('')\
    .max()

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
actor_1_name                                           Óscar Jaenada
movie_title                                                 Æon Flux
actor_3_name                                           Óscar Jaenada
plot_keywords                                    zombie|zombie spoof
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
dtype: object

In [8]:
(movie.select_dtypes(include=['number']) + 0.0001).head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,723.0001,178.0001,0.0001,855.0001,1000.0001,760505800.0,886204.0,4834.0001,0.0001,3054.0001,237000000.0,2009.0001,936.0001,7.9001,1.7801,33000.0001
1,302.0001,169.0001,563.0001,1000.0001,40000.0001,309404200.0,471220.0,48350.0001,0.0001,1238.0001,300000000.0,2007.0001,5000.0001,7.1001,2.3501,0.0001
2,602.0001,148.0001,0.0001,161.0001,11000.0001,200074200.0,275868.0,11700.0001,1.0001,994.0001,245000000.0,2015.0001,393.0001,6.8001,2.3501,85000.0001
3,813.0001,164.0001,22000.0001,23000.0001,27000.0001,448130600.0,1144337.0,106759.0001,0.0001,2701.0001,250000000.0,2012.0001,23000.0001,8.5001,2.3501,164000.0001
4,,,131.0001,,131.0001,,8.0001,143.0001,0.0001,,,,12.0001,7.1001,,0.0001


In [2]:
(movie.select_dtypes(include=['number']) + 0.0001).round(-2).head()

NameError: name 'movie' is not defined

## 3. 대학 캠퍼스의 다양성 지수 발견

In [2]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college.head(3)

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,4206.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,11383.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,291.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0


In [3]:
college.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7535 entries, Alabama A & M University to Excel Learning Center-San Antonio South
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CITY                7535 non-null   object 
 1   STABBR              7535 non-null   object 
 2   HBCU                7164 non-null   float64
 3   MENONLY             7164 non-null   float64
 4   WOMENONLY           7164 non-null   float64
 5   RELAFFIL            7535 non-null   int64  
 6   SATVRMID            1185 non-null   float64
 7   SATMTMID            1196 non-null   float64
 8   DISTANCEONLY        7164 non-null   float64
 9   UGDS                6874 non-null   float64
 10  UGDS_WHITE          6874 non-null   float64
 11  UGDS_BLACK          6874 non-null   float64
 12  UGDS_HISP           6874 non-null   float64
 13  UGDS_ASIAN          6874 non-null   float64
 14  UGDS_AIAN           6874 non-null   float64
 15  UG

In [4]:
college.isnull().sum().sum()

24808

In [5]:
college_ug = college.filter(like='UGDS_')
college_ug.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [6]:
college_ug.isnull().sum()

UGDS_WHITE    661
UGDS_BLACK    661
UGDS_HISP     661
UGDS_ASIAN    661
UGDS_AIAN     661
UGDS_NHPI     661
UGDS_2MOR     661
UGDS_NRA      661
UGDS_UNKN     661
dtype: int64

In [7]:
(college_ug.isnull().sum(axis=0) > 0).sum()

9

In [8]:
(college_ug.isnull().sum(axis=1) > 0).sum()

661

In [9]:
college_ug = college_ug.dropna(how='all')

In [10]:
college_ug.isnull().sum().sum()

0

In [15]:
college_metric = (college_ug >= 0.15).sum(axis=1)
college_metric.head()

INSTNM
Alabama A & M University               1
University of Alabama at Birmingham    2
Amridge University                     3
University of Alabama in Huntsville    1
Alabama State University               1
dtype: int64

In [16]:
college_metric.value_counts()

1    3042
2    2884
3     876
4      63
0       7
5       2
dtype: int64

In [17]:
college_ug['UGDS_WHITE'].value_counts()

0.0000    242
1.0000    109
0.6667     22
0.5000     18
0.8000     15
         ... 
0.3962      1
0.6631      1
0.8231      1
0.7670      1
0.0948      1
Name: UGDS_WHITE, Length: 4397, dtype: int64

In [23]:
college_metric.sort_values(ascending=False).head()

INSTNM
Regency Beauty Institute-Austin          5
Central Texas Beauty College-Temple      5
Sullivan and Cogliano Training Center    4
Ambria College of Nursing                4
Berkeley College-New York                4
dtype: int64