# 영화 평점 분석 실습

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

## 1. 영화 평점 데이터 적재 및 전처리

### 제 기준 path : '../data/~'

In [2]:
# 사용자 데이터(users) 읽어오기
users = pd.read_csv('../data/movielens/users.dat', sep = '::', engine = 'python',
                   names = ['사용자아이디', '성별','연령','직업','지역'])
users.head()

Unnamed: 0,사용자아이디,성별,연령,직업,지역
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [3]:
# 평점 데이터(ratings) 읽어오기
ratings = pd.read_csv('../data/movielens/ratings.dat', sep = '::', engine = 'python',
                   names = ['사용자아이디', '영화아이디','평점','타임스탬프'])
ratings.head()

Unnamed: 0,사용자아이디,영화아이디,평점,타임스탬프
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# 영화데이터(movies) 읽어오기
movies = pd.read_csv('../data/movielens/movies.dat', sep = '::', engine = 'python',
                   names = ['영화아이디','영화제목','장르'], encoding = 'latin-1')
movies.head()

Unnamed: 0,영화아이디,영화제목,장르
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
#3개의 데이터프레임을 하나로 합치기
data = pd.merge(users, ratings)
data = pd.merge(data, movies)
data.head()

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


## 2. 보고 싶은 영화 찾기
### 영화들의 평점 평균을 구하여, 사람들에게 인정받는 (평점이 높은) 영화 찾기

__`DataFrame.pivot_table(values=None, index=None, columns=None, aggfunc='mean', fill_value=None)`__
- __values__ : _column to aggregate, optional_ 
<br>데이터로 사용할 column


- __index__ : _column, Groper, array, or list of the previous_ 
<br>row label이 될 column


- __columns__ : _column, Groper, array, or list of the previous_ 
<br>column label이 될 column


- __aggfunc__ : _function, list of functions, dict, default numpy.mean_
<br> ex)
    - aggfunc = 'sum'  해당 방법이 제일 간단한 것 같음
    - aggfunc = sum
    - aggfunc = [sum]
    - aggfunc = ['sum']

In [6]:
# 영화아이디는 고유값이지만, 영화제목는 중복될 위험성이 있음.
ratings_by_movie = data.pivot_table(index='영화제목', values='평점', aggfunc=np.mean)
ratings_by_movie.head()

Unnamed: 0_level_0,평점
영화제목,Unnamed: 1_level_1
"$1,000,000 Duck (1971)",3.027027
'Night Mother (1986),3.371429
'Til There Was You (1997),2.692308
"'burbs, The (1989)",2.910891
...And Justice for All (1979),3.713568


__`DataFrame.nunique(axis=0, dropna=True)`__
<br> : Return number of unique elements

__`DataFrame.count(axis=0, level=None)`__
<br> : Count non-NA cells for each column or row

In [7]:
# 1. 영화제목 중복값 있는지(동일한 영화가 있는지) 확인
movies.nunique()
# 동일한 영화제목 없으므로, 영화제목으로 groupby 수행

영화아이디    3883
영화제목     3883
장르        301
dtype: int64

In [8]:
# 2. 만약, 동일한 영화제목이 있다면? 
# 영화아이디로 groupby 수행 -> movies와 합쳐 제목을 추가
data.pivot_table(index=['영화아이디', '영화제목'], values='평점', aggfunc='mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,평점
영화아이디,영화제목,Unnamed: 2_level_1
1,Toy Story (1995),4.146846
2,Jumanji (1995),3.201141
3,Grumpier Old Men (1995),3.016736
4,Waiting to Exhale (1995),2.729412
5,Father of the Bride Part II (1995),3.006757
...,...,...
3948,Meet the Parents (2000),3.635731
3949,Requiem for a Dream (2000),4.115132
3950,Tigerland (2000),3.666667
3951,Two Family House (2000),3.900000


In [9]:
# 평점이 높은 상위 10개 선택
ratings_by_movie.nlargest(10, '평점')

Unnamed: 0_level_0,평점
영화제목,Unnamed: 1_level_1
"Baby, The (1973)",5.0
Bittersweet Motel (2000),5.0
Follow the Bitch (1998),5.0
"Gate of Heavenly Peace, The (1995)",5.0
Lured (1947),5.0
One Little Indian (1973),5.0
Schlafes Bruder (Brother of Sleep) (1995),5.0
Smashing Time (1967),5.0
Song of Freedom (1936),5.0
Ulysses (Ulisse) (1954),5.0


평균 평점이 만점인 영화들이 최상위에 위치함. 
<br>일반적으로 평점이 만점인 경우는 대부분 평점의 개수가 매우 적은 경우이므로, <br>이를 확인하기 위해 평점의 개수도 함께 구해본다. 

In [10]:
ratings_by_movie = data.pivot_table(index='영화제목', values='평점',aggfunc=['mean', 'count'])
ratings_by_movie

Unnamed: 0_level_0,mean,count
Unnamed: 0_level_1,평점,평점
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2
"$1,000,000 Duck (1971)",3.027027,37
'Night Mother (1986),3.371429,70
'Til There Was You (1997),2.692308,52
"'burbs, The (1989)",2.910891,303
...And Justice for All (1979),3.713568,199
...,...,...
"Zed & Two Noughts, A (1985)",3.413793,29
Zero Effect (1998),3.750831,301
Zero Kelvin (Kjærlighetens kjøtere) (1995),3.500000,2
Zeus and Roxanne (1997),2.521739,23


In [11]:
# Multi Index (계층색인)
ratings_by_movie.columns

MultiIndex([( 'mean', '평점'),
            ('count', '평점')],
           )

In [12]:
# Renaming
ratings_by_movie.columns = ['평균', '개수']
ratings_by_movie.head(3)

Unnamed: 0_level_0,평균,개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.027027,37
'Night Mother (1986),3.371429,70
'Til There Was You (1997),2.692308,52


In [13]:
ratings_by_movie.nlargest(10, '평균')

Unnamed: 0_level_0,평균,개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
"Baby, The (1973)",5.0,1
Bittersweet Motel (2000),5.0,1
Follow the Bitch (1998),5.0,1
"Gate of Heavenly Peace, The (1995)",5.0,3
Lured (1947),5.0,1
One Little Indian (1973),5.0,1
Schlafes Bruder (Brother of Sleep) (1995),5.0,1
Smashing Time (1967),5.0,2
Song of Freedom (1936),5.0,1
Ulysses (Ulisse) (1954),5.0,1


In [14]:
# 평점이 4.5이상, 평점의 개수가 1000개 이상인 영화
ratings_by_movie[(ratings_by_movie.평균 >= 4.5) & (ratings_by_movie.개수 >= 1000)]

Unnamed: 0_level_0,평균,개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
"Godfather, The (1972)",4.524966,2223
Schindler's List (1993),4.510417,2304
"Shawshank Redemption, The (1994)",4.554558,2227
"Usual Suspects, The (1995)",4.517106,1783


In [15]:
# 평점이 4.3이상인 영화 중 평점 개수가 많은 상위 10개 영화 선택
ratings_by_movie[ratings_by_movie.평균 >= 4.3].nlargest(10, '개수')

Unnamed: 0_level_0,평균,개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
American Beauty (1999),4.317386,3428
Star Wars: Episode IV - A New Hope (1977),4.453694,2991
Saving Private Ryan (1998),4.337354,2653
"Matrix, The (1999)",4.31583,2590
"Silence of the Lambs, The (1991)",4.351823,2578
Raiders of the Lost Ark (1981),4.477725,2514
"Sixth Sense, The (1999)",4.406263,2459
"Princess Bride, The (1987)",4.30371,2318
Schindler's List (1993),4.510417,2304
"Shawshank Redemption, The (1994)",4.554558,2227


## [실습 #1] 여자들이 좋아하는 영화 찾기 
### - 여성 평점이 4.0 이상이고 여성 평점의 개수가 500개 이상인 영화

In [16]:
data.head()

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [17]:
# 1. 여성이 매긴 평점 데이터만 선택
data[data.성별 == 'F'].pivot_table(index='영화제목', values = '평점', aggfunc=['mean', 'count'])

Unnamed: 0_level_0,mean,count
Unnamed: 0_level_1,평점,평점
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2
"$1,000,000 Duck (1971)",3.375000,16
'Night Mother (1986),3.388889,36
'Til There Was You (1997),2.675676,37
"'burbs, The (1989)",2.793478,92
...And Justice for All (1979),3.828571,35
...,...,...
Your Friends and Neighbors (1998),2.888889,27
"Zed & Two Noughts, A (1985)",3.500000,8
Zero Effect (1998),3.864407,59
Zeus and Roxanne (1997),2.777778,9


---

In [18]:
# 2. 영화별 성별 평점
movie_rating_gender = data.pivot_table(index='영화제목', columns='성별', values = '평점', aggfunc=['mean', 'count'])
movie_rating_gender.head(3)

Unnamed: 0_level_0,mean,mean,count,count
성별,F,M,F,M
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"$1,000,000 Duck (1971)",3.375,2.761905,16.0,21.0
'Night Mother (1986),3.388889,3.352941,36.0,34.0
'Til There Was You (1997),2.675676,2.733333,37.0,15.0


In [19]:
# Multi Index이므로, tuple 형태로 묶어주기
cond = (movie_rating_gender[('mean', 'F')] >= 4.0) & (movie_rating_gender[('count', 'F')] >= 500)

female_popular_movies = movie_rating_gender[cond]
female_popular_movies.head(3)

Unnamed: 0_level_0,mean,mean,count,count
성별,F,M,F,M
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
American Beauty (1999),4.238901,4.347301,946.0,2482.0
Being John Malkovich (1999),4.15993,4.113636,569.0,1672.0
Braveheart (1995),4.016484,4.297839,546.0,1897.0


---

In [20]:
str_cond = '성별 == "F"'
female_ratings = data.query(str_cond)
female_ratings.head(3)

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
5,18,F,18,3,95825,1193,4,978156168,One Flew Over the Cuckoo's Nest (1975),Drama
7,24,F,25,7,10023,1193,5,978136709,One Flew Over the Cuckoo's Nest (1975),Drama


In [21]:
female_high_ratings = female_ratings.pivot_table(index='영화제목', values='평점', aggfunc=['mean', 'count'])
female_high_ratings

Unnamed: 0_level_0,mean,count
Unnamed: 0_level_1,평점,평점
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2
"$1,000,000 Duck (1971)",3.375000,16
'Night Mother (1986),3.388889,36
'Til There Was You (1997),2.675676,37
"'burbs, The (1989)",2.793478,92
...And Justice for All (1979),3.828571,35
...,...,...
Your Friends and Neighbors (1998),2.888889,27
"Zed & Two Noughts, A (1985)",3.500000,8
Zero Effect (1998),3.864407,59
Zeus and Roxanne (1997),2.777778,9


In [22]:
cond = (female_high_ratings[('mean', '평점')] >= 4.0) & (female_high_ratings[('count', '평점')] >= 500)
female_high_ratings[cond].head(3)

Unnamed: 0_level_0,mean,count
Unnamed: 0_level_1,평점,평점
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2
American Beauty (1999),4.238901,946
Being John Malkovich (1999),4.15993,569
Braveheart (1995),4.016484,546


---

In [23]:
# groupby 사용한 방법
ratings_count = female_ratings.영화제목.value_counts() >= 500
targets = ratings_count[ratings_count.values == True].index
targets

Index(['American Beauty (1999)', 'Shakespeare in Love (1998)',
       'Silence of the Lambs, The (1991)', 'Sixth Sense, The (1999)',
       'Groundhog Day (1993)', 'Fargo (1996)',
       'Star Wars: Episode VI - Return of the Jedi (1983)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Star Wars: Episode IV - A New Hope (1977)', 'Forrest Gump (1994)',
       'Back to the Future (1985)', 'Princess Bride, The (1987)',
       'Shawshank Redemption, The (1994)', 'Schindler's List (1993)',
       'E.T. the Extra-Terrestrial (1982)', 'Toy Story (1995)', 'Babe (1995)',
       'Saving Private Ryan (1998)', 'Jurassic Park (1993)',
       'Raiders of the Lost Ark (1981)', 'Being John Malkovich (1999)',
       'L.A. Confidential (1997)', 'Braveheart (1995)', 'Pulp Fiction (1994)',
       'Men in Black (1997)', 'Ghostbusters (1984)', 'Matrix, The (1999)',
       'Wizard of Oz, The (1939)', 'Casablanca (1942)'],
      dtype='object')

In [24]:
female_mean_ratings = pd.DataFrame(female_ratings.groupby('영화제목').평점.mean())
female_ratings500 = female_mean_ratings.loc[targets]
female_high_ratings = female_ratings500[female_ratings500.평점 >= 4.0]
female_high_ratings.head(3)

Unnamed: 0,평점
American Beauty (1999),4.238901
Shakespeare in Love (1998),4.181704
"Silence of the Lambs, The (1991)",4.271955


## [실습 #2] 실습 #1에서 구한 영화 (*female_popular_movies*)의 장르를 분석해 보자.
### 여성인기영화의 장르 통계 구하기
#### ex) 여성인기영화 중 Drama 장르의 영화는 10개, Action 영화는 3개, ...

In [25]:
female_popular_movies.head(3)

Unnamed: 0_level_0,mean,mean,count,count
성별,F,M,F,M
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
American Beauty (1999),4.238901,4.347301,946.0,2482.0
Being John Malkovich (1999),4.15993,4.113636,569.0,1672.0
Braveheart (1995),4.016484,4.297839,546.0,1897.0


In [26]:
movies.head(3)

Unnamed: 0,영화아이디,영화제목,장르
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


#### 1. `DataFrame.isin(values)` 활용

In [27]:
# movies에서 Toy Story, Jumanji 영화만 선택 (isin)
movies[movies.영화제목.isin(['Toy Story (1995)', 'Jumanji (1995)'])]

Unnamed: 0,영화아이디,영화제목,장르
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


In [28]:
# 1. isin 함수 활용
female_popular_movies.index
movies[movies.영화제목.isin(female_popular_movies.index)].장르

0              Animation|Children's|Comedy
108                       Action|Drama|War
257        Action|Adventure|Fantasy|Sci-Fi
293                            Crime|Drama
315                                  Drama
352                     Comedy|Romance|War
523                              Drama|War
589                         Drama|Thriller
604                   Crime|Drama|Thriller
900                      Drama|Romance|War
907     Adventure|Children's|Drama|Musical
1081       Children's|Drama|Fantasy|Sci-Fi
1178     Action|Adventure|Drama|Sci-Fi|War
1179       Action|Adventure|Comedy|Romance
1180                      Action|Adventure
1575      Crime|Film-Noir|Mystery|Thriller
1959                      Action|Drama|War
2327                        Comedy|Romance
2502                Action|Sci-Fi|Thriller
2693                              Thriller
2789                          Comedy|Drama
2928                                Comedy
Name: 장르, dtype: object

#### 2. `pandas.merge()` 활용

In [29]:
# 2. merge() 활용해 하나로 합치기
pd.merge(female_popular_movies, movies, left_index=True, right_on='영화제목').장르

  pd.merge(female_popular_movies, movies, left_index=True, right_on='영화제목').장르


2789                          Comedy|Drama
2928                                Comedy
108                       Action|Drama|War
900                      Drama|Romance|War
1081       Children's|Drama|Fantasy|Sci-Fi
604                   Crime|Drama|Thriller
352                     Comedy|Romance|War
1575      Crime|Film-Noir|Mystery|Thriller
2502                Action|Sci-Fi|Thriller
1179       Action|Adventure|Comedy|Romance
293                            Crime|Drama
1180                      Action|Adventure
1959                      Action|Drama|War
523                              Drama|War
2327                        Comedy|Romance
315                                  Drama
589                         Drama|Thriller
2693                              Thriller
257        Action|Adventure|Fantasy|Sci-Fi
1178     Action|Adventure|Drama|Sci-Fi|War
0              Animation|Children's|Comedy
907     Adventure|Children's|Drama|Musical
Name: 장르, dtype: object

#### 3. `pandas.concat()` 활용

In [30]:
movies.set_index('영화제목').head(3)

Unnamed: 0_level_0,영화아이디,장르
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),1,Animation|Children's|Comedy
Jumanji (1995),2,Adventure|Children's|Fantasy
Grumpier Old Men (1995),3,Comedy|Romance


In [31]:
# 3. concat() 활용
female_genre = pd.concat([female_popular_movies, movies.set_index('영화제목')], axis=1, join='inner').장르
female_genre.head()

영화제목
American Beauty (1999)                                  Comedy|Drama
Being John Malkovich (1999)                                   Comedy
Braveheart (1995)                                   Action|Drama|War
Casablanca (1942)                                  Drama|Romance|War
E.T. the Extra-Terrestrial (1982)    Children's|Drama|Fantasy|Sci-Fi
Name: 장르, dtype: object

__`Series.str.split(pat=None, n=- 1, expand=False, *, regex=None)`__
<br> : 주어진 sep(delimiter)를 기준으로 string을 쪼개서 list로 반환
- __expand__ : _bool, default False_
<br>Expand the split strings into separate columns.
    - If _True_, return DataFrame/MultiIndex expanding dimensionality.
    - If _False_, return Series/Index, containing lists of strings.

In [32]:
female_genre.str.split('|').head(3)

영화제목
American Beauty (1999)              [Comedy, Drama]
Being John Malkovich (1999)                [Comedy]
Braveheart (1995)              [Action, Drama, War]
Name: 장르, dtype: object

In [33]:
female_genre = female_genre.str.split('|', expand=True)
#개별적인 column으로 만들어줌

female_genre.head(3)

Unnamed: 0_level_0,0,1,2,3,4
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
American Beauty (1999),Comedy,Drama,,,
Being John Malkovich (1999),Comedy,,,,
Braveheart (1995),Action,Drama,War,,


In [34]:
female_genre[0].value_counts()

Action        7
Comedy        4
Drama         4
Crime         3
Children's    1
Thriller      1
Animation     1
Adventure     1
Name: 0, dtype: int64

In [35]:
# Series 연산하면 row index 같은 것끼리 연산
female_genre[0].value_counts() + female_genre[1].value_counts() 

Action         NaN
Adventure      5.0
Animation      NaN
Children's     3.0
Comedy         NaN
Crime          NaN
Drama         10.0
Film-Noir      NaN
Romance        NaN
Sci-Fi         NaN
Thriller       2.0
War            NaN
dtype: float64

In [36]:
female_genre[0].value_counts().add(female_genre[1].value_counts(), fill_value=0)
# fill_value=0 : NaN 0으로 채우기

Action         7.0
Adventure      5.0
Animation      1.0
Children's     3.0
Comedy         4.0
Crime          3.0
Drama         10.0
Film-Noir      1.0
Romance        3.0
Sci-Fi         1.0
Thriller       2.0
War            1.0
dtype: float64

In [37]:
female_genre_cnt = Series(dtype='int64')

for col in female_genre.columns: # column 0 ~ 4
    temp = female_genre[col].value_counts()
    female_genre_cnt = female_genre_cnt.add(temp, fill_value = 0)
    
female_genre_cnt

Action         7.0
Adventure      5.0
Animation      1.0
Children's     3.0
Comedy         6.0
Crime          3.0
Drama         12.0
Fantasy        2.0
Film-Noir      1.0
Musical        1.0
Mystery        1.0
Romance        4.0
Sci-Fi         4.0
Thriller       5.0
War            6.0
dtype: float64

In [38]:
female_genre_cnt.sort_values(ascending=False)

Drama         12.0
Action         7.0
Comedy         6.0
War            6.0
Adventure      5.0
Thriller       5.0
Romance        4.0
Sci-Fi         4.0
Children's     3.0
Crime          3.0
Fantasy        2.0
Animation      1.0
Film-Noir      1.0
Musical        1.0
Mystery        1.0
dtype: float64

## [실습 #3] 남자와 여자의 호불호가 크게 갈리는 영화 10개 찾기
### 전체 평점의 개수가 500개 이상인 영화만 대상으로 함.

In [39]:
movie_rating_gender.head(3)

Unnamed: 0_level_0,mean,mean,count,count
성별,F,M,F,M
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"$1,000,000 Duck (1971)",3.375,2.761905,16.0,21.0
'Night Mother (1986),3.388889,3.352941,36.0,34.0
'Til There Was You (1997),2.675676,2.733333,37.0,15.0


In [40]:
# 1. 전체 평점의 개수가 500개 이상인 영화만 선택
# 여성 평점 개수 + 남성 평점 개수
cond = movie_rating_gender[('count', 'F')] + movie_rating_gender[('count', 'M')] >= 500

movie500 = movie_rating_gender[cond]

movie500

Unnamed: 0_level_0,mean,mean,count,count
성별,F,M,F,M
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
10 Things I Hate About You (1999),3.646552,3.311966,232.0,468.0
101 Dalmatians (1961),3.791444,3.500000,187.0,378.0
12 Angry Men (1957),4.184397,4.328421,141.0,475.0
"13th Warrior, The (1999)",3.112000,3.168000,125.0,625.0
"20,000 Leagues Under the Sea (1954)",3.670103,3.709205,97.0,478.0
...,...,...,...,...
"X-Files: Fight the Future, The (1998)",3.489474,3.493797,190.0,806.0
X-Men (2000),3.682310,3.851702,277.0,1234.0
You've Got Mail (1998),3.542424,3.275591,330.0,508.0
Young Frankenstein (1974),4.289963,4.239177,269.0,924.0


In [41]:
# 'diff'(여성 평균 평점과 남성 평균 평점 차) 새로운 column 생성
movie500['diff'] = abs(movie500[('mean', 'F')] - movie500[('mean', 'M')])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie500['diff'] = abs(movie500[('mean', 'F')] - movie500[('mean', 'M')])


In [42]:
movie500.head(3)

Unnamed: 0_level_0,mean,mean,count,count,diff
성별,F,M,F,M,Unnamed: 5_level_1
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
10 Things I Hate About You (1999),3.646552,3.311966,232.0,468.0,0.334586
101 Dalmatians (1961),3.791444,3.5,187.0,378.0,0.291444
12 Angry Men (1957),4.184397,4.328421,141.0,475.0,0.144024


In [43]:
# 차이가 큰 순으로 상위 10개 data
movie500.nlargest(10, 'diff')

Unnamed: 0_level_0,mean,mean,count,count,diff
성별,F,M,F,M,Unnamed: 5_level_1
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dirty Dancing (1987),3.790378,2.959596,291.0,396.0,0.830782
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,99.0,723.0,0.726351
Dumb & Dumber (1994),2.697987,3.336595,149.0,511.0,0.638608
Evil Dead II (Dead By Dawn) (1987),3.297297,3.909283,74.0,474.0,0.611985
Grease (1978),3.975265,3.367041,283.0,534.0,0.608224
Caddyshack (1980),3.396135,3.969737,207.0,760.0,0.573602
Animal House (1978),3.628906,4.167192,256.0,951.0,0.538286
"Exorcist, The (1973)",3.537634,4.067239,186.0,699.0,0.529605
"Rocky Horror Picture Show, The (1975)",3.673016,3.160131,315.0,918.0,0.512885
Big Trouble in Little China (1986),2.987952,3.48503,83.0,501.0,0.497078


---

In [44]:
# groupby 사용한 다른 방법
ratings_count = data.영화제목.value_counts() >= 500
targets = ratings_count[ratings_count.values == True].index
targets

Index(['American Beauty (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Star Wars: Episode VI - Return of the Jedi (1983)',
       'Jurassic Park (1993)', 'Saving Private Ryan (1998)',
       'Terminator 2: Judgment Day (1991)', 'Matrix, The (1999)',
       'Back to the Future (1985)', 'Silence of the Lambs, The (1991)',
       ...
       'Ice Storm, The (1997)', 'Peggy Sue Got Married (1986)',
       'Halloween (1978)', '28 Days (2000)', 'Body Heat (1981)',
       'Alien Nation (1988)', 'Guns of Navarone, The (1961)',
       'Postino, Il (The Postman) (1994)', 'Emma (1996)',
       'Last Action Hero (1993)'],
      dtype='object', length=618)

In [45]:
# 평점이 500개 이상인 영화들의 평균 평점
mean_ratings = pd.DataFrame(data.groupby(['영화제목', '성별']).평점.mean())
mean_ratings = mean_ratings.loc[targets]
mean_ratings.head(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,평점
영화제목,성별,Unnamed: 2_level_1
American Beauty (1999),F,4.238901
American Beauty (1999),M,4.347301
Star Wars: Episode IV - A New Hope (1977),F,4.302937
Star Wars: Episode IV - A New Hope (1977),M,4.495307


In [46]:
male_mean_ratings = mean_ratings.xs('M', level=1).groupby('영화제목')['평점'].mean()
male_mean_ratings.head(3)

영화제목
10 Things I Hate About You (1999)    3.311966
101 Dalmatians (1961)                3.500000
12 Angry Men (1957)                  4.328421
Name: 평점, dtype: float64

In [47]:
female_mean_ratings = mean_ratings.xs('F', level=1).groupby('영화제목').평점.mean()
female_mean_ratings.head(3)

영화제목
10 Things I Hate About You (1999)    3.646552
101 Dalmatians (1961)                3.791444
12 Angry Men (1957)                  4.184397
Name: 평점, dtype: float64

In [48]:
diff_mean_ratings = abs(male_mean_ratings - female_mean_ratings)
diff_mean_ratings.sort_values(ascending=False)

영화제목
Dirty Dancing (1987)                           0.830782
Good, The Bad and The Ugly, The (1966)         0.726351
Dumb & Dumber (1994)                           0.638608
Evil Dead II (Dead By Dawn) (1987)             0.611985
Grease (1978)                                  0.608224
                                                 ...   
Indiana Jones and the Temple of Doom (1984)    0.002256
Fatal Attraction (1987)                        0.002182
Dune (1984)                                    0.002165
Trainspotting (1996)                           0.001457
Jerry Maguire (1996)                           0.001109
Name: 평점, Length: 618, dtype: float64

---

## [실습 #4] 연령대 별로 영화 평점 분석하기
연령대(10대 미만, 10대, 20대, ...50대) 컬럼을 추가한 후, 영화별 연령대별 영화평점 구하기

In [49]:
data.연령.min()

1

In [50]:
data.연령.max()

56

In [51]:
data.연령.unique()

array([ 1, 56, 25, 50, 18, 45, 35], dtype=int64)

#### 1. `DataFrame.apply(func)` 활용

In [52]:
def generate_agegroup(val):
    if val < 10:
        return '10대 미만'
    elif val < 20:
        return '10대'
    elif val < 30:
        return '20대'
    elif val < 40:
        return '30대'
    elif val < 50:
        return '40대'
    else:
        return '50대 이상'

In [53]:
generate_agegroup(15)

'10대'

In [54]:
data['연령대'] = data.연령.apply(generate_agegroup)

In [55]:
data.head(10)

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르,연령대
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,10대 미만
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama,50대 이상
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama,20대
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama,20대
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama,50대 이상
5,18,F,18,3,95825,1193,4,978156168,One Flew Over the Cuckoo's Nest (1975),Drama,10대
6,19,M,1,10,48073,1193,5,982730936,One Flew Over the Cuckoo's Nest (1975),Drama,10대 미만
7,24,F,25,7,10023,1193,5,978136709,One Flew Over the Cuckoo's Nest (1975),Drama,20대
8,28,F,25,1,14607,1193,3,978125194,One Flew Over the Cuckoo's Nest (1975),Drama,20대
9,33,M,45,3,55421,1193,5,978557765,One Flew Over the Cuckoo's Nest (1975),Drama,40대


---

#### 2. `numpy.digitize(x, bins, right=False)` 활용
 : x의 각 값이 속한 구간(bins)의 index 반환 (구간을 나누어 index mapping)
- __x__ : _array_like_
- __bins__ : _array_like_

In [56]:
# 2. np.digitize() 함수 활용
import numpy as np

sr = Series([15, 25, 42, 18, 56, 51])
np.digitize(sr, [10, 20, 30, 40, 50])

#기준값을 가지고 쪼개는 것
# 각각의 값들이 10보다 작으면 0 10 ~ 20 사이면 1 값을 치환해서 전달

array([1, 2, 4, 1, 5, 5], dtype=int64)

In [57]:
data['연령대2'] = np.digitize(data.연령, [10, 20, 30, 40, 50])

In [58]:
# mapping
data['연령대2'] = data.연령대2.map({
    0 : '10대 미만',
    1 : '10대',
    2 : '20대',
    3 : '30대',
    4 : '40대',
    5 : '50대 이상'
})

In [59]:
data.head(10)

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르,연령대,연령대2
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,10대 미만,10대 미만
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama,50대 이상,50대 이상
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama,20대,20대
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama,20대,20대
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama,50대 이상,50대 이상
5,18,F,18,3,95825,1193,4,978156168,One Flew Over the Cuckoo's Nest (1975),Drama,10대,10대
6,19,M,1,10,48073,1193,5,982730936,One Flew Over the Cuckoo's Nest (1975),Drama,10대 미만,10대 미만
7,24,F,25,7,10023,1193,5,978136709,One Flew Over the Cuckoo's Nest (1975),Drama,20대,20대
8,28,F,25,1,14607,1193,3,978125194,One Flew Over the Cuckoo's Nest (1975),Drama,20대,20대
9,33,M,45,3,55421,1193,5,978557765,One Flew Over the Cuckoo's Nest (1975),Drama,40대,40대


In [60]:
# 연령대별 영화평점
movie_grade_agegroup = data.pivot_table(index = '영화제목', columns ='연령대', aggfunc = 'mean', values='평점')

In [61]:
movie_grade_agegroup.sort_index(axis= 1)

연령대,10대,10대 미만,20대,30대,40대,50대 이상
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"$1,000,000 Duck (1971)",3.000000,,3.090909,3.133333,2.000000,2.750000
'Night Mother (1986),4.666667,2.000000,3.423077,2.904762,3.833333,3.750000
'Til There Was You (1997),2.500000,3.500000,2.666667,2.900000,2.333333,2.600000
"'burbs, The (1989)",3.244444,4.500000,2.652174,2.818182,2.545455,3.100000
...And Justice for All (1979),3.428571,3.000000,3.724138,3.657143,4.100000,3.674419
...,...,...,...,...,...,...
"Zed & Two Noughts, A (1985)",3.000000,1.000000,3.375000,3.777778,4.000000,3.000000
Zero Effect (1998),3.883333,4.125000,3.715278,3.608696,3.764706,3.769231
Zero Kelvin (Kjærlighetens kjøtere) (1995),,,,3.500000,,
Zeus and Roxanne (1997),2.500000,1.500000,2.833333,3.500000,1.000000,


In [62]:
# 사용자가 원하는 순서대로 column label 재설정
movie_grade_agegroup[['10대 미만', '10대', '20대','30대','40대','50대 이상',
]]

연령대,10대 미만,10대,20대,30대,40대,50대 이상
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"$1,000,000 Duck (1971)",,3.000000,3.090909,3.133333,2.000000,2.750000
'Night Mother (1986),2.000000,4.666667,3.423077,2.904762,3.833333,3.750000
'Til There Was You (1997),3.500000,2.500000,2.666667,2.900000,2.333333,2.600000
"'burbs, The (1989)",4.500000,3.244444,2.652174,2.818182,2.545455,3.100000
...And Justice for All (1979),3.000000,3.428571,3.724138,3.657143,4.100000,3.674419
...,...,...,...,...,...,...
"Zed & Two Noughts, A (1985)",1.000000,3.000000,3.375000,3.777778,4.000000,3.000000
Zero Effect (1998),4.125000,3.883333,3.715278,3.608696,3.764706,3.769231
Zero Kelvin (Kjærlighetens kjøtere) (1995),,,,3.500000,,
Zeus and Roxanne (1997),1.500000,2.500000,2.833333,3.500000,1.000000,


In [63]:
# NaN 비어있는 값, 값이 아예 없는 것 NaN 0으로 치환할 수 없음

In [64]:
movie_grade_agegroup.fillna('-')
# NaN를  - 로 치환

연령대,10대,10대 미만,20대,30대,40대,50대 이상
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"$1,000,000 Duck (1971)",3.0,-,3.090909,3.133333,2.0,2.75
'Night Mother (1986),4.666667,2.0,3.423077,2.904762,3.833333,3.75
'Til There Was You (1997),2.5,3.5,2.666667,2.9,2.333333,2.6
"'burbs, The (1989)",3.244444,4.5,2.652174,2.818182,2.545455,3.1
...And Justice for All (1979),3.428571,3.0,3.724138,3.657143,4.1,3.674419
...,...,...,...,...,...,...
"Zed & Two Noughts, A (1985)",3.0,1.0,3.375,3.777778,4.0,3.0
Zero Effect (1998),3.883333,4.125,3.715278,3.608696,3.764706,3.769231
Zero Kelvin (Kjærlighetens kjøtere) (1995),-,-,-,3.5,-,-
Zeus and Roxanne (1997),2.5,1.5,2.833333,3.5,1.0,-


In [65]:
movie_grade_agegroup.fillna('-').mean()
#평균값을 구할 수 없음

  movie_grade_agegroup.fillna('-').mean()


Series([], dtype: float64)

In [66]:
movie_grade_agegroup.mean()

연령대
10대       3.165096
10대 미만    3.349692
20대       3.230790
30대       3.286723
40대       3.296155
50대 이상    3.331886
dtype: float64

---

In [67]:
# '연령대' column 생성하는 다른 방법
data.loc[                     (data.연령 < 10),'연령대'] = '10대 미만'
data.loc[(data.연령 >= 10) & (data.연령 < 20), '연령대'] = '10대'
data.loc[(data.연령 >= 20) & (data.연령 < 30), '연령대'] = '20대'
data.loc[(data.연령 >= 30) & (data.연령 < 40), '연령대'] = '30대'
data.loc[(data.연령 >= 40) & (data.연령 < 50), '연령대'] = '40대'
data.loc[                   (data.연령 >= 50), '연령대'] = '50대 이상'

In [68]:
data.groupby('연령대')['평점'].mean()

연령대
10대       3.507573
10대 미만    3.549520
20대       3.545235
30대       3.618162
40대       3.638062
50대 이상    3.732677
Name: 평점, dtype: float64