# 영화 평점 분석 실습

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

## 1. 영화 평점 데이터 적재 및 전처리

In [2]:
# Load the user table. The file has no header row, so column names are given
# explicitly; the two-character '::' separator is parsed with the Python engine.
users = pd.read_csv(
    'data/movielens/users.dat',
    names=['사용자아이디', '성별', '연령', '직업', '지역'],
    sep='::',
    engine='python',
)
users.head()

Unnamed: 0,사용자아이디,성별,연령,직업,지역
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [3]:
# Load the rating table (one row per user/movie rating). Column names are
# supplied here; '::' is the field separator, parsed with the Python engine.
ratings = pd.read_csv(
    'data/movielens/ratings.dat',
    names=['사용자아이디', '영화아이디', '평점', '타임스탬프'],
    sep='::',
    engine='python',
)
ratings.head()

Unnamed: 0,사용자아이디,영화아이디,평점,타임스탬프
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Load the movie table. Unlike the other two files this one is decoded as
# latin-1; genres of one movie are packed into a single '|'-joined string.
movies = pd.read_csv(
    'data/movielens/movies.dat',
    names=['영화아이디', '영화제목', '장르'],
    sep='::',
    engine='python',
    encoding='latin-1',
)
movies.head()

Unnamed: 0,영화아이디,영화제목,장르
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Row counts of the three raw tables (users, ratings, movies).
print('사용자수:', users.shape[0])
print('평점수:', ratings.shape[0])
print('영화수:', movies.shape[0])

사용자수: 6040
평점수: 1000209
영화수: 3883


In [9]:
# Are there users who never rated anything? No: the number of distinct users
# in `ratings` (6040) equals len(users). The "177" in the original note is the
# number of movies without ratings (3883 - 3706), checked in the next cell.
ratings['사용자아이디'].nunique()

6040

In [10]:
# Number of distinct movies that received at least one rating; the gap to
# len(movies) is the number of movies nobody rated.
ratings['영화아이디'].nunique()

3706

In [None]:
#3개의 데이터프레임을 하나로 합치기

In [11]:
# Combine the three tables into one row per rating.
# The join keys are spelled out instead of relying on "all shared column
# names", so adding a column to one of the frames later cannot silently
# change which columns are joined on.
data = pd.merge(users, ratings, on='사용자아이디')
data = pd.merge(data, movies, on='영화아이디')
data.head()

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


## 2. 보고 싶은 영화 찾기
영화들의 평점 평균을 구하여, 사람들에게 인정받는 (평점이 높은) 영화 찾기

In [17]:
# Check for duplicated movie titles. Only the last expression of the cell is
# echoed, so the per-column distinct counts below are what gets displayed.
movies['영화제목'].nunique()
movies.nunique()

영화아이디    3883
영화제목     3883
장르        301
dtype: int64

In [21]:
# Mean rating per movie, then the ten movies with the highest mean.
(
    data
    .pivot_table(values='평점', index=['영화아이디', '영화제목'], aggfunc='mean')
    .nlargest(10, '평점')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,평점
영화아이디,영화제목,Unnamed: 2_level_1
787,"Gate of Heavenly Peace, The (1995)",5.0
989,Schlafes Bruder (Brother of Sleep) (1995),5.0
1830,Follow the Bitch (1998),5.0
3172,Ulysses (Ulisse) (1954),5.0
3233,Smashing Time (1967),5.0
3280,"Baby, The (1973)",5.0
3382,Song of Freedom (1936),5.0
3607,One Little Indian (1973),5.0
3656,Lured (1947),5.0
3881,Bittersweet Motel (2000),5.0


평균 평점이 만점인 영화들이 최상위에 위치함. 
일반적으로 평점이 만점인 경우는 대부분 평점의 개수가 매우 적은 경우이므로, 이를 확인하기 위해 평점의 개수도 함께 구해본다. 

In [32]:
# Per-title mean rating together with the number of ratings, so that movies
# with a perfect mean from only a handful of ratings can be filtered out.
ex = data.pivot_table(
    values='평점',
    index='영화제목',
    aggfunc=['mean', 'count'],
)
ex.columns = ['평균', '개수']

In [33]:
# Well-regarded movies: mean rating >= 4.5 backed by at least 1000 ratings.
ex.loc[ex['평균'].ge(4.5) & ex['개수'].ge(1000)]

Unnamed: 0_level_0,평균,개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
"Godfather, The (1972)",4.524966,2223
Schindler's List (1993),4.510417,2304
"Shawshank Redemption, The (1994)",4.554558,2227
"Usual Suspects, The (1995)",4.517106,1783


## [실습 #1] 여자들이 좋아하는 영화 찾기 
### - 여성 평점이 4.0 이상이고 여성 평점의 개수가 500개 이상인 영화

In [None]:
#groupby 느낌으로 하는 것이 좋을 듯

In [73]:
# Keep only the ratings given by female users.
data_f = data.loc[data['성별'].eq('F')]
data_f

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르,연령대
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0
5,18,F,18,3,95825,1193,4,978156168,One Flew Over the Cuckoo's Nest (1975),Drama,10
7,24,F,25,7,10023,1193,5,978136709,One Flew Over the Cuckoo's Nest (1975),Drama,20
8,28,F,25,1,14607,1193,3,978125194,One Flew Over the Cuckoo's Nest (1975),Drama,20
19,59,F,50,1,55413,1193,4,977934292,One Flew Over the Cuckoo's Nest (1975),Drama,50
...,...,...,...,...,...,...,...,...,...,...,...
1000199,5334,F,56,13,46140,3382,5,960796159,Song of Freedom (1936),Drama,50
1000200,5420,F,1,19,14850,1843,3,960156505,Slappy and the Stinkers (1998),Children's|Comedy,0
1000201,5433,F,35,17,45014,286,3,960240881,Nemesis 2: Nebula (1995),Action|Sci-Fi|Thriller,30
1000202,5494,F,35,17,94306,3530,4,959816296,Smoking/No Smoking (1993),Comedy,30


In [74]:
# Per-title mean and count of the female ratings.
ex2 = data_f.pivot_table(
    values='평점',
    index='영화제목',
    aggfunc=['mean', 'count'],
)
ex2.columns = ['평균', '개수']
ex2

Unnamed: 0_level_0,평균,개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375000,16
'Night Mother (1986),3.388889,36
'Til There Was You (1997),2.675676,37
"'burbs, The (1989)",2.793478,92
...And Justice for All (1979),3.828571,35
...,...,...
Your Friends and Neighbors (1998),2.888889,27
"Zed & Two Noughts, A (1985)",3.500000,8
Zero Effect (1998),3.864407,59
Zeus and Roxanne (1997),2.777778,9


In [75]:
# Movies popular with women: female mean >= 4.0 from at least 500 ratings.
fam_f = ex2.loc[ex2['평균'].ge(4.0) & ex2['개수'].ge(500)]

In [129]:
# Result: titles whose female mean rating is >= 4.0 with >= 500 female ratings
fam_f

Unnamed: 0_level_0,평균,개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
American Beauty (1999),4.238901,946
Being John Malkovich (1999),4.15993,569
Braveheart (1995),4.016484,546
Casablanca (1942),4.30099,505
E.T. the Extra-Terrestrial (1982),4.08985,601
Fargo (1996),4.217656,657
Forrest Gump (1994),4.045031,644
L.A. Confidential (1997),4.106007,566
"Matrix, The (1999)",4.128405,514
"Princess Bride, The (1987)",4.342767,636


In [None]:
#교수님 코드 start

In [132]:
# Instructor's approach: pivot once with gender as columns, giving the mean
# rating and rating count per title for F and M side by side.
# (The former leading `data[data.성별=='F']` statement was removed: its result
# was discarded, so it only cost a full scan of the ratings.)
ex1 = data.pivot_table(index='영화제목', columns='성별', values='평점', aggfunc=['mean', 'count'])
ex1

Unnamed: 0_level_0,mean,mean,count,count
성별,F,M,F,M
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"$1,000,000 Duck (1971)",3.375000,2.761905,16.0,21.0
'Night Mother (1986),3.388889,3.352941,36.0,34.0
'Til There Was You (1997),2.675676,2.733333,37.0,15.0
"'burbs, The (1989)",2.793478,2.962085,92.0,211.0
...And Justice for All (1979),3.828571,3.689024,35.0,164.0
...,...,...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952,8.0,21.0
Zero Effect (1998),3.864407,3.723140,59.0,242.0
Zero Kelvin (Kjærlighetens kjøtere) (1995),,3.500000,,2.0
Zeus and Roxanne (1997),2.777778,2.357143,9.0,14.0


In [134]:
# Keep only the female ('F') columns; the result has plain 'mean' / 'count'
# columns. Note this rebinds ex1, so the cell cannot be re-run on its own.
ex1 = ex1.xs('F', level=1, axis=1)

In [136]:
# Movies popular with women: female mean >= 4.0 and at least 500 female ratings.
여성인기영화 = ex1.loc[ex1['mean'].ge(4.0) & ex1['count'].ge(500)]

## [실습 #2] 실습 #1에서 구한 영화(여성인기영화)의 장르를 분석해 보자.
여성인기영화의 장르 통계 구하기

예를 들어, 여성인기영화 중 Drama 장르의 영화는 10개, Action 영화는 3개, ...

In [119]:
# Attach movie id and genre string to each female-popular title.
genre = movies.merge(fam_f, how='inner', on='영화제목')

In [120]:
genre['장르']

0            Animation|Children's|Comedy
1                       Action|Drama|War
2        Action|Adventure|Fantasy|Sci-Fi
3                            Crime|Drama
4                                  Drama
5                     Comedy|Romance|War
6                              Drama|War
7                         Drama|Thriller
8                   Crime|Drama|Thriller
9                      Drama|Romance|War
10    Adventure|Children's|Drama|Musical
11       Children's|Drama|Fantasy|Sci-Fi
12     Action|Adventure|Drama|Sci-Fi|War
13       Action|Adventure|Comedy|Romance
14                      Action|Adventure
15      Crime|Film-Noir|Mystery|Thriller
16                      Action|Drama|War
17                        Comedy|Romance
18                Action|Sci-Fi|Thriller
19                              Thriller
20                          Comedy|Drama
21                                Comedy
Name: 장르, dtype: object

In [None]:
#from collections import Counter

In [126]:
# Try splitting the genre string of the first row (label 0) into its parts.
genre.loc[0, '장르'].split('|')

['Animation', "Children's", 'Comedy']

In [127]:
def countgenre(lists):
    """Count how often each genre occurs across '|'-joined genre strings.

    Args:
        lists: Iterable of strings in which the genres of one movie are
            separated by ``'|'`` (e.g. ``"Action|Drama|War"``).

    Returns:
        A plain ``dict`` mapping genre name to its number of occurrences,
        with keys in the order the genres were first seen.
    """
    # Imported locally so the function does not depend on the notebook's
    # import cell, which does not pull in collections.
    from collections import Counter

    # Flatten every movie's genre list and let Counter do the tallying.
    tally = Counter(
        name for genres in lists for name in genres.split('|')
    )
    # Callers get the same type as before (dict, not a Counter subclass).
    return dict(tally)

In [128]:
# Result: genre tally over the genre strings of the female-popular movies
countgenre(genre['장르'])

{'Animation': 1,
 "Children's": 3,
 'Comedy': 6,
 'Action': 7,
 'Drama': 12,
 'War': 6,
 'Adventure': 5,
 'Fantasy': 2,
 'Sci-Fi': 4,
 'Crime': 3,
 'Romance': 4,
 'Thriller': 5,
 'Musical': 1,
 'Film-Noir': 1,
 'Mystery': 1}

In [137]:
#교수님 코드 start

In [138]:
여성인기영화.index

Index(['American Beauty (1999)', 'Being John Malkovich (1999)',
       'Braveheart (1995)', 'Casablanca (1942)',
       'E.T. the Extra-Terrestrial (1982)', 'Fargo (1996)',
       'Forrest Gump (1994)', 'L.A. Confidential (1997)', 'Matrix, The (1999)',
       'Princess Bride, The (1987)', 'Pulp Fiction (1994)',
       'Raiders of the Lost Ark (1981)', 'Saving Private Ryan (1998)',
       'Schindler's List (1993)', 'Shakespeare in Love (1998)',
       'Shawshank Redemption, The (1994)', 'Silence of the Lambs, The (1991)',
       'Sixth Sense, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Toy Story (1995)', 'Wizard of Oz, The (1939)'],
      dtype='object', name='영화제목')

In [141]:
# Rows of `movies` whose title is among the female-popular titles.
movies.loc[movies['영화제목'].isin(여성인기영화.index)]

Unnamed: 0,영화아이디,영화제목,장르
0,1,Toy Story (1995),Animation|Children's|Comedy
108,110,Braveheart (1995),Action|Drama|War
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
293,296,Pulp Fiction (1994),Crime|Drama
315,318,"Shawshank Redemption, The (1994)",Drama
352,356,Forrest Gump (1994),Comedy|Romance|War
523,527,Schindler's List (1993),Drama|War
589,593,"Silence of the Lambs, The (1991)",Drama|Thriller
604,608,Fargo (1996),Crime|Drama|Thriller
900,912,Casablanca (1942),Drama|Romance|War


In [142]:
# Same lookup via merge: move the title index into a column and join on it.
# The key is named explicitly rather than inferred from shared column names.
pd.merge(여성인기영화.reset_index(), movies, on='영화제목')

Unnamed: 0,영화제목,mean,count,영화아이디,장르
0,American Beauty (1999),4.238901,946.0,2858,Comedy|Drama
1,Being John Malkovich (1999),4.15993,569.0,2997,Comedy
2,Braveheart (1995),4.016484,546.0,110,Action|Drama|War
3,Casablanca (1942),4.30099,505.0,912,Drama|Romance|War
4,E.T. the Extra-Terrestrial (1982),4.08985,601.0,1097,Children's|Drama|Fantasy|Sci-Fi
5,Fargo (1996),4.217656,657.0,608,Crime|Drama|Thriller
6,Forrest Gump (1994),4.045031,644.0,356,Comedy|Romance|War
7,L.A. Confidential (1997),4.106007,566.0,1617,Crime|Film-Noir|Mystery|Thriller
8,"Matrix, The (1999)",4.128405,514.0,2571,Action|Sci-Fi|Thriller
9,"Princess Bride, The (1987)",4.342767,636.0,1197,Action|Adventure|Comedy|Romance


In [155]:
# Count genres of the female-popular movies.
# 1) pull the '|'-joined genre string for each popular title,
# 2) split it into one column per genre slot,
# 3) add up the value counts of every slot.
ex2 = pd.concat([여성인기영화, movies.set_index('영화제목')], axis=1, join='inner').장르
ex2_expand = ex2.str.split('|', expand=True)
result = ex2_expand[0].value_counts()
# Iterate over whatever slots the split produced instead of a fixed
# range(1, 5): that raised KeyError with fewer than five genre columns and
# silently ignored any beyond the fifth.
for col in ex2_expand.columns[1:]:
    # fill_value=0 keeps genres that are missing from one of the two operands.
    result = result.add(ex2_expand[col].value_counts(), fill_value=0)
result

Action         7.0
Adventure      5.0
Animation      1.0
Children's     3.0
Comedy         6.0
Crime          3.0
Drama         12.0
Fantasy        2.0
Film-Noir      1.0
Musical        1.0
Mystery        1.0
Romance        4.0
Sci-Fi         4.0
Thriller       5.0
War            6.0
dtype: float64

In [157]:
# Accumulate the genre counts slot by slot, starting from an empty float
# Series so no slot needs special treatment; then list the most common first.
여성인기장르 = Series(dtype='float64')
for _, genre_column in ex2_expand.items():
    여성인기장르 = 여성인기장르.add(genre_column.value_counts(), fill_value=0)
여성인기장르.sort_values(ascending=False)

Drama         12.0
Action         7.0
War            6.0
Comedy         6.0
Thriller       5.0
Adventure      5.0
Sci-Fi         4.0
Romance        4.0
Crime          3.0
Children's     3.0
Fantasy        2.0
Mystery        1.0
Musical        1.0
Film-Noir      1.0
Animation      1.0
dtype: float64

## [실습 #3] 남자와 여자의 호불호가 크게 갈리는 영화 10개 찾기
전체 평점의 개수가 500개 이상인 영화만 대상으로 함.

In [None]:
# 아, 전체 평점 수가 아니라 남녀 각각의 평점 수가 500개 이상인 영화를 구해버림 (아래 f_500/m_500 셀은 그 잘못된 시도)

In [42]:
# Keep only the ratings given by male users.
data_m = data.loc[data['성별'].eq('M')]
data_m

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
6,19,M,1,10,48073,1193,5,982730936,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000203,5556,M,45,6,92103,2198,3,959445515,Modulations (1998),Documentary
1000204,5949,M,18,17,47901,2198,5,958846401,Modulations (1998),Documentary
1000205,5675,M,35,14,30030,2703,3,976029116,Broken Vessels (1998),Drama
1000206,5780,M,18,17,92886,2845,1,958153068,White Boys (1999),Drama


In [97]:
# Per-title mean of the male ratings, stored under the column '남성평균'.
ex2_m = data_m.pivot_table(
    values='평점',
    index='영화제목',
    aggfunc='mean',
)
ex2_m.columns = ['남성평균']
ex2_m

Unnamed: 0_level_0,남성평균
영화제목,Unnamed: 1_level_1
"$1,000,000 Duck (1971)",2.761905
'Night Mother (1986),3.352941
'Til There Was You (1997),2.733333
"'burbs, The (1989)",2.962085
...And Justice for All (1979),3.689024
...,...
"Zed & Two Noughts, A (1985)",3.380952
Zero Effect (1998),3.723140
Zero Kelvin (Kjærlighetens kjøtere) (1995),3.500000
Zeus and Roxanne (1997),2.357143


In [101]:
# Ratings given by female users (same selection as in exercise #1).
data_f = data.loc[data['성별'].eq('F')]
data_f

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르,연령대
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0
5,18,F,18,3,95825,1193,4,978156168,One Flew Over the Cuckoo's Nest (1975),Drama,10
7,24,F,25,7,10023,1193,5,978136709,One Flew Over the Cuckoo's Nest (1975),Drama,20
8,28,F,25,1,14607,1193,3,978125194,One Flew Over the Cuckoo's Nest (1975),Drama,20
19,59,F,50,1,55413,1193,4,977934292,One Flew Over the Cuckoo's Nest (1975),Drama,50
...,...,...,...,...,...,...,...,...,...,...,...
1000199,5334,F,56,13,46140,3382,5,960796159,Song of Freedom (1936),Drama,50
1000200,5420,F,1,19,14850,1843,3,960156505,Slappy and the Stinkers (1998),Children's|Comedy,0
1000201,5433,F,35,17,45014,286,3,960240881,Nemesis 2: Nebula (1995),Action|Sci-Fi|Thriller,30
1000202,5494,F,35,17,94306,3530,4,959816296,Smoking/No Smoking (1993),Comedy,30


In [102]:
# Per-title mean of the female ratings, stored under the column '여성평균'.
ex2_f = data_f.pivot_table(
    values='평점',
    index='영화제목',
    aggfunc='mean',
)
ex2_f.columns = ['여성평균']
ex2_f

Unnamed: 0_level_0,여성평균
영화제목,Unnamed: 1_level_1
"$1,000,000 Duck (1971)",3.375000
'Night Mother (1986),3.388889
'Til There Was You (1997),2.675676
"'burbs, The (1989)",2.793478
...And Justice for All (1979),3.828571
...,...
Your Friends and Neighbors (1998),2.888889
"Zed & Two Noughts, A (1985)",3.500000
Zero Effect (1998),3.864407
Zeus and Roxanne (1997),2.777778


In [93]:
# NOTE(review): scratch cell from an earlier run (In [93], i.e. before the
# In [102] cell that builds ex2_f with the '여성평균' column). Executed
# top-to-bottom it rebinds ex2_f to ex2 and discards that table, which the
# later concat into `fm` relies on — TODO confirm this cell can be dropped.
ex2_f=ex2

In [48]:
# Female rating stats limited to titles with at least 500 female ratings.
# Computed from data_f directly instead of reusing `ex2`: `ex2` is also
# assigned the genre Series in the genre-count cell, so depending on it makes
# this cell fail (no '개수' attribute) when the notebook runs top-to-bottom.
f_500 = data_f.pivot_table(index='영화제목', values='평점', aggfunc=['mean', 'count'])
f_500.columns = ['평균', '개수']
f_500 = f_500[f_500.개수 >= 500]

In [49]:
# Male rating stats limited to titles with at least 500 male ratings.
# `ex2_m` as built in this notebook only has the '남성평균' column, so filtering
# it on '개수' raised AttributeError; compute mean and count from data_m here.
m_500 = data_m.pivot_table(index='영화제목', values='평점', aggfunc=['mean', 'count'])
m_500.columns = ['평균', '개수']
m_500 = m_500[m_500.개수 >= 500]

In [50]:
f_500

Unnamed: 0_level_0,평균,개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
American Beauty (1999),4.238901,946
Babe (1995),3.953368,579
Back to the Future (1985),3.932707,639
Being John Malkovich (1999),4.15993,569
Braveheart (1995),4.016484,546
Casablanca (1942),4.30099,505
E.T. the Extra-Terrestrial (1982),4.08985,601
Fargo (1996),4.217656,657
Forrest Gump (1994),4.045031,644
Ghostbusters (1984),3.833962,530


In [56]:
# Absolute gap between the female and male mean rating per title. Titles
# present in only one of the two tables come out as NaN after alignment.
diff = (f_500['평균'] - m_500['평균']).abs()

In [57]:
# Keep strictly positive gaps (this also drops the NaN entries) and list the
# largest disagreement first.
diff.loc[diff.gt(0)].sort_values(ascending=False)

영화제목
Groundhog Day (1993)                                     0.305796
Saving Private Ryan (1998)                               0.284159
Braveheart (1995)                                        0.281355
Pulp Fiction (1994)                                      0.274883
Star Wars: Episode V - The Empire Strikes Back (1980)    0.238096
Jurassic Park (1993)                                     0.234791
Matrix, The (1999)                                       0.233830
Star Wars: Episode VI - Return of the Jedi (1983)        0.203821
Star Wars: Episode IV - A New Hope (1977)                0.192371
Raiders of the Lost Ark (1981)                           0.188429
E.T. the Extra-Terrestrial (1982)                        0.169586
Casablanca (1942)                                        0.160350
Wizard of Oz, The (1939)                                 0.151892
L.A. Confidential (1997)                                 0.150671
Silence of the Lambs, The (1991)                         0.109990
Ameri

In [91]:
# Overall (both genders) mean and count per title, restricted to titles with
# at least 500 ratings in total, as the exercise requires.
o_500 = data.pivot_table(
    values='평점',
    index='영화제목',
    aggfunc=['mean', 'count'],
)
o_500.columns = ['평균', '개수']
o_500 = o_500.loc[o_500['개수'].ge(500)]
o_500

Unnamed: 0_level_0,평균,개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
10 Things I Hate About You (1999),3.422857,700
101 Dalmatians (1961),3.596460,565
12 Angry Men (1957),4.295455,616
"13th Warrior, The (1999)",3.158667,750
"20,000 Leagues Under the Sea (1954)",3.702609,575
...,...,...
"X-Files: Fight the Future, The (1998)",3.492972,996
X-Men (2000),3.820649,1511
You've Got Mail (1998),3.380668,838
Young Frankenstein (1974),4.250629,1193


In [103]:
ex2_f

Unnamed: 0_level_0,여성평균
영화제목,Unnamed: 1_level_1
"$1,000,000 Duck (1971)",3.375000
'Night Mother (1986),3.388889
'Til There Was You (1997),2.675676
"'burbs, The (1989)",2.793478
...And Justice for All (1979),3.828571
...,...
Your Friends and Neighbors (1998),2.888889
"Zed & Two Noughts, A (1985)",3.500000
Zero Effect (1998),3.864407
Zeus and Roxanne (1997),2.777778


In [106]:
# Put the overall stats, the female mean and the male mean side by side.
# Inner joins keep only titles present in every table, i.e. the >= 500 set.
tmp = pd.concat([o_500, ex2_f], join='inner', axis=1)
fm = pd.concat([tmp, ex2_m], join='inner', axis=1)
fm

Unnamed: 0_level_0,평균,개수,여성평균,남성평균
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10 Things I Hate About You (1999),3.422857,700,3.646552,3.311966
101 Dalmatians (1961),3.596460,565,3.791444,3.500000
12 Angry Men (1957),4.295455,616,4.184397,4.328421
"13th Warrior, The (1999)",3.158667,750,3.112000,3.168000
"20,000 Leagues Under the Sea (1954)",3.702609,575,3.670103,3.709205
...,...,...,...,...
"X-Files: Fight the Future, The (1998)",3.492972,996,3.489474,3.493797
X-Men (2000),3.820649,1511,3.682310,3.851702
You've Got Mail (1998),3.380668,838,3.542424,3.275591
Young Frankenstein (1974),4.250629,1193,4.289963,4.239177


In [108]:
# Absolute difference between the female and male mean rating per title.
fm["평점차이"] = (fm['여성평균'] - fm['남성평균']).abs()
fm

Unnamed: 0_level_0,평균,개수,여성평균,남성평균,평점차이
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10 Things I Hate About You (1999),3.422857,700,3.646552,3.311966,0.334586
101 Dalmatians (1961),3.596460,565,3.791444,3.500000,0.291444
12 Angry Men (1957),4.295455,616,4.184397,4.328421,0.144024
"13th Warrior, The (1999)",3.158667,750,3.112000,3.168000,0.056000
"20,000 Leagues Under the Sea (1954)",3.702609,575,3.670103,3.709205,0.039102
...,...,...,...,...,...
"X-Files: Fight the Future, The (1998)",3.492972,996,3.489474,3.493797,0.004323
X-Men (2000),3.820649,1511,3.682310,3.851702,0.169391
You've Got Mail (1998),3.380668,838,3.542424,3.275591,0.266834
Young Frankenstein (1974),4.250629,1193,4.289963,4.239177,0.050785


In [109]:
# Result: the ten titles on which women and men disagree the most.
fm.nlargest(n=10, columns='평점차이')

Unnamed: 0_level_0,평균,개수,여성평균,남성평균,평점차이
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Dirty Dancing (1987),3.311499,687,3.790378,2.959596,0.830782
"Good, The Bad and The Ugly, The (1966)",4.13382,822,3.494949,4.2213,0.726351
Dumb & Dumber (1994),3.192424,660,2.697987,3.336595,0.638608
Evil Dead II (Dead By Dawn) (1987),3.826642,548,3.297297,3.909283,0.611985
Grease (1978),3.577723,817,3.975265,3.367041,0.608224
Caddyshack (1980),3.846949,967,3.396135,3.969737,0.573602
Animal House (1978),4.053024,1207,3.628906,4.167192,0.538286
"Exorcist, The (1973)",3.955932,885,3.537634,4.067239,0.529605
"Rocky Horror Picture Show, The (1975)",3.29116,1233,3.673016,3.160131,0.512885
Big Trouble in Little China (1986),3.414384,584,2.987952,3.48503,0.497078


In [159]:
# Per-title mean rating and rating count, split by gender into columns.
ex3 = data.pivot_table(
    values='평점',
    index='영화제목',
    columns='성별',
    aggfunc=['mean', 'count'],
)

In [164]:
# Keep titles with at least 500 ratings in total (female + male).
# A title that one gender never rated has NaN in that count column; a plain
# `+` would make the total NaN and drop the title regardless of the other
# gender's count, so the counts are added with fill_value=0 instead.
ex3 = ex3[ex3[('count', 'F')].add(ex3[('count', 'M')], fill_value=0) >= 500]

## [실습 #4] 연령대 별로 영화 평점 분석하기
연령대(10대 미만, 10대, 20대, ...50대) 컬럼을 추가한 후, 영화별 연령대별 영화평점 구하기

In [35]:
users.describe()

Unnamed: 0,사용자아이디,연령,직업
count,6040.0,6040.0,6040.0
mean,3020.5,30.639238,8.146854
std,1743.742145,12.895962,6.329511
min,1.0,1.0,0.0
25%,1510.75,25.0,3.0
50%,3020.5,25.0,7.0
75%,4530.25,35.0,14.0
max,6040.0,56.0,20.0


In [58]:
data

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,M,18,17,47901,2198,5,958846401,Modulations (1998),Documentary
1000205,5675,M,35,14,30030,2703,3,976029116,Broken Vessels (1998),Drama
1000206,5780,M,18,17,92886,2845,1,958153068,White Boys (1999),Drama
1000207,5851,F,18,20,55410,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western


In [61]:
round(23,-1)

20

In [68]:
23//10

2

In [69]:
def get_agegroup(X):
    """Return the start of the ten-year bracket that contains age ``X``.

    For example 23 maps to 20 and 1 maps to 0. Only ``//`` and ``*`` are
    applied to ``X``.
    """
    decade = X // 10
    return decade * 10

In [70]:
# Add the age-group column. get_agegroup only uses `//` and `*`, which work
# element-wise on a Series, so the whole '연령' column is passed in one call
# instead of invoking a Python lambda once per row via apply(axis=1).
data['연령대'] = get_agegroup(data['연령'])
data

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르,연령대
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama,50
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama,20
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama,20
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama,50
...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,M,18,17,47901,2198,5,958846401,Modulations (1998),Documentary,10
1000205,5675,M,35,14,30030,2703,3,976029116,Broken Vessels (1998),Drama,30
1000206,5780,M,18,17,92886,2845,1,958153068,White Boys (1999),Drama,10
1000207,5851,F,18,20,55410,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western,10


In [72]:
# Result in long form: mean rating per (title, age group) pair. The age
# group would be easier to compare as columns (done in the last cell).
data.pivot_table(
    values='평점',
    index=['영화제목', '연령대'],
    aggfunc='mean',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,평점
영화제목,연령대,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",10,3.000000
"$1,000,000 Duck (1971)",20,3.090909
"$1,000,000 Duck (1971)",30,3.133333
"$1,000,000 Duck (1971)",40,2.000000
"$1,000,000 Duck (1971)",50,2.750000
...,...,...
eXistenZ (1999),10,3.289157
eXistenZ (1999),20,3.234973
eXistenZ (1999),30,3.364865
eXistenZ (1999),40,3.222222


In [None]:
#교수
def calculate_ages(x):
    """Unimplemented placeholder: ignores ``x`` and returns ``None``."""
    return None

In [165]:
# Mean rating per title with one column per age group; cells are NaN where
# no user of that age group rated the title.
data.pivot_table(
    values='평점',
    index='영화제목',
    columns='연령대',
    aggfunc='mean',
)

연령대,0,10,20,30,40,50
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"$1,000,000 Duck (1971)",,3.000000,3.090909,3.133333,2.000000,2.750000
'Night Mother (1986),2.000000,4.666667,3.423077,2.904762,3.833333,3.750000
'Til There Was You (1997),3.500000,2.500000,2.666667,2.900000,2.333333,2.600000
"'burbs, The (1989)",4.500000,3.244444,2.652174,2.818182,2.545455,3.100000
...And Justice for All (1979),3.000000,3.428571,3.724138,3.657143,4.100000,3.674419
...,...,...,...,...,...,...
"Zed & Two Noughts, A (1985)",1.000000,3.000000,3.375000,3.777778,4.000000,3.000000
Zero Effect (1998),4.125000,3.883333,3.715278,3.608696,3.764706,3.769231
Zero Kelvin (Kjærlighetens kjøtere) (1995),,,,3.500000,,
Zeus and Roxanne (1997),1.500000,2.500000,2.833333,3.500000,1.000000,
