# Movielens 영화 별점 분석

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 데이터프레임 생성

* https://github.com/wesm/pydata-book/tree/2nd-edition/datasets/movielens 사트의 3개 데이터 파일을 읽어서 데이터프레임을 생성
* ratings와 movies를 merge
* 위에서 merge된 내용과 users를 merge

In [4]:
file_ratings = 'https://github.com/wesm/pydata-book/raw/2nd-edition/datasets/movielens/ratings.dat'
file_movies = 'https://github.com/wesm/pydata-book/raw/2nd-edition/datasets/movielens/movies.dat'
file_users = 'https://github.com/wesm/pydata-book/raw/2nd-edition/datasets/movielens/users.dat'

In [5]:
movie_cols = ['movie_id', 'title', 'genres']
rating_cols = ['user_id', 'movie_id', 'rating', 'timestam']
user_cols = ['user_id', 'gender', 'age', 'occupation', 'zip_code']

In [6]:
movies = pd.read_csv(file_movies, sep='::', encoding='cp1252', 
                     header=None, names=movie_cols, engine='python')

In [7]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genres    3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [9]:
ratings = pd.read_csv(file_ratings, sep='::', encoding='cp1252',
                      header=None, names=rating_cols, engine='python')

In [10]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestam
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column    Non-Null Count    Dtype
---  ------    --------------    -----
 0   user_id   1000209 non-null  int64
 1   movie_id  1000209 non-null  int64
 2   rating    1000209 non-null  int64
 3   timestam  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [12]:
users = pd.read_csv(file_users, sep='::', encoding='cp1252',
                  header=None, names=user_cols, engine='python')

In [13]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [14]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     6040 non-null   int64 
 1   gender      6040 non-null   object
 2   age         6040 non-null   int64 
 3   occupation  6040 non-null   int64 
 4   zip_code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [15]:
df = pd.merge(left=pd.merge(left=ratings, right=movies),
                            right=users)

In [16]:
df.head()

Unnamed: 0,user_id,movie_id,rating,timestam,title,genres,gender,age,occupation,zip_code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   user_id     1000209 non-null  int64 
 1   movie_id    1000209 non-null  int64 
 2   rating      1000209 non-null  int64 
 3   timestam    1000209 non-null  int64 
 4   title       1000209 non-null  object
 5   genres      1000209 non-null  object
 6   gender      1000209 non-null  object
 7   age         1000209 non-null  int64 
 8   occupation  1000209 non-null  int64 
 9   zip_code    1000209 non-null  object
dtypes: int64(6), object(4)
memory usage: 83.9+ MB


In [18]:
df['rating'].describe()

count    1.000209e+06
mean     3.581564e+00
std      1.117102e+00
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

* 평점 평균 상위 10개 영화 제목 

In [19]:
rating_by_title = df.groupby('title')['rating'].mean()

In [20]:
rating_by_title.sort_values(ascending=False).head(n=20)

title
Gate of Heavenly Peace, The (1995)                                     5.000000
Lured (1947)                                                           5.000000
Ulysses (Ulisse) (1954)                                                5.000000
Smashing Time (1967)                                                   5.000000
Follow the Bitch (1998)                                                5.000000
Song of Freedom (1936)                                                 5.000000
Bittersweet Motel (2000)                                               5.000000
Baby, The (1973)                                                       5.000000
One Little Indian (1973)                                               5.000000
Schlafes Bruder (Brother of Sleep) (1995)                              5.000000
I Am Cuba (Soy Cuba/Ya Kuba) (1964)                                    4.800000
Lamerica (1994)                                                        4.750000
Apple, The (Sib) (1998)           

* 평점 평균 하위 10개 영화 제목 

In [21]:
rating_by_title.sort_values(ascending=True).head(n=10)

title
Elstree Calling (1930)                                        1.0
Get Over It (1996)                                            1.0
Venice/Venice (1992)                                          1.0
Windows (1980)                                                1.0
Kestrel's Eye (Falkens öga) (1998)                            1.0
McCullochs, The (1975)                                        1.0
Sleepover (1995)                                              1.0
Torso (Corpi Presentano Tracce di Violenza Carnale) (1973)    1.0
Spring Fever USA (a.k.a. Lauderdale) (1989)                   1.0
Santa with Muscles (1996)                                     1.0
Name: rating, dtype: float64

In [22]:
df.groupby(['title'])['rating'].agg(['count', 'mean']).sort_values(by='mean', ascending=False).head(20)

Unnamed: 0_level_0,count,mean
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Ulysses (Ulisse) (1954),1,5.0
Lured (1947),1,5.0
Follow the Bitch (1998),1,5.0
Bittersweet Motel (2000),1,5.0
Song of Freedom (1936),1,5.0
One Little Indian (1973),1,5.0
Smashing Time (1967),2,5.0
Schlafes Bruder (Brother of Sleep) (1995),1,5.0
"Gate of Heavenly Peace, The (1995)",3,5.0
"Baby, The (1973)",1,5.0


* 성별(남성/여성) 평점 평균 상위 10개 영화 제목

In [23]:
rating_by_title_gender = df.groupby(['title', 'gender'])['rating'].mean()
rating_by_title_gender

title                                       gender
$1,000,000 Duck (1971)                      F         3.375000
                                            M         2.761905
'Night Mother (1986)                        F         3.388889
                                            M         3.352941
'Til There Was You (1997)                   F         2.675676
                                                        ...   
Zero Kelvin (Kjærlighetens kjøtere) (1995)  M         3.500000
Zeus and Roxanne (1997)                     F         2.777778
                                            M         2.357143
eXistenZ (1999)                             F         3.098592
                                            M         3.289086
Name: rating, Length: 7152, dtype: float64

In [24]:
# multi-index -> columns: unstack
rating_by_title_gender.unstack()

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375000,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024
...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952
Zero Effect (1998),3.864407,3.723140
Zero Kelvin (Kjærlighetens kjøtere) (1995),,3.500000
Zeus and Roxanne (1997),2.777778,2.357143


In [25]:
rating_by_title_gender = df.pivot_table(values= 'rating', index='title', columns='gender', aggfunc='mean')
rating_by_title_gender

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375000,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024
...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952
Zero Effect (1998),3.864407,3.723140
Zero Kelvin (Kjærlighetens kjøtere) (1995),,3.500000
Zeus and Roxanne (1997),2.777778,2.357143


In [26]:
# 여성 평점 상위 10개 제목 
rating_by_title_gender.sort_values(by='F', ascending=False).head(n=10)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Clean Slate (Coup de Torchon) (1981),5.0,3.857143
"Ballad of Narayama, The (Narayama Bushiko) (1958)",5.0,3.428571
Raw Deal (1948),5.0,3.307692
Bittersweet Motel (2000),5.0,
Skipped Parts (2000),5.0,4.0
Lamerica (1994),5.0,4.666667
"Gambler, The (A Játékos) (1997)",5.0,3.166667
"Brother, Can You Spare a Dime? (1975)",5.0,3.642857
Ayn Rand: A Sense of Life (1997),5.0,4.0
24 7: Twenty Four Seven (1997),5.0,3.75


In [27]:
# 남성 평점 상위 10개 영화 제목 
rating_by_title_gender.sort_values(by='M', ascending=False).head(n=10)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Schlafes Bruder (Brother of Sleep) (1995),,5.0
Small Wonders (1996),3.333333,5.0
"Gate of Heavenly Peace, The (1995)",5.0,5.0
"Baby, The (1973)",,5.0
Ulysses (Ulisse) (1954),,5.0
Dangerous Game (1993),4.0,5.0
Angela (1995),3.0,5.0
"Bells, The (1926)",4.0,5.0
Smashing Time (1967),,5.0
Follow the Bitch (1998),,5.0


* 1~2명의 사용자가 아니라 많은 사용자들로부터 평가를 받은 영화들 중에서 평점 순위 
    * 영화별 평가 숫자 
    * 많은 사용자 평가 기준 

In [28]:
title_counts = df['title'].value_counts()
title_counts

American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
                                                         ... 
Broken Vessels (1998)                                       1
Bloody Child, The (1996)                                    1
Ring, The (1927)                                            1
Slappy and the Stinkers (1998)                              1
Harlem (1993)                                               1
Name: title, Length: 3706, dtype: int64

In [29]:
title_counts.describe()

count    3706.000000
mean      269.889099
std       384.047838
min         1.000000
25%        33.000000
50%       123.500000
75%       350.000000
max      3428.000000
Name: title, dtype: float64

In [30]:
# 사용자들로부터 평가를 가장 많이 받은 영화 상위 20개 
title_counts.head(n=20)

American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
Saving Private Ryan (1998)                               2653
Terminator 2: Judgment Day (1991)                        2649
Matrix, The (1999)                                       2590
Back to the Future (1985)                                2583
Silence of the Lambs, The (1991)                         2578
Men in Black (1997)                                      2538
Raiders of the Lost Ark (1981)                           2514
Fargo (1996)                                             2513
Sixth Sense, The (1999)                                  2459
Braveheart (1995)                                        2443
Shakespeare in Love (1998)                               2369
Princess

In [31]:
# 150명 이상의 사용자들로부터 평가를 받은 영화 제목들
indexer = title_counts[title_counts >= 150].index
indexer

Index(['American Beauty (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Star Wars: Episode VI - Return of the Jedi (1983)',
       'Jurassic Park (1993)', 'Saving Private Ryan (1998)',
       'Terminator 2: Judgment Day (1991)', 'Matrix, The (1999)',
       'Back to the Future (1985)', 'Silence of the Lambs, The (1991)',
       ...
       'Nixon (1995)', 'Cowboy Way, The (1994)', 'Program, The (1993)',
       'Twelfth Night (1996)', 'Golden Voyage of Sinbad, The (1974)',
       'In the Army Now (1994)', 'Asphalt Jungle, The (1950)',
       'Tales from the Crypt Presents: Bordello of Blood (1996)',
       'Love and Death on Long Island (1997)',
       'Police Academy 5: Assignment: Miami Beach (1988)'],
      dtype='object', length=1683)

In [32]:
indexer.nlevels

1

In [33]:
# 150회 이상 평가를 받은 영화들의 평점 평균
rating_by_title.loc[indexer]

American Beauty (1999)                                     4.317386
Star Wars: Episode IV - A New Hope (1977)                  4.453694
Star Wars: Episode V - The Empire Strikes Back (1980)      4.292977
Star Wars: Episode VI - Return of the Jedi (1983)          4.022893
Jurassic Park (1993)                                       3.763847
                                                             ...   
In the Army Now (1994)                                     2.225166
Asphalt Jungle, The (1950)                                 3.927152
Tales from the Crypt Presents: Bordello of Blood (1996)    2.589404
Love and Death on Long Island (1997)                       3.430464
Police Academy 5: Assignment: Miami Beach (1988)           1.766667
Name: rating, Length: 1683, dtype: float64

In [34]:
# 상위 20개 
rating_by_title.loc[indexer].sort_values(ascending=False).head(n=20)

Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)            4.560510
Shawshank Redemption, The (1994)                                               4.554558
Godfather, The (1972)                                                          4.524966
Close Shave, A (1995)                                                          4.520548
Usual Suspects, The (1995)                                                     4.517106
Schindler's List (1993)                                                        4.510417
Wrong Trousers, The (1993)                                                     4.507937
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)                                  4.491489
Raiders of the Lost Ark (1981)                                                 4.477725
Rear Window (1954)                                                             4.476190
Paths of Glory (1957)                                                          4.473913
Star Wars: Episode IV - A New Ho

In [35]:
rating_by_title_gender.loc[indexer]

gender,F,M
American Beauty (1999),4.238901,4.347301
Star Wars: Episode IV - A New Hope (1977),4.302937,4.495307
Star Wars: Episode V - The Empire Strikes Back (1980),4.106481,4.344577
Star Wars: Episode VI - Return of the Jedi (1983),3.865237,4.069058
Jurassic Park (1993),3.579407,3.814197
...,...,...
In the Army Now (1994),2.384615,2.192000
"Asphalt Jungle, The (1950)",3.571429,4.008130
Tales from the Crypt Presents: Bordello of Blood (1996),2.727273,2.565891
Love and Death on Long Island (1997),3.116279,3.555556


In [36]:
# 여성 평점 상위 20위
rating_by_title_gender.loc[indexer].sort_values(by='F', ascending=False)

gender,F,M
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
"General, The (1927)",4.575758,4.329480
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.572650,4.464589
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075
...,...,...
Friday the 13th Part 3: 3D (1982),1.758621,2.017647
Showgirls (1995),1.709091,2.166667
Friday the 13th: The Final Chapter (1984),1.636364,2.258503
Barb Wire (1996),1.585366,2.100386


In [37]:
# 남성 평점 상위 10개
rating_by_title_gender.loc[indexer].sort_values(by='M', ascending=False)

gender,F,M
"Godfather, The (1972)",4.314700,4.583333
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.481132,4.576628
"Shawshank Redemption, The (1994)",4.539075,4.560625
Raiders of the Lost Ark (1981),4.332168,4.520597
"Usual Suspects, The (1995)",4.513317,4.518248
...,...,...
Stop! Or My Mom Will Shoot (1992),1.794872,1.778571
Police Academy 5: Assignment: Miami Beach (1988),1.935484,1.722689
Home Alone 3 (1997),2.486486,1.683761
Battlefield Earth (2000),1.574468,1.616949


* 성별 선호도 차이

In [38]:
# diff = F - M 파생 변수 추가
rating_by_title_gender['diff'] = rating_by_title_gender['F'] - rating_by_title_gender['M']

In [39]:
rating_by_title_gender

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"$1,000,000 Duck (1971)",3.375000,2.761905,0.613095
'Night Mother (1986),3.388889,3.352941,0.035948
'Til There Was You (1997),2.675676,2.733333,-0.057658
"'burbs, The (1989)",2.793478,2.962085,-0.168607
...And Justice for All (1979),3.828571,3.689024,0.139547
...,...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952,0.119048
Zero Effect (1998),3.864407,3.723140,0.141266
Zero Kelvin (Kjærlighetens kjøtere) (1995),,3.500000,
Zeus and Roxanne (1997),2.777778,2.357143,0.420635


In [40]:
# 150회 이상 평가를 받은 영화들 중에서 여성의 선호도가 높은 영화 상위 20개 
rating_by_title_gender.loc[indexer].sort_values(by='diff', ascending=False).head(10)

gender,F,M,diff
Dirty Dancing (1987),3.790378,2.959596,0.830782
Home Alone 3 (1997),2.486486,1.683761,0.802726
"To Wong Foo, Thanks for Everything! Julie Newmar (1995)",3.486842,2.795276,0.691567
Jumpin' Jack Flash (1986),3.254717,2.578358,0.676359
Dracula: Dead and Loving It (1995),2.892857,2.25,0.642857
Grease (1978),3.975265,3.367041,0.608224
Police Academy 4: Citizens on Patrol (1987),2.40625,1.802817,0.603433
Brokedown Palace (1999),3.3125,2.723577,0.588923
"Relic, The (1997)",3.309524,2.723077,0.586447
Angels in the Outfield (1994),3.1625,2.580838,0.581662


In [41]:
# 150회 이상 평가를 받은 영화들 중에서 남성의 선호도가 높은 영화 상위 20개 
rating_by_title_gender.loc[indexer].sort_values(by='diff').head(10)

gender,F,M,diff
Lifeforce (1985),2.25,2.994152,-0.744152
Quest for Fire (1981),2.578947,3.309677,-0.73073
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,-0.726351
No Escape (1994),2.3,2.994048,-0.694048
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,-0.676359
Tora! Tora! Tora! (1970),3.090909,3.737705,-0.646796
Up in Smoke (1978),2.944444,3.585227,-0.640783
Dumb & Dumber (1994),2.697987,3.336595,-0.638608
Friday the 13th: The Final Chapter (1984),1.636364,2.258503,-0.62214
"Longest Day, The (1962)",3.411765,4.031447,-0.619682


* 남녀 평점 평균의 차이가 큰 영화들 중에서, 여성들이 더 높은 평점을 준 영화 상위 50개의 영화에 자주 등장하는 장르 5개를 찾아보세요

In [45]:
# 150회 이상 평가를 받은 영화를 diff의 내림차순 정렬을 해서 상위 50개를 선택
diff_female_top50 = rating_by_title_gender.loc[indexer].sort_values(by='diff', ascending=False).head(50)
diff_female_top50.index

Index(['Dirty Dancing (1987)', 'Home Alone 3 (1997)',
       'To Wong Foo, Thanks for Everything! Julie Newmar (1995)',
       'Jumpin' Jack Flash (1986)', 'Dracula: Dead and Loving It (1995)',
       'Grease (1978)', 'Police Academy 4: Citizens on Patrol (1987)',
       'Brokedown Palace (1999)', 'Relic, The (1997)',
       'Angels in the Outfield (1994)', 'Little Women (1994)',
       'Son in Law (1993)', 'Other Sister, The (1999)',
       'Steel Magnolias (1989)', 'Mirror Has Two Faces, The (1996)',
       'Anastasia (1997)', 'Rocky Horror Picture Show, The (1975)',
       'Santa Claus: The Movie (1985)', 'Color Purple, The (1985)',
       'Nell (1994)', 'Waiting to Exhale (1995)', 'Suspicion (1941)',
       'Baby Geniuses (1999)', 'Wing Commander (1999)',
       'Age of Innocence, The (1993)', 'Free Willy (1993)',
       'French Kiss (1995)', 'Gigi (1958)', 'Affair to Remember, An (1957)',
       'Little Shop of Horrors, The (1960)', '200 Cigarettes (1999)',
       'Guys and Dolls 

In [47]:
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [50]:
# movies 데이터프레임에서 title 컬럼을 index로 만듦
movies_with_index = movies.set_index(keys='title')
movies_with_index

Unnamed: 0_level_0,movie_id,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),1,Animation|Children's|Comedy
Jumanji (1995),2,Adventure|Children's|Fantasy
Grumpier Old Men (1995),3,Comedy|Romance
Waiting to Exhale (1995),4,Comedy|Drama
Father of the Bride Part II (1995),5,Comedy
...,...,...
Meet the Parents (2000),3948,Comedy
Requiem for a Dream (2000),3949,Drama
Tigerland (2000),3950,Drama
Two Family House (2000),3951,Drama


In [64]:
diff_female_movies = movies_with_index.loc[diff_female_top50.index]

In [65]:
diff_female_movies.head()

Unnamed: 0,movie_id,genres
Dirty Dancing (1987),1088,Musical|Romance
Home Alone 3 (1997),1707,Children's|Comedy
"To Wong Foo, Thanks for Everything! Julie Newmar (1995)",203,Comedy
Jumpin' Jack Flash (1986),2468,Action|Comedy|Romance|Thriller
Dracula: Dead and Loving It (1995),12,Comedy|Horror


In [66]:
diff_female_movies.tail()

Unnamed: 0,movie_id,genres
Sommersby (1993),2875,Drama|Mystery|Romance
Nighthawks (1981),3029,Action|Drama
"Skulls, The (2000)",3484,Thriller
Spellbound (1945),931,Mystery|Romance|Thriller
"Corrina, Corrina (1994)",351,Comedy|Drama|Romance


In [74]:
test = []
test.append(['a', 'b'])
test

[['a', 'b']]

In [73]:
test = []
test.extend(['a', 'b'])
test

['a', 'b']

In [98]:
diff_female_genres = []
for genres in diff_female_movies['genres']:
    # print(genres.split(sep='|'))
    diff_female_genres.extend(genres.split(sep='|'))

diff_female_genres = pd.Series(diff_female_genres)
diff_female_genres.value_counts()

Comedy         23
Drama          17
Romance        15
Musical        10
Children's      7
Thriller        5
Mystery         4
Horror          4
Action          4
Adventure       4
War             3
Sci-Fi          2
Documentary     1
Fantasy         1
Animation       1
dtype: int64

* 남녀 평점 평균의 차이가 큰 영화들 중에서, 남성들이 더 높은 평점을 준 영화 상위 50개의 영화에 자주 등장하는 장르 5개를 찾아보세요

In [109]:
diff_male_top50 = rating_by_title_gender.loc[indexer].sort_values(by='diff', ascending=True).head(50)
diff_male_top50.index

Index(['Lifeforce (1985)', 'Quest for Fire (1981)',
       'Good, The Bad and The Ugly, The (1966)', 'No Escape (1994)',
       'Kentucky Fried Movie, The (1977)', 'Tora! Tora! Tora! (1970)',
       'Up in Smoke (1978)', 'Dumb & Dumber (1994)',
       'Friday the 13th: The Final Chapter (1984)', 'Longest Day, The (1962)',
       'Cable Guy, The (1996)', 'Evil Dead II (Dead By Dawn) (1987)',
       'Once Upon a Time in the West (1969)', 'Hidden, The (1987)',
       'Rocky III (1982)', 'Transformers: The Movie, The (1986)',
       'Nutty Professor II: The Klumps (2000)', 'Caddyshack (1980)',
       'For a Few Dollars More (1965)', 'Porky's (1981)',
       'Animal House (1978)', 'Exorcist, The (1973)', 'Runaway Train (1985)',
       'Fright Night (1985)', 'Barb Wire (1996)', 'Screamers (1995)',
       'Rocky II (1979)', 'Beavis and Butt-head Do America (1996)',
       'Big Trouble in Little China (1986)', 'From Dusk Till Dawn (1996)',
       'Wild Bunch, The (1969)', 'In the Mouth of Madn

In [80]:
diff_male_movies = movies_with_index.loc[diff_male_top50.index]

In [81]:
diff_male_movies.head()

Unnamed: 0,movie_id,genres
Lifeforce (1985),2377,Horror|Sci-Fi
Quest for Fire (1981),3036,Adventure
"Good, The Bad and The Ugly, The (1966)",1201,Action|Western
No Escape (1994),504,Action|Sci-Fi
"Kentucky Fried Movie, The (1977)",3760,Comedy


In [107]:
diff_male_genres = []
for genres in diff_male_movies['genres']:
    diff_male_genres.extend(genres.split(sep='|'))

diff_male_genres = pd.Series(diff_male_genres)
diff_male_genres.value_counts().head(5)

Comedy      20
Action      15
Horror      10
Drama       10
Thriller     8
dtype: int64

* 150회 이상 평가를 받은 여성 평점 평균 상위 50개의 영화에 자주 등장하는 장르 5개를 찾아보세요

In [88]:
female_top50 = rating_by_title_gender.loc[indexer].sort_values(by='F', ascending=False).head(n=50)

In [89]:
female_top50.head()

gender,F,M,diff
"Close Shave, A (1995)",4.644444,4.473795,0.17065
"Wrong Trousers, The (1993)",4.588235,4.478261,0.109974
"General, The (1927)",4.575758,4.32948,0.246278
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589,0.10806
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075,0.178032


In [90]:
female_top50.tail()

gender,F,M,diff
Stop Making Sense (1984),4.333333,4.203297,0.130037
Singin' in the Rain (1952),4.333333,4.251101,0.082232
All About My Mother (Todo Sobre Mi Madre) (1999),4.333333,3.944,0.389333
Raiders of the Lost Ark (1981),4.332168,4.520597,-0.188429
Citizen Kane (1941),4.332143,4.407895,-0.075752


In [94]:
female_movies = movies_with_index.loc[female_top50.index]
female_movies.head()

Unnamed: 0,movie_id,genres
"Close Shave, A (1995)",745,Animation|Comedy|Thriller
"Wrong Trousers, The (1993)",1148,Animation|Comedy
"General, The (1927)",3022,Comedy
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),922,Film-Noir
Wallace & Gromit: The Best of Aardman Animation (1996),720,Animation


In [106]:
female_genres = []
for genres in female_movies['genres']:
    female_genres.extend(genres.split(sep='|'))

female_genres = pd.Series(female_genres)
female_genres.value_counts().head(6)

Drama        22
Comedy       14
Thriller     11
Romance       7
Film-Noir     6
War           6
dtype: int64

* 150회 이상 평가를 받은 남성 평점 평균 상위 50개의 영화에 자주 등장하는 장르 5개를 찾아보세요

In [99]:
male_top50 = rating_by_title_gender.loc[indexer].sort_values(by='M', ascending=False).head(n=50)

In [101]:
male_top50.head()

gender,F,M,diff
"Godfather, The (1972)",4.3147,4.583333,-0.268634
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.481132,4.576628,-0.095496
"Shawshank Redemption, The (1994)",4.539075,4.560625,-0.02155
Raiders of the Lost Ark (1981),4.332168,4.520597,-0.188429
"Usual Suspects, The (1995)",4.513317,4.518248,-0.004931


In [102]:
male_top50.index

Index(['Godfather, The (1972)',
       'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)',
       'Shawshank Redemption, The (1994)', 'Raiders of the Lost Ark (1981)',
       'Usual Suspects, The (1995)',
       'Star Wars: Episode IV - A New Hope (1977)', 'Schindler's List (1993)',
       'Paths of Glory (1957)', 'Wrong Trousers, The (1993)',
       'Close Shave, A (1995)', 'Rear Window (1954)',
       'Double Indemnity (1944)',
       'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)',
       'Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)', 'Casablanca (1942)',
       'Third Man, The (1949)', 'Lawrence of Arabia (1962)',
       'Godfather: Part II, The (1974)',
       'One Flew Over the Cuckoo's Nest (1975)', 'Maltese Falcon, The (1941)',
       'Great Escape, The (1963)', 'Citizen Kane (1941)', 'Yojimbo (1961)',
       'Bridge on the River Kwai, The (1957)', 'Saving Private Ryan (1998)',
       'North by Northwest (1959)',
       'Wallac

In [104]:
male_movies = movies_with_index.loc[male_top50.index]
male_movies.head()

Unnamed: 0,movie_id,genres
"Godfather, The (1972)",858,Action|Crime|Drama
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),2019,Action|Drama
"Shawshank Redemption, The (1994)",318,Drama
Raiders of the Lost Ark (1981),1198,Action|Adventure
"Usual Suspects, The (1995)",50,Crime|Thriller


In [105]:
male_genres = []
for genres in male_movies['genres']:
    male_genres.extend(genres.split(sep='|'))

male_genres = pd.Series(male_genres)
male_genres.value_counts().head()

Drama       26
War         11
Thriller    10
Action       9
Comedy       8
dtype: int64