# MovieLens Rating Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Create DataFrame
1. Read each `::`-delimited `.dat` file into a DataFrame
2. Merge the ratings df with the movies df
3. Merge the result of step 2 with the users df

## File URL

In [2]:
# Locations of the MovieLens .dat files in the pydata-book repository
# (2nd-edition branch): the shared directory plus one file name each.
_movielens_dir = 'https://github.com/wesm/pydata-book/raw/2nd-edition/datasets/movielens/'
movies_data = _movielens_dir + 'movies.dat'
ratings_data = _movielens_dir + 'ratings.dat'
users_data = _movielens_dir + 'users.dat'

In [3]:
# Column names for each raw file, in file order. The files are read with
# header = None, so these lists are what gives the columns their names.
movie_cols = [
    'movie_id',
    'title',
    'genres',
]
rating_cols = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]
user_cols = [
    'user_id',
    'gender',
    'age',
    'occupation',
    'zip_code',
]

In [4]:
# Load the movies file: fields are separated by '::', there is no header
# row, and the text is decoded as cp1252. The multi-character separator is
# why the Python parser engine is requested.
movies = pd.read_csv(
    movies_data,
    sep = '::',
    header = None,
    names = movie_cols,
    encoding = 'cp1252',
    engine = 'python',
)

In [5]:
# Load the ratings file: fields are separated by '::', there is no header
# row, and the text is decoded as cp1252. The multi-character separator is
# why the Python parser engine is requested.
ratings = pd.read_csv(
    ratings_data,
    sep = '::',
    header = None,
    names = rating_cols,
    encoding = 'cp1252',
    engine = 'python',
)

In [6]:
# Load the users file: fields are separated by '::', there is no header
# row, and the text is decoded as cp1252. The multi-character separator is
# why the Python parser engine is requested.
users = pd.read_csv(
    users_data,
    sep = '::',
    header = None,
    names = user_cols,
    encoding = 'cp1252',
    engine = 'python',
)

# Inspect the Data Structure

In [7]:
# Preview the first five rows of the movies table.
movies.head(n = 5)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Preview the first five rows of the ratings table.
ratings.head(n = 5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
# Preview the first five rows of the users table.
users.head(n = 5)

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [10]:
# Print the users table's column dtypes, non-null counts and memory usage.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     6040 non-null   int64 
 1   gender      6040 non-null   object
 2   age         6040 non-null   int64 
 3   occupation  6040 non-null   int64 
 4   zip_code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


# Merge ratings df, movies df, users df

In [11]:
# Inner-join the three tables: ratings -> movies on movie_id, then the
# result -> users on user_id. The join keys are spelled out rather than left
# to pandas' default of "every column name the two frames share", and
# validate = 'many_to_one' makes pandas raise a MergeError if a movie_id or
# user_id is duplicated in the lookup table (which would otherwise silently
# multiply rating rows).
df = pd.merge(
    left = pd.merge(left = ratings, right = movies,
                    on = 'movie_id', how = 'inner', validate = 'many_to_one'),
    right = users,
    on = 'user_id', how = 'inner', validate = 'many_to_one',
)

In [12]:
# Preview the first five rows of the merged ratings/movies/users table.
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip_code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067


In [13]:
# Print the merged table's column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   user_id     1000209 non-null  int64 
 1   movie_id    1000209 non-null  int64 
 2   rating      1000209 non-null  int64 
 3   timestamp   1000209 non-null  int64 
 4   title       1000209 non-null  object
 5   genres      1000209 non-null  object
 6   gender      1000209 non-null  object
 7   age         1000209 non-null  int64 
 8   occupation  1000209 non-null  int64 
 9   zip_code    1000209 non-null  object
dtypes: int64(6), object(4)
memory usage: 116.2+ MB


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of all ratings.
df['rating'].describe()

count    1.000209e+06
mean     3.581564e+00
std      1.117102e+00
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

# Find Top 10

## Top 10 movie titles by mean rating

In [15]:
# Mean rating for each movie title, as a pd.Series indexed by title.
rating_by_title = df.groupby(by = 'title')['rating'].agg('mean')
rating_by_title

title
$1,000,000 Duck (1971)                        3.027027
'Night Mother (1986)                          3.371429
'Til There Was You (1997)                     2.692308
'burbs, The (1989)                            2.910891
...And Justice for All (1979)                 3.713568
                                                ...   
Zed & Two Noughts, A (1985)                   3.413793
Zero Effect (1998)                            3.750831
Zero Kelvin (Kjærlighetens kjøtere) (1995)    3.500000
Zeus and Roxanne (1997)                       2.521739
eXistenZ (1999)                               3.256098
Name: rating, Length: 3706, dtype: float64

In [16]:
# Ten highest mean ratings. No minimum number of ratings is required here,
# so a title with a single 5-star rating can appear in this list.
rating_by_title.sort_values(ascending = False).head(10)

title
Ulysses (Ulisse) (1954)                      5.0
Lured (1947)                                 5.0
Follow the Bitch (1998)                      5.0
Bittersweet Motel (2000)                     5.0
Song of Freedom (1936)                       5.0
One Little Indian (1973)                     5.0
Smashing Time (1967)                         5.0
Schlafes Bruder (Brother of Sleep) (1995)    5.0
Gate of Heavenly Peace, The (1995)           5.0
Baby, The (1973)                             5.0
Name: rating, dtype: float64

## Bottom 10 movie titles by mean rating

In [17]:
# Ten lowest mean ratings (ascending sort). No minimum number of ratings is
# required here either.
rating_by_title.sort_values().head(10)

title
Elstree Calling (1930)                                        1.0
Get Over It (1996)                                            1.0
Venice/Venice (1992)                                          1.0
Windows (1980)                                                1.0
Kestrel's Eye (Falkens öga) (1998)                            1.0
McCullochs, The (1975)                                        1.0
Sleepover (1995)                                              1.0
Torso (Corpi Presentano Tracce di Violenza Carnale) (1973)    1.0
Spring Fever USA (a.k.a. Lauderdale) (1989)                   1.0
Santa with Muscles (1996)                                     1.0
Name: rating, dtype: float64

In [18]:
# Top 10 titles by mean rating, shown next to the number of ratings behind
# each mean.
(
    df.groupby('title')['rating']
    .agg(['count', 'mean'])
    .sort_values(by = 'mean', ascending = False)
    .head(10)
)

Unnamed: 0_level_0,count,mean
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Ulysses (Ulisse) (1954),1,5.0
Lured (1947),1,5.0
Follow the Bitch (1998),1,5.0
Bittersweet Motel (2000),1,5.0
Song of Freedom (1936),1,5.0
One Little Indian (1973),1,5.0
Smashing Time (1967),2,5.0
Schlafes Bruder (Brother of Sleep) (1995),1,5.0
"Gate of Heavenly Peace, The (1995)",3,5.0
"Baby, The (1973)",1,5.0


## Top 10 movie titles by mean rating, by gender

### using groupby() method

In [19]:
# Mean rating for every (title, gender) pair; the result is a Series with a
# two-level (title, gender) index.
rating_by_title_gender = df.groupby(by = ['title', 'gender'])['rating'].agg('mean')
rating_by_title_gender

title                                       gender
$1,000,000 Duck (1971)                      F         3.375000
                                            M         2.761905
'Night Mother (1986)                        F         3.388889
                                            M         3.352941
'Til There Was You (1997)                   F         2.675676
                                                        ...   
Zero Kelvin (Kjærlighetens kjøtere) (1995)  M         3.500000
Zeus and Roxanne (1997)                     F         2.777778
                                            M         2.357143
eXistenZ (1999)                             F         3.098592
                                            M         3.289086
Name: rating, Length: 7152, dtype: float64

In [20]:
# unstack() moves the innermost index level (gender) into columns, leaving
# one row per title; a title with no ratings from one gender gets NaN there.
rating_by_title_gender.unstack(level = -1)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375000,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024
...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952
Zero Effect (1998),3.864407,3.723140
Zero Kelvin (Kjærlighetens kjøtere) (1995),,3.500000
Zeus and Roxanne (1997),2.777778,2.357143


### using pivot_table() method

In [21]:
# The same title x gender table of mean ratings, built with pivot_table().
# NOTE: this rebinds rating_by_title_gender (a Series from the groupby cell
# above) to a DataFrame with one column per gender.
rating_by_title_gender = df.pivot_table(
    index = 'title',
    columns = 'gender',
    values = 'rating',
    aggfunc = 'mean',
)
rating_by_title_gender

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375000,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024
...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952
Zero Effect (1998),3.864407,3.723140
Zero Kelvin (Kjærlighetens kjøtere) (1995),,3.500000
Zeus and Roxanne (1997),2.777778,2.357143


### Top 10 movie titles by mean rating: female / male

In [23]:
# Ten titles with the highest mean rating among female users
# (no minimum number of ratings is required here).
(
    rating_by_title_gender
    .sort_values(by = 'F', ascending = False)
    .head(10)
)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Clean Slate (Coup de Torchon) (1981),5.0,3.857143
"Ballad of Narayama, The (Narayama Bushiko) (1958)",5.0,3.428571
Raw Deal (1948),5.0,3.307692
Bittersweet Motel (2000),5.0,
Skipped Parts (2000),5.0,4.0
Lamerica (1994),5.0,4.666667
"Gambler, The (A Játékos) (1997)",5.0,3.166667
"Brother, Can You Spare a Dime? (1975)",5.0,3.642857
Ayn Rand: A Sense of Life (1997),5.0,4.0
24 7: Twenty Four Seven (1997),5.0,3.75


In [24]:
# Ten titles with the highest mean rating among male users
# (no minimum number of ratings is required here).
(
    rating_by_title_gender
    .sort_values(by = 'M', ascending = False)
    .head(10)
)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Schlafes Bruder (Brother of Sleep) (1995),,5.0
Small Wonders (1996),3.333333,5.0
"Gate of Heavenly Peace, The (1995)",5.0,5.0
"Baby, The (1973)",,5.0
Ulysses (Ulisse) (1954),,5.0
Dangerous Game (1993),4.0,5.0
Angela (1995),3.0,5.0
"Bells, The (1926)",4.0,5.0
Smashing Time (1967),,5.0
Follow the Bitch (1998),,5.0


# Remove movies with a low evaluation count
* Count the evaluations per movie
* Keep only the movies with many evaluations

In [25]:
# Number of ratings each title received, ordered from most to least rated.
title_counts = df['title'].value_counts()
title_counts

American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
                                                         ... 
Waltzes from Vienna (1933)                                  1
Happy Weekend (1996)                                        1
Trois (2000)                                                1
Beloved/Friend (Amigo/Amado) (1999)                         1
Terror in a Texas Town (1958)                               1
Name: title, Length: 3706, dtype: int64

In [26]:
# Distribution of the per-title rating counts (used to pick a cut-off).
title_counts.describe()

count    3706.000000
mean      269.889099
std       384.047838
min         1.000000
25%        33.000000
50%       123.500000
75%       350.000000
max      3428.000000
Name: title, dtype: float64

In [27]:
# The 20 most-rated movies; value_counts() already returned the counts in
# descending order, so the head is the top of the ranking.
title_counts.head(20)

American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
Saving Private Ryan (1998)                               2653
Terminator 2: Judgment Day (1991)                        2649
Matrix, The (1999)                                       2590
Back to the Future (1985)                                2583
Silence of the Lambs, The (1991)                         2578
Men in Black (1997)                                      2538
Raiders of the Lost Ark (1981)                           2514
Fargo (1996)                                             2513
Sixth Sense, The (1999)                                  2459
Braveheart (1995)                                        2443
Shakespeare in Love (1998)                               2369
Princess

In [28]:
# Titles that received at least MIN_RATINGS ratings. The comparison is
# inclusive (>=), so a title with exactly MIN_RATINGS ratings is kept.
# `indexer` is the Index of those titles, used to filter later results.
MIN_RATINGS = 150
indexer = title_counts[title_counts >= MIN_RATINGS].index
indexer

Index(['American Beauty (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Star Wars: Episode VI - Return of the Jedi (1983)',
       'Jurassic Park (1993)', 'Saving Private Ryan (1998)',
       'Terminator 2: Judgment Day (1991)', 'Matrix, The (1999)',
       'Back to the Future (1985)', 'Silence of the Lambs, The (1991)',
       ...
       'Alligator (1980)', 'Twelfth Night (1996)', 'Program, The (1993)',
       'Cowboy Way, The (1994)', 'Golden Voyage of Sinbad, The (1974)',
       'Tales from the Crypt Presents: Bordello of Blood (1996)',
       'In the Army Now (1994)', 'Love and Death on Long Island (1997)',
       'Asphalt Jungle, The (1950)',
       'Police Academy 5: Assignment: Miami Beach (1988)'],
      dtype='object', length=1683)

In [29]:
# Mean ratings of the movies with at least 150 ratings
rating_by_title.loc[indexer]

American Beauty (1999)                                     4.317386
Star Wars: Episode IV - A New Hope (1977)                  4.453694
Star Wars: Episode V - The Empire Strikes Back (1980)      4.292977
Star Wars: Episode VI - Return of the Jedi (1983)          4.022893
Jurassic Park (1993)                                       3.763847
                                                             ...   
Tales from the Crypt Presents: Bordello of Blood (1996)    2.589404
In the Army Now (1994)                                     2.225166
Love and Death on Long Island (1997)                       3.430464
Asphalt Jungle, The (1950)                                 3.927152
Police Academy 5: Assignment: Miami Beach (1988)           1.766667
Name: rating, Length: 1683, dtype: float64

In [30]:
# Top 20 mean ratings among movies with at least 150 ratings
(
    rating_by_title
    .loc[indexer]
    .sort_values(ascending = False)
    .head(20)
)

Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)            4.560510
Shawshank Redemption, The (1994)                                               4.554558
Godfather, The (1972)                                                          4.524966
Close Shave, A (1995)                                                          4.520548
Usual Suspects, The (1995)                                                     4.517106
Schindler's List (1993)                                                        4.510417
Wrong Trousers, The (1993)                                                     4.507937
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)                                  4.491489
Raiders of the Lost Ark (1981)                                                 4.477725
Rear Window (1954)                                                             4.476190
Paths of Glory (1957)                                                          4.473913
Star Wars: Episode IV - A New Ho

In [32]:
# Bottom 10 mean ratings among movies with at least 150 ratings
(
    rating_by_title
    .loc[indexer]
    .sort_values()
    .head(10)
)

Battlefield Earth (2000)                            1.611111
Baby Geniuses (1999)                                1.701220
Police Academy 5: Assignment: Miami Beach (1988)    1.766667
Stop! Or My Mom Will Shoot (1992)                   1.782123
Jaws 3-D (1983)                                     1.852381
Speed 2: Cruise Control (1997)                      1.871935
Spice World (1997)                                  1.873684
Super Mario Bros. (1993)                            1.874286
Home Alone 3 (1997)                                 1.876623
Superman IV: The Quest for Peace (1987)             1.888554
Name: rating, dtype: float64

In [33]:
# Per-gender mean ratings, restricted to the movies with at least 150 ratings.
rating_by_title_gender.loc[indexer]

gender,F,M
American Beauty (1999),4.238901,4.347301
Star Wars: Episode IV - A New Hope (1977),4.302937,4.495307
Star Wars: Episode V - The Empire Strikes Back (1980),4.106481,4.344577
Star Wars: Episode VI - Return of the Jedi (1983),3.865237,4.069058
Jurassic Park (1993),3.579407,3.814197
...,...,...
Tales from the Crypt Presents: Bordello of Blood (1996),2.727273,2.565891
In the Army Now (1994),2.384615,2.192000
Love and Death on Long Island (1997),3.116279,3.555556
"Asphalt Jungle, The (1950)",3.571429,4.008130


In [35]:
# Among movies with at least 150 ratings: the 10 with the highest mean
# rating from female users
(
    rating_by_title_gender
    .loc[indexer]
    .sort_values(by = 'F', ascending = False)
    .head(10)
)

gender,F,M
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
"General, The (1927)",4.575758,4.32948
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075
Schindler's List (1993),4.562602,4.491415
"Grand Illusion (Grande illusion, La) (1937)",4.560976,4.266129
"Shawshank Redemption, The (1994)",4.539075,4.560625
"Grand Day Out, A (1992)",4.537879,4.293255
To Kill a Mockingbird (1962),4.536667,4.372611


In [36]:
# Among movies with at least 150 ratings: the 10 with the highest mean
# rating from male users
(
    rating_by_title_gender
    .loc[indexer]
    .sort_values(by = 'M', ascending = False)
    .head(10)
)

gender,F,M
"Godfather, The (1972)",4.3147,4.583333
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.481132,4.576628
"Shawshank Redemption, The (1994)",4.539075,4.560625
Raiders of the Lost Ark (1981),4.332168,4.520597
"Usual Suspects, The (1995)",4.513317,4.518248
Star Wars: Episode IV - A New Hope (1977),4.302937,4.495307
Schindler's List (1993),4.562602,4.491415
Paths of Glory (1957),4.392857,4.485149
"Wrong Trousers, The (1993)",4.588235,4.478261
"Close Shave, A (1995)",4.644444,4.473795


## Preference difference by gender

In [38]:
# Derived column 'diff' = F - M: positive when female users rate the title
# higher on average, negative when male users do, NaN when either mean is
# missing.
rating_by_title_gender['diff'] = rating_by_title_gender['F'].sub(rating_by_title_gender['M'])

In [39]:
# Show the table again, now including the derived 'diff' (F - M) column.
rating_by_title_gender

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"$1,000,000 Duck (1971)",3.375000,2.761905,0.613095
'Night Mother (1986),3.388889,3.352941,0.035948
'Til There Was You (1997),2.675676,2.733333,-0.057658
"'burbs, The (1989)",2.793478,2.962085,-0.168607
...And Justice for All (1979),3.828571,3.689024,0.139547
...,...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952,0.119048
Zero Effect (1998),3.864407,3.723140,0.141266
Zero Kelvin (Kjærlighetens kjøtere) (1995),,3.500000,
Zeus and Roxanne (1997),2.777778,2.357143,0.420635


In [41]:
# Among movies with at least 150 ratings: the 10 with the largest positive
# F - M difference, i.e. rated most above the male mean by female users
(
    rating_by_title_gender
    .loc[indexer]
    .sort_values(by = 'diff', ascending = False)
    .head(10)
)

gender,F,M,diff
Dirty Dancing (1987),3.790378,2.959596,0.830782
Home Alone 3 (1997),2.486486,1.683761,0.802726
"To Wong Foo, Thanks for Everything! Julie Newmar (1995)",3.486842,2.795276,0.691567
Jumpin' Jack Flash (1986),3.254717,2.578358,0.676359
Dracula: Dead and Loving It (1995),2.892857,2.25,0.642857
Grease (1978),3.975265,3.367041,0.608224
Police Academy 4: Citizens on Patrol (1987),2.40625,1.802817,0.603433
Brokedown Palace (1999),3.3125,2.723577,0.588923
"Relic, The (1997)",3.309524,2.723077,0.586447
Angels in the Outfield (1994),3.1625,2.580838,0.581662


In [42]:
# Among movies with at least 150 ratings: the 10 with the most negative
# F - M difference, i.e. rated most above the female mean by male users
(
    rating_by_title_gender
    .loc[indexer]
    .sort_values(by = 'diff')
    .head(10)
)

gender,F,M,diff
Lifeforce (1985),2.25,2.994152,-0.744152
Quest for Fire (1981),2.578947,3.309677,-0.73073
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,-0.726351
No Escape (1994),2.3,2.994048,-0.694048
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,-0.676359
Tora! Tora! Tora! (1970),3.090909,3.737705,-0.646796
Up in Smoke (1978),2.944444,3.585227,-0.640783
Dumb & Dumber (1994),2.697987,3.336595,-0.638608
Friday the 13th: The Final Chapter (1984),1.636364,2.258503,-0.62214
"Longest Day, The (1962)",3.411765,4.031447,-0.619682


In [43]:
# Among movies with at least 150 ratings, take the 50 with the largest
# positive F - M difference (rated higher by female users than by male
# users). Their genres are tallied afterwards to find the top 5 genres.
diff_female_top50 = (
    rating_by_title_gender
    .loc[indexer]
    .sort_values(by = 'diff', ascending = False)
    .head(50)
)
diff_female_top50.index

Index(['Dirty Dancing (1987)', 'Home Alone 3 (1997)',
       'To Wong Foo, Thanks for Everything! Julie Newmar (1995)',
       'Jumpin' Jack Flash (1986)', 'Dracula: Dead and Loving It (1995)',
       'Grease (1978)', 'Police Academy 4: Citizens on Patrol (1987)',
       'Brokedown Palace (1999)', 'Relic, The (1997)',
       'Angels in the Outfield (1994)', 'Little Women (1994)',
       'Son in Law (1993)', 'Other Sister, The (1999)',
       'Steel Magnolias (1989)', 'Mirror Has Two Faces, The (1996)',
       'Anastasia (1997)', 'Rocky Horror Picture Show, The (1975)',
       'Santa Claus: The Movie (1985)', 'Color Purple, The (1985)',
       'Nell (1994)', 'Waiting to Exhale (1995)', 'Suspicion (1941)',
       'Baby Geniuses (1999)', 'Wing Commander (1999)',
       'Age of Innocence, The (1993)', 'Free Willy (1993)',
       'French Kiss (1995)', 'Gigi (1958)', 'Affair to Remember, An (1957)',
       'Little Shop of Horrors, The (1960)', '200 Cigarettes (1999)',
       'Guys and Dolls 

In [44]:
# Re-index the movies table by title so that rows can be selected by title
# with .loc; movie_id and genres remain as columns.
movies_with_index = movies.set_index('title')
movies_with_index

Unnamed: 0_level_0,movie_id,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),1,Animation|Children's|Comedy
Jumanji (1995),2,Adventure|Children's|Fantasy
Grumpier Old Men (1995),3,Comedy|Romance
Waiting to Exhale (1995),4,Comedy|Drama
Father of the Bride Part II (1995),5,Comedy
...,...,...
Meet the Parents (2000),3948,Comedy
Requiem for a Dream (2000),3949,Drama
Tigerland (2000),3950,Drama
Two Family House (2000),3951,Drama


In [45]:
# Look up movie_id and genres for the selected top-50 titles, using the
# title index of diff_female_top50 as the row labels.
diff_female_movies = movies_with_index.loc[diff_female_top50.index]

In [46]:
# Preview the first five selected movies with their genres.
diff_female_movies.head()

Unnamed: 0,movie_id,genres
Dirty Dancing (1987),1088,Musical|Romance
Home Alone 3 (1997),1707,Children's|Comedy
"To Wong Foo, Thanks for Everything! Julie Newmar (1995)",203,Comedy
Jumpin' Jack Flash (1986),2468,Action|Comedy|Romance|Thriller
Dracula: Dead and Loving It (1995),12,Comedy|Horror


In [47]:
# Each movie's 'genres' value is a '|'-separated string. Flatten all of them
# into one Series with one entry per (movie, genre), then count how often
# each genre occurs and show the five most frequent.
diff_female_genres = pd.Series([
    genre
    for genre_string in diff_female_movies['genres']
    for genre in genre_string.split(sep = '|')
])
diff_female_genres.value_counts().head()

Comedy        23
Drama         17
Romance       15
Musical       10
Children's     7
dtype: int64