# MovieLens 1M Dataset

In [None]:
%%bash
# List the contents of the data/movielens directory that the notebook reads from.
ls data/movielens

In [None]:
%%bash
# Peek at the first lines of users.csv to check its header row and the ';' field delimiter.
head data/movielens/users.csv

userId;gender;age;occupation;zip-code
1;F;1;10;48067
2;M;56;16;70072
3;M;25;15;55117
4;M;45;7;2460
5;M;25;20;55455
6;F;50;9;55117
7;M;35;1;6810
8;M;25;12;11413
9;M;25;17;61614


In [None]:
import pandas as pd

In [None]:
# Load the three ';'-delimited MovieLens tables from the shared data directory.
path = "data/movielens/"
users = pd.read_csv(f"{path}users.csv", sep=";")
ratings = pd.read_csv(f"{path}ratings.csv", sep=";")
# movies.csv is decoded as Latin-1 rather than the default encoding.
movies = pd.read_csv(f"{path}movies.csv", sep=";", encoding="latin")

In [None]:
# Show the first rows of each table, followed by a separator line.
for df in (users, ratings, movies):
    print(df.head(), "++++++++++++++++++++", sep="\n")

   userId gender  age  occupation zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7     2460
4       5      M   25          20    55455
++++++++++++++++++++
   userId  movieId  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
++++++++++++++++++++
   movieId                               title                        genres  \
0        1                    Toy Story (1995)   Animation|Children's|Comedy   
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy   
2        3             Grumpier Old Men (1995)                Comedy|Romance   
3        4            Waiting to Exhale (1995)                  Comedy|Drama   
4        5  Father of the Bride Part II (1995)                        Co

In [None]:
# Build one flat table: each rating row enriched with its user's and its movie's attributes.
# The join keys are spelled out instead of being inferred from whichever column names the
# frames happen to share, so an extra shared column cannot silently change the join.
data = (
    ratings
    .merge(users, on="userId", how="inner")
    .merge(movies, on="movieId", how="inner")
)

In [None]:
# Remove the unnamed extra column (presumably from a trailing ';' in movies.csv; verify
# against the file). Rebinding with errors="ignore" keeps this cell safe to re-run:
# the in-place drop raised KeyError the second time it was executed.
data = data.drop(columns="Unnamed: 3", errors="ignore")

In [None]:
# First five rows of the merged ratings/users/movies table.
data.iloc[:5]

Unnamed: 0,userId,movieId,rating,timestamp,gender,age,occupation,zip-code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [None]:
# Inspect a single merged record (the first row) with all of its columns.
data.iloc[0, :]

userId                                             1
movieId                                         1193
rating                                             5
timestamp                                  978300760
gender                                             F
age                                                1
occupation                                        10
zip-code                                       48067
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 0, dtype: object

In [None]:
# Average rating per title, split into one column per gender value.
mean_ratings = data.pivot_table(
    values="rating",
    index="title",
    columns="gender",
    aggfunc="mean",
)

In [None]:
# Display the per-title mean ratings; NaN marks a title with no ratings from that gender.
mean_ratings

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375000,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024
...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952
Zero Effect (1998),3.864407,3.723140
Zero Kelvin (Kjærlighetens kjøtere) (1995),,3.500000
Zeus and Roxanne (1997),2.777778,2.357143


In [None]:
# Number of rating rows recorded for each title.
ratings_by_title = data.groupby(by="title").size()

In [None]:
# First five per-title rating counts.
ratings_by_title.iloc[:5]

title
$1,000,000 Duck (1971)            37
'Night Mother (1986)              70
'Til There Was You (1997)         52
'burbs, The (1989)               303
...And Justice for All (1979)    199
dtype: int64

In [None]:
# Titles with at least 250 ratings; sparsely rated titles are excluded from later comparisons.
active_titles = ratings_by_title[ratings_by_title >= 250].index

In [None]:
# Print the first 20 active titles, one per line.
for title in active_titles[:20].tolist():
    print(title)

'burbs, The (1989)
10 Things I Hate About You (1999)
101 Dalmatians (1961)
101 Dalmatians (1996)
12 Angry Men (1957)
13th Warrior, The (1999)
2 Days in the Valley (1996)
20,000 Leagues Under the Sea (1954)
2001
2010 (1984)
28 Days (2000)
39 Steps, The (1935)
54 (1998)
7th Voyage of Sinbad, The (1958)
8MM (1999)
About Last Night... (1986)
Absent Minded Professor, The (1961)
Absolute Power (1997)
Abyss, The (1989)
Ace Ventura


In [None]:
# Keep only the rows of the mean-rating table that belong to active titles.
mean_ratings = mean_ratings.loc[active_titles, :]

In [None]:
# Display the mean ratings after restricting them to the active titles.
mean_ratings

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",2.793478,2.962085
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.500000
101 Dalmatians (1996),3.240000,2.911215
12 Angry Men (1957),4.184397,4.328421
...,...,...
Young Guns (1988),3.371795,3.425620
Young Guns II (1990),2.934783,2.904025
Young Sherlock Holmes (1985),3.514706,3.363344
Zero Effect (1998),3.864407,3.723140


In [None]:
# Order titles by the mean rating in the "F" column, highest first.
top_female_ratings = mean_ratings.sort_values("F", ascending=False)

In [None]:
# Ten titles with the highest mean rating in the "F" column.
top_female_ratings.iloc[:10]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589
Wallace & Gromit,4.563107,4.385075
Schindler's List (1993),4.562602,4.491415
"Shawshank Redemption, The (1994)",4.539075,4.560625
"Grand Day Out, A (1992)",4.537879,4.293255
To Kill a Mockingbird (1962),4.536667,4.372611
Creature Comforts (1990),4.513889,4.272277
"Usual Suspects, The (1995)",4.513317,4.518248


In [None]:
# Add a per-title gap column: M mean minus F mean (negative means the F mean is higher).
mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])

In [None]:
# Ascending sort on the gap puts the most negative diffs (F mean above M mean) first.
sorted_by_diff = mean_ratings.sort_values("diff")
sorted_by_diff.iloc[:10]  # Women Preferred Films

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dirty Dancing (1987),3.790378,2.959596,-0.830782
Jumpin' Jack Flash (1986),3.254717,2.578358,-0.676359
Grease (1978),3.975265,3.367041,-0.608224
Little Women (1994),3.870588,3.321739,-0.548849
Steel Magnolias (1989),3.901734,3.365957,-0.535777
Anastasia (1997),3.8,3.281609,-0.518391
"Rocky Horror Picture Show, The (1975)",3.673016,3.160131,-0.512885
"Color Purple, The (1985)",4.158192,3.659341,-0.498851
"Age of Innocence, The (1993)",3.827068,3.339506,-0.487561
Free Willy (1993),2.921348,2.438776,-0.482573


In [None]:
# Reverse the ascending order so the largest positive diffs (M mean above F mean) come first.
sorted_by_diff.iloc[::-1].iloc[:10]  # Men Preferred Films

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,0.726351
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,0.676359
Dumb & Dumber (1994),2.697987,3.336595,0.638608
"Longest Day, The (1962)",3.411765,4.031447,0.619682
"Cable Guy, The (1996)",2.25,2.863787,0.613787
Evil Dead II (Dead By Dawn) (1987),3.297297,3.909283,0.611985
"Hidden, The (1987)",3.137931,3.745098,0.607167
Rocky III (1982),2.361702,2.943503,0.581801
Caddyshack (1980),3.396135,3.969737,0.573602
For a Few Dollars More (1965),3.409091,3.953795,0.544704


In [None]:
# Standard deviation of the ratings for each title, computed over all titles.
rating_std_by_title = data["rating"].groupby(data["title"]).std()

In [None]:
# Restrict the per-title standard deviations to the active titles.
# NOTE(review): the result is stored under `ratings_std_by_title` (with an extra "s"),
# a different name from the unfiltered `rating_std_by_title`, which is left unchanged.
ratings_std_by_title = rating_std_by_title.loc[active_titles]

In [None]:
# Ten titles whose ratings disagree the most, taken from the series filtered to active
# titles (`ratings_std_by_title`). Sorting the unfiltered `rating_std_by_title` instead
# ranks sparsely rated titles first, because a few opposite ratings give a very large std.
ratings_std_by_title.sort_values(ascending=False).head(10)

title
Foreign Student (1994)                                             2.828427
Criminal Lovers (Les Amants Criminels) (1999)                      2.309401
Talk of Angels (1998)                                              2.121320
Tokyo Fist (1995)                                                  2.121320
Paralyzing Fear                                                    2.121320
Better Living (1998)                                               2.121320
Identification of a Woman (Identificazione di una donna) (1982)    2.121320
Neon Bible, The (1995)                                             2.121320
Sunset Park (1996)                                                 2.121320
Living Dead Girl, The (La Morte Vivante) (1982)                    2.121320
Name: rating, dtype: float64