# Movie Lens 
The data sets have 1m ratings based on 6000 users, on 4000 movies. 

## Prepare the data

In [76]:
# import libraries
import pandas as pd
import numpy as np

In [77]:
# define data paths
path_users = '../datasets/movielens/users.dat'
path_ratings = '../datasets/movielens/ratings.dat'
path_movies = '../datasets/movielens/movies.dat'


In [78]:
# Load the three tables into dataframes
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(path_users, sep='::', header=None, names=unames)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(path_ratings, sep='::', header=None, names=rnames)

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(path_movies, sep='::', header=None, names=mnames)


In [79]:
#users['user_id'] = users['user_id'].astype(str).astype(int)
#users['user_id'] = users['user_id'].astype(int)
#users.astype({'user_id': 'int32'}).dtypes

In [80]:
# Checks
print('\n','---users---')
print(users.dtypes)
print('\n','---ratings---')
print(ratings.dtypes)
print('\n','---movies---')
print(movies.dtypes)



 ---users---
user_id        int64
gender        object
age            int64
occupation     int64
zip           object
dtype: object

 ---ratings---
user_id      int64
movie_id     int64
rating       int64
timestamp    int64
dtype: object

 ---movies---
movie_id     int64
title       object
genres      object
dtype: object


In [81]:
# Merge the 3 sets, first ratings with users, and then the result with movies
data = pd.merge(pd.merge(ratings, users), movies)
data.dtypes
#data.describe #takes a long time to run

user_id        int64
movie_id       int64
rating         int64
timestamp      int64
gender        object
age            int64
occupation     int64
zip           object
title         object
genres        object
dtype: object

In [82]:
data

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western


## Explore the data

In [110]:
# Explore the mean movie rating, grouped by gender
# 'data' is a pd object that has several methods attached to it
# 'rating' is a column in 'data', not to be confused with the seperate frame 'ratings'

mean_ratings = pd.pivot_table(data, values=['rating'], index=['title'], columns='gender', aggfunc={'rating': np.mean})
mean_ratings 

#pd.pivot_table() is different from what works in the book 

Unnamed: 0_level_0,rating,rating
gender,F,M
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"$1,000,000 Duck (1971)",3.375000,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024
...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952
Zero Effect (1998),3.864407,3.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995),,3.500000
Zeus and Roxanne (1997),2.777778,2.357143


In [84]:
dir(data)
#help(data.groupby)

['T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 

In [93]:
# Explore 'ratings' by 'title'
ratings_by_title = data.groupby('title').size()
ratings_by_title[:10] 

title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64

In [94]:
# Then filter down to movies that received at least 250 ratings, and are considered 'active'
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles

Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)

In [111]:
# Find mean ratings
mean_ratings = mean_ratings.ix[active_titles]
mean_ratings

Unnamed: 0_level_0,rating,rating
gender,F,M
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"'burbs, The (1989)",2.793478,2.962085
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.500000
101 Dalmatians (1996),3.240000,2.911215
12 Angry Men (1957),4.184397,4.328421
...,...,...
Young Guns (1988),3.371795,3.425620
Young Guns II (1990),2.934783,2.904025
Young Sherlock Holmes (1985),3.514706,3.363344
Zero Effect (1998),3.864407,3.723140


In [117]:
# Top films among femals
top_female_ratings = mean_ratings.sort_index(axis=1, ascending=False)
top_female_ratings

Unnamed: 0_level_0,rating,rating
gender,M,F
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"'burbs, The (1989)",2.962085,2.793478
10 Things I Hate About You (1999),3.311966,3.646552
101 Dalmatians (1961),3.500000,3.791444
101 Dalmatians (1996),2.911215,3.240000
12 Angry Men (1957),4.328421,4.184397
...,...,...
Young Guns (1988),3.425620,3.371795
Young Guns II (1990),2.904025,2.934783
Young Sherlock Holmes (1985),3.363344,3.514706
Zero Effect (1998),3.723140,3.864407
