# MovieLens 1M
The MovieLens 1M data set contains about 1 million ratings from roughly 6,000 users on roughly 4,000 movies.

## Prepare the data

In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# Relative locations of the three MovieLens tables (users, ratings, movies)
path_users, path_ratings, path_movies = (
    '../datasets/movielens/users.dat',
    '../datasets/movielens/ratings.dat',
    '../datasets/movielens/movies.dat',
)


In [4]:
# Load the three '::'-delimited tables into DataFrames.
# The files carry no header row, so column names are supplied explicitly.
# A multi-character separator is not supported by pandas' C parser; asking for
# engine='python' up front avoids the ParserWarning raised by the silent fallback.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(path_users, sep='::', header=None, names=unames,
                      engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(path_ratings, sep='::', header=None, names=rnames,
                        engine='python')

# NOTE(review): some titles contain non-ASCII characters that come out garbled
# with the default decoding; the file is presumably ISO-8859-1 (latin-1) -- confirm.
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(path_movies, sep='::', header=None, names=mnames,
                       engine='python', encoding='latin-1')


In [111]:
# NOTE(review): disabled scratch attempts at casting users['user_id'] to an integer
# dtype. The dtype check in this notebook already reports user_id as int64, so these
# casts look unnecessary -- confirm and remove this cell.
#users['user_id'] = users['user_id'].astype(str).astype(int)
#users['user_id'] = users['user_id'].astype(int)
#users.astype({'user_id': 'int32'}).dtypes

In [5]:
# Sanity check: print the column dtypes of each table under a labelled banner
for banner, frame in (('---users---', users),
                      ('---ratings---', ratings),
                      ('---movies---', movies)):
    print('\n', banner)
    print(frame.dtypes)



 ---users---
user_id        int64
gender        object
age            int64
occupation     int64
zip           object
dtype: object

 ---ratings---
user_id      int64
movie_id     int64
rating       int64
timestamp    int64
dtype: object

 ---movies---
movie_id     int64
title       object
genres      object
dtype: object


In [6]:
# Merge the three tables: ratings with users on user_id, then that result with
# movies on movie_id (inner joins, the pandas default).
# The join keys are spelled out because the default joins on *every* column name
# the two frames share, which would silently change if a shared column were added.
data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')
data.dtypes
# data.describe()  # summary statistics; takes a long time to run

user_id        int64
movie_id       int64
rating         int64
timestamp      int64
gender        object
age            int64
occupation     int64
zip           object
title         object
genres        object
dtype: object

In [118]:
# Display the merged frame; the saved output shows only its leading and trailing rows.
data

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western


## Explore the data

In [36]:
# Mean rating of every movie title, split by gender.
# 'rating' is a column of 'data', not to be confused with the separate frame 'ratings'.
# The aggregation is named by string: passing np.mean as aggfunc emits a
# FutureWarning in recent pandas, while 'mean' yields the same result without it.
mean_rating = data.pivot_table(values=['rating'],
                               index=['gender', 'title'],
                               aggfunc={'rating': 'mean'})
mean_rating

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
gender,title,Unnamed: 2_level_1
F,"$1,000,000 Duck (1971)",3.375000
F,'Night Mother (1986),3.388889
F,'Til There Was You (1997),2.675676
F,"'burbs, The (1989)",2.793478
F,...And Justice for All (1979),3.828571
...,...,...
M,"Zed & Two Noughts, A (1985)",3.380952
M,Zero Effect (1998),3.723140
M,Zero Kelvin (Kj�rlighetens kj�tere) (1995),3.500000
M,Zeus and Roxanne (1997),2.357143


In [None]:
# Count how many ratings each title received --be patient--
# .size() turns the GroupBy object into a Series of counts indexed by title;
# without it the object cannot be sliced or compared against a threshold.
ratings_by_title = data.groupby('title').size()
ratings_by_title.head(10)

In [None]:
# Keep only the titles with at least 250 ratings; these are treated as 'active'.
# NOTE(review): expects `ratings_by_title` to be a Series of per-title rating
# counts indexed by title -- confirm the cell that builds it provides that.
active_titles = ratings_by_title[ratings_by_title >= 250].index
active_titles
