# Movie data set

data source: https://grouplens.org/datasets/movielens/

In [1]:
import pandas as pd
import numpy as np

# Users table

In [2]:
# The csv does not have the header line, but README has this information

cols = ['user_id', 'age', 'gender', 'occupation',  'zip_code' ]

users = pd.read_csv('ml-100k/u.user', sep='|', names=cols)
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


# Rating table

In [3]:
cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=cols)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## converting unix time stamp

In [4]:
import datetime
datetime.datetime.fromtimestamp(881250949)

datetime.datetime(1997, 12, 4, 7, 55, 49)

In [5]:
ratings.unix_timestamp.apply(datetime.datetime.fromtimestamp)

0       1997-12-04 07:55:49
1       1998-04-04 12:22:22
2       1997-11-06 23:18:36
3       1997-11-26 21:02:03
4       1998-02-01 21:33:16
5       1998-01-07 06:20:06
6       1997-12-03 09:51:28
7       1998-04-03 11:34:27
8       1998-02-01 01:20:17
9       1997-12-31 13:16:53
10      1997-11-12 14:07:14
11      1997-11-17 07:38:45
12      1997-10-05 02:05:40
13      1998-03-27 14:59:54
14      1998-02-21 15:40:57
15      1997-11-13 21:28:38
16      1997-11-11 09:47:39
17      1997-11-14 12:36:34
18      1997-09-21 02:42:24
19      1998-04-08 16:47:17
20      1998-01-30 08:13:34
21      1998-04-16 07:54:12
22      1997-10-26 08:55:20
23      1997-09-21 02:24:38
24      1998-02-17 09:28:52
25      1997-11-10 13:16:06
26      1998-04-12 18:14:54
27      1998-01-02 05:40:50
28      1997-10-01 16:10:01
29      1997-10-14 13:33:05
                ...        
99970   1997-11-19 09:12:53
99971   1997-10-05 00:38:41
99972   1997-10-17 18:17:27
99973   1997-09-20 15:01:29
99974   1997-10-07 0

# Movie table

In [6]:
cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', usecols=range(5), names=cols, encoding='latin-1')
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [7]:
movies.tail()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998)
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...
1681,1682,Scream of Stone (Schrei aus Stein) (1991),08-Mar-1996,,http://us.imdb.com/M/title-exact?Schrei%20aus%...


In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 5 columns):
movie_id              1682 non-null int64
title                 1682 non-null object
release_date          1681 non-null object
video_release_date    0 non-null float64
imdb_url              1679 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 65.8+ KB


In [9]:
movies.dtypes

movie_id                int64
title                  object
release_date           object
video_release_date    float64
imdb_url               object
dtype: object

In [10]:
users.describe()

Unnamed: 0,user_id,age
count,943.0,943.0
mean,472.0,34.051962
std,272.364951,12.19274
min,1.0,7.0
25%,236.5,25.0
50%,472.0,31.0
75%,707.5,43.0
max,943.0,73.0


# selecting

In [11]:
users.columns

Index(['user_id', 'age', 'gender', 'occupation', 'zip_code'], dtype='object')

In [12]:
mycols = ['user_id', 'occupation', 'gender']
users[mycols].head()

Unnamed: 0,user_id,occupation,gender
0,1,technician,M
1,2,other,F
2,3,writer,M
3,4,technician,M
4,5,other,F


Users older than 25

In [13]:
users[ users.age > 25].shape

(671, 5)

In [14]:
users[ users.age > 25].count()

user_id       671
age           671
gender        671
occupation    671
zip_code      671
dtype: int64

Male users older than 25

In [15]:
users[  (users.gender == 'M') & (users.age > 25) ].count()

user_id       474
age           474
gender        474
occupation    474
zip_code      474
dtype: int64

# Merging

In [16]:
movies.shape

(1682, 5)

In [17]:
ratings.shape

(100000, 4)

In [18]:
movie_ratings = pd.merge(movies, ratings)

In [19]:
movie_ratings.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,user_id,rating,unix_timestamp
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,308,4,887736532
1,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,287,5,875334088
2,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,148,4,877019411
3,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,280,4,891700426
4,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,66,3,883601324


In [20]:
movie_ratings.shape

(100000, 8)

In [21]:
lens = pd.merge(movie_ratings, users)

In [22]:
lens.shape

(100000, 12)

In [23]:
lens.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,user_id,rating,unix_timestamp,age,gender,occupation,zip_code
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,308,4,887736532,60,M,retired,95076
1,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,308,5,887737890,60,M,retired,95076
2,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),308,4,887739608,60,M,retired,95076
3,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,308,4,887738847,60,M,retired,95076
4,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),308,5,887736696,60,M,retired,95076


In [24]:
lens.groupby('title').size()

title
'Til There Was You (1997)                                     9
1-900 (1994)                                                  5
101 Dalmatians (1996)                                       109
12 Angry Men (1957)                                         125
187 (1997)                                                   41
2 Days in the Valley (1996)                                  93
20,000 Leagues Under the Sea (1954)                          72
2001: A Space Odyssey (1968)                                259
3 Ninjas: High Noon At Mega Mountain (1998)                   5
39 Steps, The (1935)                                         59
8 1/2 (1963)                                                 38
8 Heads in a Duffel Bag (1997)                                4
8 Seconds (1994)                                              4
A Chef in Love (1996)                                         8
Above the Rim (1994)                                          5
Absolute Power (1997)             

In [25]:
# for example - the group size of "Zeus and Roxanne (1997)" is 6

lens[ lens.title == 'Zeus and Roxanne (1997)']

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,user_id,rating,unix_timestamp,age,gender,occupation,zip_code
1553,1164,Zeus and Roxanne (1997),10-Jan-1997,,http://us.imdb.com/M/title-exact?Zeus%20and%20...,181,3,878962464,26,M,executive,21218
26511,1164,Zeus and Roxanne (1997),10-Jan-1997,,http://us.imdb.com/M/title-exact?Zeus%20and%20...,82,2,878768790,50,M,programmer,22902
35030,1164,Zeus and Roxanne (1997),10-Jan-1997,,http://us.imdb.com/M/title-exact?Zeus%20and%20...,798,3,875637744,40,F,writer,64131
47011,1164,Zeus and Roxanne (1997),10-Jan-1997,,http://us.imdb.com/M/title-exact?Zeus%20and%20...,792,3,877910629,40,M,programmer,12205
49321,1164,Zeus and Roxanne (1997),10-Jan-1997,,http://us.imdb.com/M/title-exact?Zeus%20and%20...,463,1,877385797,48,F,healthcare,75218
57371,1164,Zeus and Roxanne (1997),10-Jan-1997,,http://us.imdb.com/M/title-exact?Zeus%20and%20...,881,1,876537106,39,M,marketing,43017


In [26]:
# The top 10

lens.groupby('title').size().sort_values(ascending=False)[:10]

title
Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
dtype: int64

In [27]:
# another way

lens.title.value_counts()[:10]

Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
Name: title, dtype: int64

In [28]:
movie_stats = lens.groupby('title').agg({'rating': [np.size, np.mean]})
movie_stats

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
'Til There Was You (1997),9,2.333333
1-900 (1994),5,2.600000
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344000
187 (1997),41,3.024390
2 Days in the Valley (1996),93,3.225806
"20,000 Leagues Under the Sea (1954)",72,3.500000
2001: A Space Odyssey (1968),259,3.969112
3 Ninjas: High Noon At Mega Mountain (1998),5,1.000000
"39 Steps, The (1935)",59,4.050847


In [29]:
movie_stats.sort_values([('rating', 'mean')], ascending=False).head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
They Made Me a Criminal (1939),1,5.0
Marlene Dietrich: Shadow and Light (1996),1,5.0
"Saint of Fort Washington, The (1993)",2,5.0
Someone Else's America (1995),1,5.0
Star Kid (1997),3,5.0
