# Import the necessary libraries

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
# Text encoding handed to each pd.read_csv call that loads a .dat file
encoding = 'latin1'

# Load the user data and name its columns

In [10]:
# users.dat has no header row and separates fields with '::', so the column
# names are supplied explicitly.
userDataPath = os.path.expanduser('MovielensData/users.dat')
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# A separator longer than one character is only handled by pandas' Python
# parser; requesting it explicitly yields the same frame without the warning
# pandas raises when it has to fall back from the default C parser.
users = pd.read_csv(
    userDataPath,
    sep='::',
    header=None,
    names=unames,
    encoding=encoding,
    engine='python',
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# Confirm the column labels assigned to the users frame
users.columns

Index(['user_id', 'gender', 'age', 'occupation', 'zip'], dtype='object')

In [12]:
# Summary statistics for the numeric columns of users
users.describe()

Unnamed: 0,user_id,age,occupation
count,6040.0,6040.0,6040.0
mean,3020.5,30.639238,8.146854
std,1743.742145,12.895962,6.329511
min,1.0,1.0,0.0
25%,1510.75,25.0,3.0
50%,3020.5,25.0,7.0
75%,4530.25,35.0,14.0
max,6040.0,56.0,20.0


# Ratings data set

In [23]:
# Column dtypes, non-null counts and memory footprint of users
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
user_id       6040 non-null int64
gender        6040 non-null object
age           6040 non-null int64
occupation    6040 non-null int64
zip           6040 non-null object
dtypes: int64(3), object(2)
memory usage: 236.0+ KB


In [19]:
# ratings.dat has no header row and separates fields with '::', so the column
# names are supplied explicitly.
ratingDataPath = os.path.expanduser('MovielensData/ratings.dat')
rnames = ['user_id','movie_id', 'Rating', 'timestamp']
# A separator longer than one character is only handled by pandas' Python
# parser; requesting it explicitly yields the same frame without the warning
# pandas raises when it has to fall back from the default C parser.
ratings = pd.read_csv(
    ratingDataPath,
    sep='::',
    header=None,
    names=rnames,
    encoding=encoding,
    engine='python',
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
# Confirm the column labels assigned to the ratings frame
ratings.columns

Index(['user_id', 'movie_id', 'Rating', 'timestamp'], dtype='object')

In [21]:
# Summary statistics for the numeric columns of ratings
ratings.describe()

Unnamed: 0,user_id,movie_id,Rating,timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1030.0,3.0,965302600.0
50%,3070.0,1835.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975220900.0
max,6040.0,3952.0,5.0,1046455000.0


In [22]:
# Column dtypes, non-null counts and memory footprint of ratings
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
user_id      1000209 non-null int64
movie_id     1000209 non-null int64
Rating       1000209 non-null int64
timestamp    1000209 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB


# Movie Data Set

In [24]:
# movies.dat has no header row and separates fields with '::', so the column
# names are supplied explicitly.
movieDataPath = os.path.expanduser('MovielensData/movies.dat')
mnames = ['movie_id', 'title', 'genres']
# A separator longer than one character is only handled by pandas' Python
# parser; requesting it explicitly yields the same frame without the warning
# pandas raises when it has to fall back from the default C parser.
movies = pd.read_csv(
    movieDataPath,
    sep='::',
    header=None,
    names=mnames,
    encoding=encoding,
    engine='python',
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
# Column dtypes, non-null counts and memory footprint of movies
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
movie_id    3883 non-null int64
title       3883 non-null object
genres      3883 non-null object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [26]:
# Summary statistics for movies; only movie_id is numeric, so it is the only column summarised
movies.describe()

Unnamed: 0,movie_id
count,3883.0
mean,1986.049446
std,1146.778349
min,1.0
25%,982.5
50%,2010.0
75%,2980.5
max,3952.0


In [31]:
# Peek at the first five movies (five is the default row count for head)
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [28]:
# Peek at the first rows of ratings
ratings.head()

Unnamed: 0,user_id,movie_id,Rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [44]:
# Plain-text preview of the first five rows of each raw table, one after another
print(users.head(5), movies.head(5), ratings.head(5), sep='\n')

   user_id gender  age  occupation    zip
0        1      F    1          10  48067
1        2      M   56          16  70072
2        3      M   25          15  55117
3        4      M   45           7  02460
4        5      M   25          20  55455
   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
   user_id  movie_id  Rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291


# Combining the data

In [46]:
# Attach each rating to the profile of the user who gave it (join on user_id)
URData = users.merge(ratings, on='user_id')

In [47]:
# Plain-text preview of the first five user/rating rows
print(URData.head(5))

   user_id gender  age  occupation    zip  movie_id  Rating  timestamp
0        1      F    1          10  48067      1193       5  978300760
1        1      F    1          10  48067       661       3  978302109
2        1      F    1          10  48067       914       3  978301968
3        1      F    1          10  48067      3408       4  978300275
4        1      F    1          10  48067      2355       5  978824291


In [48]:
# Plain-text preview of the first five movies before joining them in
print(movies.head(5))

   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy


In [49]:
# Add title and genres to every user/rating row (join on movie_id)
masterData = URData.merge(movies, on='movie_id')

In [50]:
# Peek at the first rows of the fully merged frame
masterData.head()

Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,Rating,timestamp,title,genres
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


# Combining the data in one line

In [51]:
# Same two joins chained in one expression: ratings -> users on user_id,
# then -> movies on movie_id. Ratings is the left-most frame here, so its
# columns come first in the result.
combineData = (
    ratings
    .merge(users, on='user_id')
    .merge(movies, on='movie_id')
)

In [52]:
# Peek at the first rows of the one-line merge result
combineData.head()

Unnamed: 0,user_id,movie_id,Rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


# Explore the datasets using visual representations (graphs or tables), and include your comments on the following:
# 1. User Age Distribution

In [62]:
# Column dtypes, non-null counts and memory footprint of the merged frame
combineData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
user_id       1000209 non-null int64
movie_id      1000209 non-null int64
Rating        1000209 non-null int64
timestamp     1000209 non-null int64
gender        1000209 non-null object
age           1000209 non-null int64
occupation    1000209 non-null int64
zip           1000209 non-null object
title         1000209 non-null object
genres        1000209 non-null object
dtypes: int64(6), object(4)
memory usage: 83.9+ MB


In [63]:
# Column labels available in the merged frame
combineData.columns

Index(['user_id', 'movie_id', 'Rating', 'timestamp', 'gender', 'age',
       'occupation', 'zip', 'title', 'genres'],
      dtype='object')

In [56]:
# Bucket the merged rows by age code.
# NOTE(review): combineData holds one row per rating, so group sizes here count
# ratings per age code, not distinct users; group `users` by 'age' instead if a
# per-user distribution is what is wanted.
ageDistribution = combineData.groupby(by='age')

In [60]:
# Number of rows in each age group
ageDistribution.size()

age
1      27211
18    183536
25    395556
35    199003
45     83633
50     72490
56     38780
dtype: int64

In [76]:
# Ratings given by the user with user_id 2696 to every movie that user reviewed.
# A single .loc call selects the matching rows and the Rating column together.
combineData.loc[combineData['user_id'] == 2696, 'Rating']

24345     2
29848     3
244232    4
250014    5
273633    1
277808    4
371178    4
377250    3
598042    4
603189    4
609204    4
611956    2
612552    4
613486    4
616546    4
618708    4
621101    1
689379    4
697451    2
777089    3
Name: Rating, dtype: int64