## WEEK 05
## Encounter - Data Modeling
## 'movielens' data profiling

### 1. Tags

In [1]:
import pandas as pd

In [2]:
tags = pd.read_csv('../data/tags.csv')
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3675,606,7382,for katie,1171234019
3676,606,7936,austere,1173392334
3677,610,3265,gun fu,1493843984
3678,610,3265,heroic bloodshed,1493843978


In [3]:
tags.isnull().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [4]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3680 entries, 0 to 3679
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3680 non-null   int64 
 1   movieId    3680 non-null   int64 
 2   tag        3680 non-null   object
 3   timestamp  3680 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.1+ KB


In [5]:
tags_dups = tags.groupby(by=['userId', 'movieId','tag']).count()
tags_dups

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,timestamp
userId,movieId,tag,Unnamed: 3_level_1
2,60756,Highly quotable,1
2,60756,funny,1
2,60756,will ferrell,1
2,89774,Boxing story,1
2,89774,MMA,1
...,...,...,...
606,7382,for katie,1
606,7936,austere,1
610,3265,gun fu,1
610,3265,heroic bloodshed,1


In [6]:
# test if ('userId', 'movieId', 'tag') is PK for 'Tags' dataset
# (defines a record uniquly)
tags_dups['timestamp'].nunique()

# Answer is 'YES' as we have only 1 record (count()) for each combination under groupby()

1

In [7]:
tags.columns

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

In [8]:
tags.describe()

Unnamed: 0,userId,movieId,timestamp
count,3680.0,3680.0,3680.0
mean,431.038587,27146.562772,1319865000.0
std,158.489644,43351.076604,172072700.0
min,2.0,1.0,1137179000.0
25%,424.0,1261.5,1137521000.0
50%,474.0,4451.0,1269833000.0
75%,477.0,38973.0,1498457000.0
max,610.0,193565.0,1537099000.0


In [9]:
tag_len = tags['tag'].str.len()
tag_len.max()

85

In [10]:
# checking for alternate PK
tags_pk = tags.groupby(by=['userId', 'movieId','timestamp']).count()
tags_pk

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tag
userId,movieId,timestamp,Unnamed: 3_level_1
2,60756,1445714992,1
2,60756,1445714994,1
2,60756,1445714996,1
2,89774,1445715200,1
2,89774,1445715205,1
...,...,...,...
606,7382,1171234019,1
606,7936,1173392334,1
610,3265,1493843978,1
610,3265,1493843984,1


In [11]:
tags_pk['tag'].nunique()

5

In [12]:
tags_pk['tag'].unique()
# it means that combination ('userId', 'movieId','timestamp') could n ot be used as a PK

array([1, 2, 3, 4, 5], dtype=int64)

### 2. Links

In [13]:
links = pd.read_csv('../data/links.csv')
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862
...,...,...,...
9717,193581,5476944,432131
9718,193583,5914996,445030
9719,193585,6397426,479308
9720,193587,8391976,483455


In [14]:
# checking for NULLs
links.isnull().sum()
# No NULLs

movieId    0
imdbId     0
tmdbId     0
dtype: int64

In [15]:
# data types for columns
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9722 entries, 0 to 9721
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   movieId  9722 non-null   int64
 1   imdbId   9722 non-null   int64
 2   tmdbId   9722 non-null   int64
dtypes: int64(3)
memory usage: 228.0 KB


In [16]:
# min and max values
links.describe()

Unnamed: 0,movieId,imdbId,tmdbId
count,9722.0,9722.0,9722.0
mean,42069.219914,674140.2,54769.048035
std,52037.308455,1101341.0,92989.237112
min,1.0,417.0,2.0
25%,3246.25,95175.25,9662.25
50%,7283.5,166951.5,16463.5
75%,76048.0,804119.8,44036.5
max,193609.0,8391976.0,525662.0


In [17]:
# checking if 'movie_id' is a PK
links['movieId'].nunique()
# the same as total -> is a PK

9722

In [18]:
links.columns

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

In [19]:
# checking if 'imdbId' is unique
links['imdbId'].nunique()
# YES

9722

In [20]:
# checking if 'tmdbId' is unique
links['tmdbId'].nunique()
# NO!!


9721

In [21]:
# tmdbId dup:
links_dups = links.groupby(by='tmdbId').count()
mask = links_dups['movieId'] > 1
links_dups[mask]

Unnamed: 0_level_0,movieId,imdbId
tmdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
4912,2,2


### 3. Movies

In [22]:
movies = pd.read_csv('../data/movies.csv')
movies

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
9724,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017
9725,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,2017
9726,193585,Flint,Drama,2017
9727,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,2018


In [23]:
# checking for NULLs
movies.isnull().sum()
# no NULLs

movieId    0
title      0
genres     0
year       0
dtype: int64

In [40]:
# checking data types
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9729 entries, 0 to 9728
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9729 non-null   int64 
 1   title    9729 non-null   object
 2   genres   9729 non-null   object
 3   year     9729 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 304.2+ KB


In [25]:
# getting min and max numeric values
movies.describe()
#movies.describe(include='O')

Unnamed: 0,movieId,year
count,9729.0,9729.0
mean,42054.09004,1994.613629
std,52026.56424,18.535219
min,1.0,1902.0
25%,3244.0,1988.0
50%,7282.0,1999.0
75%,76030.0,2008.0
max,193609.0,2018.0


In [26]:
movies.columns

Index(['movieId', 'title', 'genres', 'year'], dtype='object')

In [27]:
# max length of 'title' values
title_len = movies['title'].str.len()
title_len.max()

152

In [28]:
# max length of 'title' values
genres_len = movies['genres'].str.len()
genres_len.max()

77

In [29]:
# getting PK (movieId)
movies['movieId'].nunique()
# the same as total -> 'movieId' is a PK!

9729

In [30]:
# checking if 'title' is also unique
movies['title'].nunique()
# not at all

9448

In [31]:
# subset of films with the same titles
movies_dups = movies.groupby(by='title').count()
#movies_dups
mask_movies = movies_dups['movieId'] > 1
movies_dups[mask_movies][['movieId']]

Unnamed: 0_level_0,movieId
title,Unnamed: 1_level_1
12 Angry Men,2
12 Chairs,2
"20,000 Leagues Under the Sea",2
3:10 to Yuma,2
Alfie,2
...,...
Wonderland,2
Wuthering Heights,2
"Yours, Mine and Ours",2
Zoom,2


In [32]:
# unique genres?
# TBD

### 4. Ratings

In [33]:
ratings = pd.read_csv('../data/ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100813,610,166534,4.0,1493848402
100814,610,168248,5.0,1493850091
100815,610,168250,5.0,1494273047
100816,610,168252,5.0,1493846352


In [34]:
# checking for NULLs
ratings.isnull().sum()
# no NULLs

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [35]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100818 entries, 0 to 100817
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100818 non-null  int64  
 1   movieId    100818 non-null  int64  
 2   rating     100818 non-null  float64
 3   timestamp  100818 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [36]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100818.0,100818.0,100818.0,100818.0
mean,326.128717,19412.910552,3.501532,1205892000.0
std,182.62011,35490.816132,1.042469,216241400.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1018665000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8044.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [37]:
# how many decimals has float 'rating'
ratings['rating_str'] = ratings['rating'].astype(object)
ratings

Unnamed: 0,userId,movieId,rating,timestamp,rating_str
0,1,1,4.0,964982703,4.0
1,1,3,4.0,964981247,4.0
2,1,6,4.0,964982224,4.0
3,1,47,5.0,964983815,5.0
4,1,50,5.0,964982931,5.0
...,...,...,...,...,...
100813,610,166534,4.0,1493848402,4.0
100814,610,168248,5.0,1493850091,5.0
100815,610,168250,5.0,1494273047,5.0
100816,610,168252,5.0,1493846352,5.0


In [38]:
# 
ratings.info()

#ratings_len = ratings['rating_str'].str.len()
#ratings_len.max()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100818 entries, 0 to 100817
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   userId      100818 non-null  int64  
 1   movieId     100818 non-null  int64  
 2   rating      100818 non-null  float64
 3   timestamp   100818 non-null  int64  
 4   rating_str  100818 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 3.8+ MB


In [54]:
# Primary Key for 'ratings'
# are there any users who gave a few ratings for the same movie? NO
# userId+movieId are suuposed to be PK
# TRUE
ratings_pk = ratings.groupby(by=['userId', 'movieId']).count()
ratings_pk

ratings_mask = ratings_pk['rating'] > 1
ratings_pk[ratings_mask][['rating']]

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,movieId,Unnamed: 2_level_1


In [51]:
# list of unique values gives us NUMBER OF DECIMALS!
ratings['rating_str'].unique()

array([4.0, 5.0, 3.0, 2.0, 1.0, 4.5, 3.5, 2.5, 0.5, 1.5], dtype=object)

In [39]:
# gives error
ratings_len = ratings['rating_str'].str.len()
ratings_len.max()

AttributeError: Can only use .str accessor with string values!

Links:
    movieId,imdbId,tmdbId
    1,114709,862

Movies:
    movieId,title,genres,year
    1,Toy Story ,Adventure|Animation|Children|Comedy|Fantasy,1995

Ratings:
    userId,movieId,rating,timestamp
    1,1,4.0,964982703