# Movie Analytics

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np


## 2. Reading and Exploring Data

### movies 

In [2]:
# movies name
mnames = ['movie_id', 'title', 'genres']
movies_data = pd.read_table('movies.dat', sep = '::', header = None,  names = mnames, engine = 'python', encoding = 'latin-1')
print('movie shape: ' ,movies_data.shape)
movies_data.info()

movie shape:  (3883, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genres    3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [3]:
# display
movies_data.head(10)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
# check unique movies
unique_movies = movies_data['movie_id'].unique().tolist()
len(unique_movies)

3883

### Ratings

In [5]:
# ratings name
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_data = pd.read_table('ratings.dat', sep = '::', header = None,  names = rnames, engine = 'python', encoding = 'latin-1')
print('movie shape: ' ,ratings_data.shape)
ratings_data.info()

movie shape:  (1000209, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   user_id    1000209 non-null  int64
 1   movie_id   1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [6]:
# display
ratings_data.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [7]:
ratings_data.shape

(1000209, 4)

In [8]:
# statistic summary
ratings_data.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1030.0,3.0,965302600.0
50%,3070.0,1835.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975220900.0
max,6040.0,3952.0,5.0,1046455000.0


In [9]:
# minimum rating
ratings_data['rating'].min()

1

In [10]:
# maksimum rating
ratings_data['rating'].max()

5

## 3. Data Cleaning

In [11]:
movies_data.isnull().any()

movie_id    False
title       False
genres      False
dtype: bool

In [12]:
movies_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
3878    False
3879    False
3880    False
3881    False
3882    False
Length: 3883, dtype: bool

In [13]:
ratings_data.isnull().any()

user_id      False
movie_id     False
rating       False
timestamp    False
dtype: bool

## 4. Data Analytics

In [14]:
drama_movies = movies_data['genres'].str.contains('Drama')
print(drama_movies.shape)
drama_movies.info()

(3883,)
<class 'pandas.core.series.Series'>
RangeIndex: 3883 entries, 0 to 3882
Series name: genres
Non-Null Count  Dtype
--------------  -----
3883 non-null   bool 
dtypes: bool(1)
memory usage: 3.9 KB


In [16]:
movies_data[drama_movies]

Unnamed: 0,movie_id,title,genres
3,4,Waiting to Exhale (1995),Comedy|Drama
10,11,"American President, The (1995)",Comedy|Drama|Romance
13,14,Nixon (1995),Drama
15,16,Casino (1995),Drama|Thriller
16,17,Sense and Sensibility (1995),Drama|Romance
...,...,...,...
3876,3946,Get Carter (2000),Action|Drama|Thriller
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [17]:
# users, movies dan rating
# check shape movies
movies_data.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
# check ratings
ratings_data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [20]:
# merge movies and rating dataframe
movie_ratings_data = movies_data.merge(ratings_data, on = 'movie_id', how = 'inner')
movie_ratings_data.head()

Unnamed: 0,movie_id,title,genres,user_id,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474


In [21]:
# most rated movies (>< highly rated movies)
# film yang paling banyak di-rating oleh users

most_rated = movie_ratings_data.groupby('title').size()
most_rated.head(25)

title
$1,000,000 Duck (1971)                           37
'Night Mother (1986)                             70
'Til There Was You (1997)                        52
'burbs, The (1989)                              303
...And Justice for All (1979)                   199
1-900 (1994)                                      2
10 Things I Hate About You (1999)               700
101 Dalmatians (1961)                           565
101 Dalmatians (1996)                           364
12 Angry Men (1957)                             616
13th Warrior, The (1999)                        750
187 (1997)                                       55
2 Days in the Valley (1996)                     286
20 Dates (1998)                                 139
20,000 Leagues Under the Sea (1954)             575
200 Cigarettes (1999)                           181
2001: A Space Odyssey (1968)                   1716
2010 (1984)                                     470
24 7: Twenty Four Seven (1997)                    5
24-hou

In [22]:
most_rated = movie_ratings_data.title.value_counts(ascending=True)
most_rated

Open Season (1996)                                          1
Number Seventeen (1932)                                     1
Song of Freedom (1936)                                      1
Elstree Calling (1930)                                      1
Juno and Paycock (1930)                                     1
                                                         ... 
Jurassic Park (1993)                                     2672
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode IV - A New Hope (1977)                2991
American Beauty (1999)                                   3428
Name: title, Length: 3706, dtype: int64

In [24]:
# highly rated  movies
highly_rated = movie_ratings_data.groupby('title').agg({'rating':[np.size, np.mean]})
highly_rated

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"$1,000,000 Duck (1971)",37,3.027027
'Night Mother (1986),70,3.371429
'Til There Was You (1997),52,2.692308
"'burbs, The (1989)",303,2.910891
...And Justice for All (1979),199,3.713568
...,...,...
"Zed & Two Noughts, A (1985)",29,3.413793
Zero Effect (1998),301,3.750831
Zero Kelvin (Kjærlighetens kjøtere) (1995),2,3.500000
Zeus and Roxanne (1997),23,2.521739


In [27]:
# highly rated  movies (sorted)
highly_rated_sorted = highly_rated.sort_values(('rating', 'size'), ascending=False)
highly_rated_sorted

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
American Beauty (1999),3428,4.317386
Star Wars: Episode IV - A New Hope (1977),2991,4.453694
Star Wars: Episode V - The Empire Strikes Back (1980),2990,4.292977
Star Wars: Episode VI - Return of the Jedi (1983),2883,4.022893
Jurassic Park (1993),2672,3.763847
...,...,...
Target (1995),1,4.000000
I Don't Want to Talk About It (De eso no se habla) (1993),1,4.000000
An Unforgettable Summer (1994),1,3.000000
Never Met Picasso (1996),1,2.000000


In [29]:
# highly rated  movies (sorted)
highly_rated_sorted2= highly_rated.sort_values(('rating', 'mean'), ascending=False)
highly_rated_sorted2

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Ulysses (Ulisse) (1954),1,5.0
Lured (1947),1,5.0
Follow the Bitch (1998),1,5.0
Bittersweet Motel (2000),1,5.0
Song of Freedom (1936),1,5.0
...,...,...
"Fantastic Night, The (La Nuit Fantastique) (1949)",1,1.0
Cheetah (1989),1,1.0
Torso (Corpi Presentano Tracce di Violenza Carnale) (1973),2,1.0
Mutters Courage (1995),1,1.0


In [30]:
# display movie rating 5
highly_rated_sorted3 = highly_rated_sorted.loc[highly_rated_sorted[('rating', 'mean')] == 5]
highly_rated_sorted3

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Gate of Heavenly Peace, The (1995)",3,5.0
Smashing Time (1967),2,5.0
Lured (1947),1,5.0
Song of Freedom (1936),1,5.0
Follow the Bitch (1998),1,5.0
One Little Indian (1973),1,5.0
"Baby, The (1973)",1,5.0
Schlafes Bruder (Brother of Sleep) (1995),1,5.0
Bittersweet Motel (2000),1,5.0
Ulysses (Ulisse) (1954),1,5.0
