## Import important libraries, movies and their ratings.

In [28]:
#import library 
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

#read data set using tab separator
movie_df = pd.read_csv('https://storage.googleapis.com/dqlab-dataset/title.basics.tsv', sep = '\t') #untuk menyimpan title_basics.tsv
rating_df = pd.read_csv('https://storage.googleapis.com/dqlab-dataset/title.ratings.tsv', sep = '\t') #untuk menyimpan title.ratings.tsv

#first 5 movie and rating data
print(movie_df.head())
print(rating_df.head())


      tconst  titleType                                      primaryTitle  \
0  tt0221078      short                         Circle Dance, Ute Indians   
1  tt8862466  tvEpisode  ¡El #TeamOsos va con todo al "Reality del amor"!   
2  tt7157720  tvEpisode                                     Episode #3.41   
3  tt2974998  tvEpisode                         Episode dated 16 May 1987   
4  tt2903620  tvEpisode                  Frances Bavier: Aunt Bee Retires   

                                      originalTitle  isAdult startYear  \
0                         Circle Dance, Ute Indians        0      1898   
1  ¡El #TeamOsos va con todo al "Reality del amor"!        0      2018   
2                                     Episode #3.41        0      2016   
3                         Episode dated 16 May 1987        0      1987   
4                  Frances Bavier: Aunt Bee Retires        0      1973   

  endYear runtimeMinutes             genres  
0      \N             \N  Documentary,Short  


## Understanding and Cleaning Data

In [29]:
## Understanding data each columns using info func 
print(movie_df.info())
print(rating_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9025 entries, 0 to 9024
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          9025 non-null   object
 1   titleType       9025 non-null   object
 2   primaryTitle    9011 non-null   object
 3   originalTitle   9011 non-null   object
 4   isAdult         9025 non-null   int64 
 5   startYear       9025 non-null   object
 6   endYear         9025 non-null   object
 7   runtimeMinutes  9025 non-null   object
 8   genres          9014 non-null   object
dtypes: int64(1), object(8)
memory usage: 634.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030009 entries, 0 to 1030008
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1030009 non-null  object 
 1   averageRating  1030009 non-null  float64
 2   numVotes       1030009 non-null  int64  
dtypes: flo

In [30]:
##checking total missing values on each columns at movies data

print(movie_df.isnull().sum())

tconst             0
titleType          0
primaryTitle      14
originalTitle     14
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            11
dtype: int64


In [31]:
##Seems like there are 2 title columns that contain missing value, so we decided to fill new movie_df that contains no missing value at primaryTitle and originalTitle

movie_df = movie_df.loc[(movie_df['primaryTitle'].notnull() & movie_df['originalTitle'].notnull())]

print("Total data for the new updated dataset: ", len(movie_df))

Total data for the new updated dataset:  9011


In [32]:
##Also there is 1 columns ('genres') that have 11 missing values. We need to check that data first and decide how to handle missing values

print(movie_df.loc[movie_df['genres'].isnull()])

          tconst  titleType  \
9014  tt10233364  tvEpisode   
9015  tt10925142  tvEpisode   
9016  tt10970874  tvEpisode   
9017  tt11670006  tvEpisode   
9018  tt11868642  tvEpisode   
9019   tt2347742  tvEpisode   
9020   tt3984412  tvEpisode   
9021   tt8740950  tvEpisode   
9022   tt9822816  tvEpisode   
9023   tt9900062  tvEpisode   
9024   tt9909210  tvEpisode   

                                           primaryTitle originalTitle  \
9014  Rolling in the Deep Dish\tRolling in the Deep ...             0   
9015  The IMDb Show on Location: Star Wars Galaxy's ...             0   
9016  Die Bauhaus-Stadt Tel Aviv - Vorbild für die M...             0   
9017  ...ein angenehmer Unbequemer...\t...ein angene...             0   
9018  GGN Heavyweight Championship Lungs With Mike T...             0   
9019  No sufras por la alergia esta primavera\tNo su...             0   
9020  I'm Not Going to Come Last, I'm Just Going to ...             0   
9021  Weight Loss Resolution Restart - Ins 

In [33]:
##we decide to update movie dataset by removing those missing genres value

movie_df = movie_df.loc[(movie_df['genres'].notnull())]

print('Total data for the new updated dataset: ', len(movie_df))

Total data for the new updated dataset:  9000


In [34]:
##we found that theres '//N' value at some columns like startYear, endYear and runtimeMinutes. we will replace that value to np.nan to make it easier for the next step

movie_df['startYear'] = movie_df['startYear'].replace('\\N', np.nan)
movie_df['startYear'] = movie_df['startYear'].astype('float64')
print(movie_df['startYear'].unique()[:5])

#mengubah nilai '\\N' pada endYear menjadi np.nan dan cast kolomnya menjadi float64
movie_df['endYear'] = movie_df['endYear'].replace('\\N', np.nan)
movie_df['endYear'] = movie_df['endYear'].astype('float64')
print(movie_df['endYear'].unique()[:5])

#mengubah nilai '\\N' pada runtimeMinutes menjadi np.nan dan cast kolomnya menjadi float64
movie_df['runtimeMinutes'] = movie_df['runtimeMinutes'].replace('\\N', np.nan)
movie_df['runtimeMinutes'] = movie_df['runtimeMinutes'].astype('float64')
print(movie_df['runtimeMinutes'].unique()[:5])

[1898. 2018. 2016. 1987. 1973.]
[  nan 2005. 1955. 2006. 1999.]
[nan 29.  7. 23. 85.]


In [35]:
## we found that at genres data theres multiple values in 1 data and got separate by ','. so we will transform those data to list to make it easier for the modelling phase

print(movie_df['genres'].head())


0    Documentary,Short
1         Comedy,Drama
2     Comedy,Game-Show
3                 News
4          Documentary
Name: genres, dtype: object


In [36]:
## transforming to list
def transform_to_list(x):
    if ',' in x: 
    #ubah menjadi list apabila ada data pada kolom genre
        return x.split(',')
    else: 
    #jika tidak ada data, ubah menjadi list kosong
        return []

movie_df['genres'] = movie_df['genres'].apply(lambda x: transform_to_list(x))
print(movie_df['genres'].head())

0    [Documentary, Short]
1         [Comedy, Drama]
2     [Comedy, Game-Show]
3                      []
4                      []
Name: genres, dtype: object


In [42]:
## now we will handle missing values from rating dataset

print(rating_df.head(10))
print(rating_df.info())

      tconst  averageRating  numVotes
0  tt0000001            5.6      1608
1  tt0000002            6.0       197
2  tt0000003            6.5      1285
3  tt0000004            6.1       121
4  tt0000005            6.1      2050
5  tt0000006            5.1       111
6  tt0000007            5.4       639
7  tt0000008            5.4      1760
8  tt0000009            5.8       136
9  tt0000010            6.9      5778
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030009 entries, 0 to 1030008
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1030009 non-null  object 
 1   averageRating  1030009 non-null  float64
 2   numVotes       1030009 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 23.6+ MB
None


In [43]:
#as the 2 dataset have been cleaned, then we will try to merge those data by inner join method

movie_rating_df = pd.merge(movie_df, rating_df, on='tconst', how='inner')
print(movie_rating_df.head())


      tconst  titleType              primaryTitle             originalTitle  \
0  tt0043745      short                 Lion Down                 Lion Down   
1  tt0167491      video         Wicked Covergirls         Wicked Covergirls   
2  tt6574096  tvEpisode      Shadow Play - Part 2      Shadow Play - Part 2   
3  tt6941700  tvEpisode              RuPaul Roast              RuPaul Roast   
4  tt7305674      video  UCLA Track & Field Promo  UCLA Track & Field Promo   

   isAdult  startYear  endYear  runtimeMinutes  \
0        0     1951.0      NaN             7.0   
1        1     1998.0      NaN            85.0   
2        0     2017.0      NaN            22.0   
3        0     2017.0      NaN             NaN   
4        0     2017.0      NaN             NaN   

                           genres  averageRating  numVotes  
0     [Animation, Comedy, Family]            7.1       459  
1                              []            5.7         7  
2  [Adventure, Animation, Comedy]        

In [44]:

#we still see some NA values aat 2 columns, we will get rid of it
movie_rating_df = movie_rating_df.dropna(subset=['startYear', 'runtimeMinutes'])

#we will check the data to make sure no NAN values.
print(movie_rating_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1004 entries, 0 to 1374
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          1004 non-null   object 
 1   titleType       1004 non-null   object 
 2   primaryTitle    1004 non-null   object 
 3   originalTitle   1004 non-null   object 
 4   isAdult         1004 non-null   int64  
 5   startYear       1004 non-null   float64
 6   endYear         17 non-null     float64
 7   runtimeMinutes  1004 non-null   float64
 8   genres          1004 non-null   object 
 9   averageRating   1004 non-null   float64
 10  numVotes        1004 non-null   int64  
dtypes: float64(4), int64(2), object(5)
memory usage: 94.1+ KB
None


##Building simple recommender system using weighted average algorithm

In [None]:
#code the algorithm
def imdb_weighted_rating(df, var=0.8):
    v = df[___]
    R = df[___]
    C = df[___].mean()
    m = df[___].quantile(var)
    df['score'] = ___ #Rumus IMDb 
    return df['score']
    
imdb_weighted_rating(movie_rating_df)

#melakukan pengecekan dataframe
print(movie_rating_df.head())