In [2]:
import numpy as np
import pandas as pd

<hr>

# Soal 2 - Film Bagus 🎥

In [3]:
# Dataset

dfMovies = pd.read_csv('movies.csv')
dfMovies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy
10327,148626,The Big Short (2015),Drama


In [4]:
dfRatings = pd.read_csv('ratings.csv')
dfRatings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
...,...,...,...,...
105334,668,142488,4.0,1451535844
105335,668,142507,3.5,1451535889
105336,668,143385,4.0,1446388585
105337,668,144976,2.5,1448656898


<hr>

## Content-based filtering

In [5]:
# Preparing Dataset
dfMovies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [6]:
dfMovies[dfMovies['genres']== "(no genres listed)" ]

# Notes : diasumsikan film yang tidak memiliki genres tidak dimasukkan dalam sistem rekomendasi content based filtering
# Agar tidak mempengaruhi ketika perhitungan CountVectorizer

Unnamed: 0,movieId,title,genres
10172,126929,Li'l Quinquin ( ),(no genres listed)
10260,135460,Pablo (2012),(no genres listed)
10280,138863,The Big Broadcast of 1936 (1935),(no genres listed)
10301,141305,Round Trip to Heaven (1992),(no genres listed)
10303,141472,The 50 Year Argument (2014),(no genres listed)
10317,143709,The Take (2009),(no genres listed)
10328,149532,Marco Polo: One Hundred Eyes (2015),(no genres listed)


In [7]:
dfMovies_n = dfMovies.copy()

for i in range(len(dfMovies)) :
    dfMovies_n['genres'][i]=dfMovies['genres'][i].replace("|"," ")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfMovies_n['genres'][i]=dfMovies['genres'][i].replace("|"," ")


In [10]:
x = dfMovies_n[dfMovies_n['genres'] == "(no genres listed)" ].index
dfMovies_n = dfMovies_n.drop(x)
dfMovies_n[dfMovies_n['genres'] == "(no genres listed)" ]

Unnamed: 0,movieId,title,genres


In [11]:
# Count Vectorizer and Cosine Similarity

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [12]:
cm = cv.fit_transform(dfMovies_n['genres'])
print(cv.get_feature_names())
print(cm.toarray())

['action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi', 'film', 'horror', 'imax', 'musical', 'mystery', 'noir', 'romance', 'sci', 'thriller', 'war', 'western']
[[0 1 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [13]:
cosScore = cosine_similarity(cm.toarray())
cosScore

array([[1.        , 0.77459667, 0.31622777, ..., 0.4472136 , 0.4472136 ,
        0.        ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.70710678, 0.70710678,
        0.        ],
       ...,
       [0.4472136 , 0.        , 0.70710678, ..., 1.        , 1.        ,
        0.        ],
       [0.4472136 , 0.        , 0.70710678, ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

<hr>

### Rekomendasi untuk Joko : Superman vs. The Elite (2012).

In [14]:
indexjoko = dfMovies_n[dfMovies_n['title'] == 'Superman vs. The Elite (2012)'].index.values[0]

In [15]:
similarmovies =  sorted(list(enumerate(cosScore[indexjoko])), key=lambda x: x[1], reverse=True)
similarmovies[:6]

[(6260, 0.9999999999999998),
 (8637, 0.9999999999999998),
 (9370, 0.9999999999999998),
 (9570, 0.9999999999999998),
 (10167, 0.9999999999999998),
 (10275, 0.9999999999999998)]

In [16]:
j = 0
print('Rekomendasi film untuk Joko : ')
for i in similarmovies:
    if j < 5 and i[0] != indexjoko:
        print('     - ' + str(dfMovies_n['title'].iloc[i[0]]))
        j += 1
    elif j >= 5 :
        break
    else :
        pass

Rekomendasi film untuk Joko : 
     - Street Fighter II: The Animated Movie (Sutorîto Faitâ II gekijô-ban) (1994)
     - Batman: Under the Red Hood (2010)
     - Batman: The Dark Knight Returns, Part 2 (2013)
     - Justice League: Throne of Atlantis (2015)
     - Justice League: Gods and Monsters (2015)


<hr>

## Collaborative Filtering

In [17]:
# Preparing Dataset

dfRatings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [18]:
dfMR = dfRatings.merge(dfMovies, left_on='movieId', right_on='movieId')
dfMR.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,9,16,4.0,842686699,Casino (1995),Crime|Drama
2,12,16,1.5,1144396284,Casino (1995),Crime|Drama
3,24,16,4.0,963468757,Casino (1995),Crime|Drama
4,29,16,3.0,836820223,Casino (1995),Crime|Drama


In [19]:
dfRating_n = dfRatings.pivot(index = 'userId', columns='movieId', values='rating')
dfRating_n = dfRating_n.replace([np.NaN,-1],0)
dfRating_n.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
dfRatingcorr = dfRating_n.corr()

<hr>

### Rekomendasi untuk Widodo : Being Flynn (2012)

In [21]:
moviewidodo = dfMR[dfMR['title'] =='Being Flynn (2012)']['movieId'].values[0]
moviewidodo

95816

In [22]:
# rating maksimum 5 : diasumsikan rating film Being flynn Widodod : 5
rw = 5

In [23]:
dfSkorWidodo = pd.DataFrame()
skor = dfRatingcorr.loc[moviewidodo] * rw
    
sortskor = skor.sort_values(ascending=False)
sortskor

movieId
7086     5.000000
4801     5.000000
6898     5.000000
87660    5.000000
53038    5.000000
           ...   
1080    -0.079633
1073    -0.094189
231     -0.094907
367     -0.096955
344     -0.103537
Name: 95816, Length: 10325, dtype: float64

In [24]:
print('Rekomendasi film untuk Widodo : ')
j = 0
for i in sortskor.index :
    if j < 5 and i != moviewidodo:
        print('     - ' + str(dfMovies[dfMovies['movieId']==i]['title'].values[0]))
        j += 1
    elif j >= 5 :
        break
    else :
        pass

Rekomendasi film untuk Widodo : 
     - Pygmalion (1938)
     - Little Foxes, The (1941)
     - Sweet Sixteen (2002)
     - Too Big to Fail (2011)
     - Red Dust (1932)
