Nama : Satria Baladewa Harahap

Using the MovieLens 100K data set from GroupLens.org

In [None]:
import pandas as pd
import numpy as np

# Column labels for the two raw files; only the leading fields of each file are read.
r_cols = ['user_id', 'movie_id', 'rating']
m_cols = ['movie_id', 'title']

# u.item is pipe-delimited: keep the movie id and its title.
movies = pd.read_csv(
    'u.item',
    sep='|',
    names=m_cols,
    usecols=range(2),
    encoding="ISO-8859-1",
)

# u.data is tab-delimited: keep the user id, the movie id and the rating.
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=r_cols,
    usecols=range(3),
    encoding="ISO-8859-1",
)

# Combine both tables so every rating row also carries its movie title.
# No key is passed, so pandas joins on the columns the two frames share.
ratings = movies.merge(ratings)
ratings.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


# Item-based collaborative filtering (movie-to-movie rating correlation)

In [None]:
# movieRatings is a sparse user x movie matrix: one row per user_id, one column
# per title, each cell holding the (mean) rating that user gave the title.
# NaN marks a movie the user has not rated.
movieRatings = ratings.pivot_table(
    values='rating',
    index=['user_id'],
    columns=['title'],
)
movieRatings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [None]:
# starWarsRatings is the 'Star Wars (1977)' column of the matrix: a Series
# indexed by user_id, with NaN for users who did not rate the film.
starWarsRatings = movieRatings.loc[:, 'Star Wars (1977)']
starWarsRatings.head()

user_id
1    5.0
2    5.0
3    NaN
4    5.0
5    4.0
Name: Star Wars (1977), dtype: float64

In [None]:
# similarMovies holds the correlation coefficient between 'Star Wars (1977)'
# and every movie in the data set.
similarMovies = movieRatings.corrwith(starWarsRatings)
# Titles whose correlation could not be computed come back as NaN; drop them.
similarMovies = similarMovies.dropna()
similarMovies.head()

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


title
'Til There Was You (1997)    0.872872
1-900 (1994)                -0.645497
101 Dalmatians (1996)        0.211132
12 Angry Men (1957)          0.184289
187 (1997)                   0.027398
dtype: float64

In [None]:
# movieStats summarises every title: how many ratings it received ('size')
# and its average rating ('mean'), both under a top-level 'rating' column.
#
# The aggregations are named by string instead of being passed as
# np.size / np.mean. Recent pandas releases emit a FutureWarning when np.mean
# is handed to groupby.agg, and np.size is applied group by group as a plain
# Python callable; the string aliases produce the same column labels and
# values through the built-in groupby implementations.
movieStats = ratings.groupby('title').agg({'rating': ['size', 'mean']})
movieStats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
'Til There Was You (1997),9,2.333333
1-900 (1994),5,2.6
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344
187 (1997),41,3.02439


In [None]:
# Statistics for movies with only a handful of ratings are unreliable, so a
# cut-off of 250 ratings is used to keep only the popular movies.
# popularMovies is a boolean mask over the titles in movieStats.
popularMovies = movieStats['rating']['size'].ge(250)

# Five popular movies with the highest mean rating, best first.
movieStats.loc[popularMovies].sort_values(by=[('rating', 'mean')], ascending=False).head(5)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Schindler's List (1993),298,4.466443
"Shawshank Redemption, The (1994)",283,4.44523
"Usual Suspects, The (1995)",267,4.385768
Star Wars (1977),583,4.358491
One Flew Over the Cuckoo's Nest (1975),264,4.291667


In [None]:
# recMovies holds every popular movie (see the mask above) together with its
# rating count, its mean rating and its correlation with the reference film,
# ordered from most to least similar.
popularStats = movieStats[popularMovies]

# movieStats has a two-level column header while the similarity column has a
# single level. Joining frames with different numbers of column levels warns
# on older pandas and is rejected with a MergeError by newer releases, so the
# header is flattened to plain tuple labels first. These are the same
# ('rating', 'size') / ('rating', 'mean') labels the implicit flattening used
# to produce, so the resulting columns are unchanged.
popularStats = popularStats.set_axis(popularStats.columns.to_flat_index(), axis=1)

# A named Series joins as a single column called 'similarity'.
recMovies = popularStats.join(similarMovies.rename('similarity'))
recMovies = recMovies.sort_values('similarity', ascending=False)

# The reference film correlates perfectly with itself, so remove it by label
# rather than assuming it always occupies the first row.
recMovies.drop(index=starWarsRatings.name, errors='ignore').head(10)

  sort=sort,


Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Empire Strikes Back, The (1980)",367,4.20436,0.747981
Return of the Jedi (1983),507,4.00789,0.672556
Raiders of the Lost Ark (1981),420,4.252381,0.536117
Indiana Jones and the Last Crusade (1989),331,3.930514,0.350107
L.A. Confidential (1997),297,4.161616,0.319065
E.T. the Extra-Terrestrial (1982),300,3.833333,0.303619
Back to the Future (1985),350,3.834286,0.274839
Jaws (1975),280,3.775,0.265459
"Terminator, The (1984)",301,3.933555,0.262255
"Princess Bride, The (1987)",324,4.17284,0.259711
