In [None]:
import pandas as pd

# Three Most Common Recommender Systems

- **Demographic Filtering**
    - Rekomendasi yang general (tidak sepenuhnya personalized) -> **Top 50 movies of the year**
    - Filter hanya berdasarkan fitur demographic seperti genre, durasi, dll.
    - The simplest recommender system
    - Ide: Rekomendasikan apa yang secara umum disukai orang

- **Content Based Filtering**
    - Rekomendasi barang yang serupa -> **Other movies you may like**
    - Filter menggunakan fitur yang lebih spesifik seperti genre, film director, synopsis, aktor, dll. 
    - Ide: Jika seseorang menonton X, maka dia akan direkomendasikan film yang mirip dengan X

- **Collaborative Filtering**
    - Mencocokkan orang dengan preferensi yang serupa -> **Other people also watched**
    - Tidak membutuhkan filter data apa pun, hanya perlu mencari similarity dengan pengguna lain

# Simple Demographic Filtering: Filter -> Scoring -> Sort

In [None]:
# Load the movie table used for the demographic-filtering walkthrough.
df = pd.read_csv("data/demographic.csv")
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Jumanji,Adventure; Fantasy; Family,104.0,6.9,2413.0,1995,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men,Romance; Comedy,101.0,6.5,92.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,Waiting to Exhale,Comedy; Drama; Romance,127.0,6.1,34.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,Father of the Bride Part II,Comedy,106.0,5.7,173.0,1995,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Step 1: Filter

In [None]:
# Filter parameters read by the filtering and ranking cells below.
# Genre indicator columns; a film is kept only if every listed column equals 1.
genre = ["Animation"]
# (min, max) bounds passed to `runtime.between`, which is inclusive on both ends by default.
duration = (60, 150)
# (min, max) bounds passed to `release_year.between`, inclusive on both ends by default.
year = (2000, 2019)
# Number of rows shown by `head()` when displaying a ranking.
topN = 20

In [None]:
# Keep only the films that satisfy all three criteria at once:
# runtime inside `duration`, release year inside `year`, and every
# requested genre indicator column set to 1.
df = df.loc[
    lambda films: films["runtime"].between(*duration)
    & films["release_year"].between(*year)
    & films[genre].eq(1).all(axis=1)
]
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
3161,The Tigger Movie,Animation; Family,77.0,6.3,146.0,2000,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3354,The Road to El Dorado,Adventure; Animation; Comedy; Family,89.0,7.0,892.0,2000,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3484,Dinosaur,Animation; Family,82.0,6.2,563.0,2000,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3614,Titan A.E.,Animation; Action; Science Fiction; Family; Ad...,94.0,6.3,320.0,2000,1,1,1,0,...,0,0,0,0,0,1,0,0,0,0
3619,Chicken Run,Animation; Comedy; Family,84.0,6.5,1190.0,2000,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


## Step 2: Scoring

Kita pakai `vote_average` sebagai score

## Step 3: Sort

In [None]:
# Project the descriptive columns, then rank by the raw average rating
# (highest first) and show the first `topN` rows.
recommendation = df.loc[
    :, ["title", "genres", "runtime", "vote_average", "vote_count", "release_year"]
]
recommendation.sort_values(by="vote_average", ascending=False).iloc[:topN]

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
25671,Rocks in my Pockets,Comedy; Animation; Drama,88.0,9.4,5.0,2014
41890,Kizumonogatari Part 3: Reiketsu,Animation; Fantasy; Mystery,82.0,9.4,5.0,2017
26636,Lotte from Gadgetville,Adventure; Animation; Comedy; Family,81.0,9.0,4.0,2006
41846,Kizumonogatari Part 2: Nekketsu,Animation; Fantasy; Mystery,68.0,8.9,11.0,2016
35577,"Fuse, Memoirs of the Hunter Girl",Action; Animation; Drama; History,110.0,8.8,4.0,2012
43038,In This Corner of the World,Animation; Drama,128.0,8.7,19.0,2016
40018,Your Name.,Romance; Animation; Drama,106.0,8.5,1030.0,2016
28618,The Life and Mind of Mark DeFriest,Animation; Documentary,92.0,8.5,2.0,2014
41695,The Snow Queen 3: Fire and Ice,Family; Animation; Fantasy,80.0,8.5,2.0,2016
26159,The Littlest Angel,Animation; Family,84.0,8.5,2.0,2011


## IMDB weighted Rating

Jika hanya melihat rata-rata, film dengan rating 9.5 dari 100 orang seakan-akan lebih baik daripada film dengan rating 9.0 dari 10000 orang.<br>
Harusnya rating juga dibobot dengan jumlah orang yang vote. Kita akan pakai IMDB weighted rating sebagai berikut:

\begin{equation}
WR = \frac{Rv+Cm}{v+m}
\end{equation}

- $v$: jumlah voting film tersebut -> (vote_count)
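- $m$: syarat minimum vote -> quantile ke-$q$ dari `vote_count` (default $q = 0.9$); film dengan jumlah vote di bawah $m$ tidak diikutkan
- $R$: rata-rata rating film itu -> (vote_average)
- $C$: rata-rata rating semua film, dibobot dengan jumlah vote -> $\sum Rv / \sum v$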

In [None]:
def IMDB_score(df, q=0.9):
    """Add an IMDB-style weighted rating column to the sufficiently-voted films.

    The weighted rating is ``WR = (R*v + C*m) / (v + m)`` where ``R`` is the
    film's ``vote_average`` and ``v`` its ``vote_count``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``vote_average`` and ``vote_count``.
        The frame passed in is not modified.
    q : float, default 0.9
        Quantile of ``vote_count`` used as the minimum-votes threshold ``m``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows with ``vote_count >= m``, with an
        extra ``score`` column containing the weighted rating. The original
        index labels and column order are preserved.
    """
    # m: minimum number of votes a film needs to be ranked at all.
    m = df.vote_count.quantile(q)
    # C: vote-weighted mean rating over every row of the input frame,
    # computed before the threshold filter is applied.
    C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum()

    # Copy only the kept rows so the new column is written to an independent
    # frame rather than to a slice of another one.
    qualified = df[df.vote_count >= m].copy()

    # Plain column arithmetic: same formula as a row-wise apply, but evaluated
    # in one vectorized pass and well-defined when no rows qualify.
    qualified["score"] = (
        qualified.vote_average * qualified.vote_count + C * m
    ) / (qualified.vote_count + m)
    return qualified

In [None]:
# Keep only the films that pass the vote-count quantile cut and attach their "score" column.
# NOTE(review): this rebinds `df`, so re-running the cell applies the quantile cut again
# to the already reduced frame; re-run from the data-loading cell to start over.
df = IMDB_score(df)

In [None]:
# Rank by the IMDB weighted rating stored in "score" by IMDB_score, not by the
# raw average, so the vote count actually influences the ordering. The score is
# kept in the displayed columns so the ranking criterion is visible.
recommendation = df[["title", "genres", "runtime", "vote_average", "vote_count", "release_year", "score"]]
recommendation.sort_values("score", ascending=False).head(topN)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
40018,Your Name.,Romance; Animation; Drama,106.0,8.5,1030.0,2016
5471,Spirited Away,Fantasy; Adventure; Animation; Family,125.0,8.3,3968.0,2001
9687,Howl's Moving Castle,Fantasy; Animation; Adventure,119.0,8.2,2049.0,2004
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015
12693,WALL·E,Animation; Family,98.0,7.8,6439.0,2008
13710,Up,Animation; Comedy; Family; Adventure,96.0,7.8,7048.0,2009
24383,Big Hero 6,Adventure; Family; Animation; Action; Comedy,102.0,7.8,6289.0,2014
36082,Zootopia,Animation; Adventure; Family; Comedy,108.0,7.7,4961.0,2016
15328,Toy Story 3,Animation; Family; Comedy,103.0,7.6,4710.0,2010
23489,How to Train Your Dragon 2,Fantasy; Action; Adventure; Animation; Comedy;...,102.0,7.6,3163.0,2014
