## 콘텐츠 기반 필터링
---

In [120]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')
# Load the TMDB movie metadata CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path — point it at your own copy of
# the file before re-running this notebook elsewhere.
movies = pd.read_csv('/Users/young/dataset_local/tmdb_5000_movies.csv')


In [123]:
# Quick look at the metadata. The three bare expressions below are evaluated
# but not rendered: a cell only displays its final expression, and this cell
# ends with print() calls.
movies.columns
movies[['id','title','genres','vote_average','vote_count','popularity','keywords','overview']]
movies['genres'][1]

# First row of each nested column, printed as a raw NumPy array.
print(movies[['genres']].head(1).values)
print(movies[['keywords']].head(1).values)

[[list(['Action', 'Adventure', 'Fantasy', 'Science Fiction'])]]
[[list(['culture clash', 'future', 'space war', 'space colony', 'society', 'space travel', 'futuristic', 'romance', 'space', 'alien', 'tribe', 'alien planet', 'cgi', 'marine', 'soldier', 'battle', 'love affair', 'anti war', 'power relations', 'mind and soul', '3d'])]]


#### 데이터 전처리
- keywords, genres 열은 딕셔너리의 리스트가 문자열로 저장된 형태이다(예: `[{"id": ..., "name": "Action"}, ...]`). `literal_eval`로 파싱한 뒤 각 딕셔너리의 `name` 값만 뽑아 1차원 리스트로 정리한다.

In [122]:
from ast import literal_eval

def _extract_names(raw):
    """Return the ``name`` values stored in one genres/keywords cell.

    ``raw`` is either the original string form of a list of dicts (parsed here
    with ``literal_eval``) or a list that was already parsed. Accepting both
    makes this cell safe to re-run: a second run leaves the already-converted
    lists untouched instead of failing inside ``literal_eval``.
    """
    entries = literal_eval(raw) if isinstance(raw, str) else raw
    # Entries that are no longer dicts were converted on an earlier run.
    return [entry['name'] if isinstance(entry, dict) else entry for entry in entries]


movies['genres'] = movies['genres'].apply(_extract_names)
movies['keywords'] = movies['keywords'].apply(_extract_names)
# Space-joined genre names: the text that is vectorised later on.
movies['genres_literal'] = movies['genres'].apply(' '.join)
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,genres_literal
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,19995,"[culture clash, future, space war, space colon...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Action Adventure Fantasy Science Fiction
1,300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drug abuse, exotic island, east india ...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Adventure Fantasy Action
2,245000000,"[Action, Adventure, Crime]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, based on novel, secret agent, sequel, mi...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Action Adventure Crime
3,250000000,"[Action, Crime, Drama, Thriller]",http://www.thedarkknightrises.com/,49026,"[dc comics, crime fighter, terrorist, secret i...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Action Crime Drama Thriller
4,260000000,"[Action, Adventure, Science Fiction]",http://movies.disney.com/john-carter,49529,"[based on novel, mars, medallion, space travel...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Action Adventure Science Fiction


#### 코사인 유사도 계산.
- 장르 데이터를 기반으로 유사도를 계산하기 위해 장르를 벡터화해준다.

In [124]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The vectoriser instantiated below is TfidfVectorizer, so it has to be
# imported here; with only CountVectorizer imported this cell raises NameError
# on a fresh kernel.
# min_df=1 keeps every term, exactly like the former min_df=0, but is also
# accepted by newer scikit-learn versions, which validate min_df and reject the
# integer 0.
# Unigrams and bigrams of the space-joined genre names form the vocabulary.
counter_vect = TfidfVectorizer(min_df=1,ngram_range=(1,2))
genre_mat = counter_vect.fit_transform(movies['genres_literal'])

# Pairwise cosine similarity between every pair of movies' genre vectors.
genre_sim = cosine_similarity(genre_mat,genre_mat)
genre_sim.shape


(4803, 4803)

##### 각각의 유사도 계산 후, 인덱스로 치환하여 내림차순 정렬
argsort() 함수는 주어진 배열에서 원소들을 정렬했을 때의 순서를 반환합니다. 따라서 genre_sim.argsort()는 genre_sim 배열의 각 행(row)을 정렬했을 때 각 원소의 인덱스를 반환합니다.

`[:, ::-1]`에서 첫 번째 `:`는 모든 행(row)을 선택한다는 뜻이고, `::-1`은 각 행의 열(column) 순서를 뒤집는다는 뜻입니다. argsort()는 오름차순 기준의 인덱스를 반환하므로, 이를 뒤집으면 유사도가 큰 순서(내림차순)의 인덱스가 됩니다.

결과적으로 genre_sim_sorted_ind는 genre_sim 배열의 각 행(row), 즉 각 영화에 대해 장르 유사도(similarity)가 높은 영화들의 인덱스를 내림차순으로 정렬한 배열입니다.

In [52]:
# For every movie (row), the positions of all movies ordered from most to least
# genre-similar: argsort ranks ascending, so each row is reversed afterwards.
genre_sim_sorted_ind = np.argsort(genre_sim, axis=-1)[:, ::-1]
genre_sim_sorted_ind.shape[1]
genre_sim_sorted_ind

array([[   0,   14,   46, ..., 3038, 3037, 2401],
       [ 329, 2390,  206, ..., 3069, 3067, 2401],
       [   2, 1740, 1542, ..., 3000, 2999, 2401],
       ...,
       [4800, 3809, 3285, ..., 2229, 2230,    0],
       [4802, 1594, 1596, ..., 3204, 3205,    0],
       [4802, 4710, 4521, ..., 3140, 3141,    0]])

### 가중평균하지 않은 상태의 추천. 잘못된 예시

In [129]:
def find_sim_movie(df,sorted_ind,title_name,top_n=10):
    """Return the ``top_n`` rows of ``df`` ranked most similar to ``title_name``.

    Naive variant: the rows are taken straight from the similarity ranking.
    Nothing is filtered out (the queried movie itself may appear) and rating
    quality is not considered.

    Parameters
    ----------
    df : DataFrame with a ``title`` column.
    sorted_ind : 2-D array; row ``i`` holds row positions of ``df`` ordered by
        decreasing similarity to row ``i``.
    title_name : title matched exactly against ``df['title']``.
    top_n : number of leading columns of ``sorted_ind`` to use.

    Returns
    -------
    DataFrame with the selected rows, in ranking order.
    """
    matches = df.loc[df['title'] == title_name]
    # Index values of the matching rows double as row selectors into sorted_ind.
    row_selector = matches.index.values
    top_block = sorted_ind[row_selector, :top_n]

    # Show the raw 2-D selection before it is flattened.
    print(top_block)

    return df.iloc[top_block.ravel()]


# Naive recommendation: genre similarity only, no rating-quality weighting.
similiar_movies = find_sim_movie(movies, genre_sim_sorted_ind, 'The Godfather', top_n=10)
similiar_movies[['title','vote_average']]

# Ten highest raw vote_average values, shown together with vote_count so the
# number of votes behind each average is visible.
movies[['title','vote_average','vote_count']].sort_values(by='vote_average', ascending=False).head(10)

[[1663 1243 4065 1464 2731 3887  892 1881 1370 3378]]


Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


### 평가 횟수를 고려하기 위한 장치 - IMDB 가중 평점(Weighted Rating)
이 공식은 평가(투표) 횟수가 서로 다른 항목들의 평점을 공정하게 비교하기 위해 널리 사용되는 공식입니다.

$$ (\dfrac{v}{v+m})R + (\dfrac{m}{v+m})C $$
- v : 항목에 대한 평가 횟수
- m : 최소 투표 횟수
- R : 개별 항목의 평점 평균
- C : 전체 항목의 평점 평균

*항목의 평가 수(v)가 적을 경우에도 항목의 인기도를 측정할 수 있도록 해줍니다.*

공식은 다음과 같이 해석됩니다.

- (v/(v+m))R: 해당 항목 자체의 평점(R)에 가중치 v/(v+m)을 곱한 항입니다. 투표 수(v)가 많을수록 이 가중치가 커집니다.
- (m/(v+m))C: 전체 평균 평점(C)에 가중치 m/(v+m)을 곱한 항입니다. 투표 수(v)가 적을수록, 또는 최소 투표 수(m)가 클수록 이 가중치가 커집니다.

따라서, 이 공식을 사용하면 개별 항목의 평점 평균(R)을 기반으로 해당 항목의 인기도를 측정할 수 있으며, 이를 통해 항목 간에 공정한 비교를 할 수 있습니다.

In [130]:
# Dataset-wide terms of the weighted-rating formula.
m = movies['vote_count'].quantile(0.6)   # vote-count threshold: 0.6 quantile
C = movies['vote_average'].mean()        # mean rating over all movies

def weighted_vote_average(record):
    """Weighted rating of one movie row.

    Blends the row's own ``vote_average`` (R) with the global mean ``C``:
    the row's rating gets weight v/(v+m) and the global mean gets m/(v+m),
    where v is the row's ``vote_count``. ``m`` and ``C`` are read from the
    module-level variables defined above.
    """
    v, R = record['vote_count'], record['vote_average']
    total = v + m
    return (v/total)*R + (m/total*C)

# Row-wise application; the function is passed directly instead of via a lambda.
movies['weighted_vote'] = movies.apply(weighted_vote_average, axis=1)
movies
movies[['title','vote_average','weighted_vote','vote_count']].sort_values(by='weighted_vote', ascending=False)

Unnamed: 0,title,vote_average,weighted_vote,vote_count
1881,The Shawshank Redemption,8.5,8.396052,8205
3337,The Godfather,8.4,8.263591,5893
662,Fight Club,8.3,8.216455,9413
3232,Pulp Fiction,8.3,8.207102,8428
65,The Dark Knight,8.2,8.136930,12002
...,...,...,...,...
2122,Epic Movie,3.2,4.737894,326
242,Fantastic Four,4.4,4.636554,2278
3746,The Boy Next Door,4.1,4.629738,1022
210,Batman & Robin,4.2,4.591725,1418


In [80]:
def find_sim_movie(df,sorted_ind,title_name,top_n=10):
    """Recommend movies similar to ``title_name``, best weighted rating first.

    Parameters
    ----------
    df : DataFrame with at least ``title`` and ``weighted_vote`` columns.
        Row position ``i`` of ``df`` must correspond to row ``i`` of
        ``sorted_ind``.
    sorted_ind : 2-D integer array; row ``i`` lists row positions of ``df``
        ordered from most to least similar to row ``i``.
    title_name : title matched exactly against ``df['title']``.
    top_n : maximum number of recommendations to return.

    Returns
    -------
    DataFrame with at most ``top_n`` rows of ``df``, never containing the
    queried movie itself, sorted by ``weighted_vote`` in descending order.
    Empty when no row has the requested title.
    """
    # Use row positions rather than index labels: sorted_ind is positional, so
    # labels would only be correct for a default RangeIndex.
    query_pos = np.flatnonzero((df['title'] == title_name).to_numpy())

    # Fetch one spare candidate per queried row so that removing the queried
    # movie(s) afterwards still leaves top_n results. Transposing before
    # flattening puts every queried row's best match first when several rows
    # share the same title.
    candidates = sorted_ind[query_pos, :top_n + query_pos.size].T.reshape(-1)

    # Drop the queried movie(s); np.isin also copes with duplicated titles.
    candidates = candidates[~np.isin(candidates, query_pos)]
    # pd.unique keeps first-appearance order, so the similarity ranking survives.
    candidates = pd.unique(candidates)[:top_n]

    return df.iloc[candidates].sort_values('weighted_vote',ascending=False)

# Genre-similar candidates for 'The Godfather', ordered by weighted rating.
similiar_movies = find_sim_movie(movies, genre_sim_sorted_ind, 'The Godfather', top_n=10)
similiar_movies.loc[:, ['title','vote_average','weighted_vote']]


Unnamed: 0,title,vote_average,weighted_vote
1881,The Shawshank Redemption,8.5,8.396052
2731,The Godfather: Part II,8.3,8.079586
1663,Once Upon a Time in America,8.2,7.657811
3887,Trainspotting,7.8,7.591009
892,Casino,7.8,7.42304
1243,Mean Streets,7.2,6.626569
1370,21,6.5,6.41349
3378,Auto Focus,6.1,6.0932
4065,Mi America,0.0,6.092172
1464,Black Water Transit,0.0,6.092172


## 아이템 기반 협업 필터링
---

In [132]:
import pandas as pd
import numpy as np

# Movie metadata and the user rating log from the ml-latest-small directory.
# From here on `movies` refers to this dataset, not the TMDB frame used earlier.
movies = pd.read_csv('/Users/young/dataset_local/ml-latest-small/movies.csv')
ratings = pd.read_csv('/Users/young/dataset_local/ml-latest-small/ratings.csv')

# One shape per line.
print(movies.shape, ratings.shape, sep='\n')

ratings.head()

(9742, 3)
(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [133]:
# Attach titles to the ratings so the matrix can be keyed by movie title.
# (The earlier movieId-keyed pivot was overwritten before being used and has
# been removed.)
rating_movies = pd.merge(ratings,movies,on='movieId')

# user x title rating matrix; pivot_table's default aggregation (mean) merges
# multiple ratings that fall into the same (user, title) cell.
# Unrated cells become 0 so that cosine similarity can be computed on the matrix.
sp_df = rating_movies.pivot_table('rating',index='userId',columns='title').fillna(0)

#### 유사도 측정을 위해 행렬 transpose

In [135]:
# Items (titles) become rows so that similarity is computed between movies
# rather than between users.
sp_df_T = sp_df.T

from sklearn.metrics.pairwise import cosine_similarity

# Title x title cosine-similarity table, labelled on both axes by title.
items_sim = pd.DataFrame(
    cosine_similarity(sp_df_T, sp_df_T),
    index=sp_df.columns,
    columns=sp_df.columns,
)
# The six highest similarities to the chosen title.
items_sim['Godfather, The (1972)'].sort_values(ascending=False).head(6)

title
Godfather, The (1972)                        1.000000
Godfather: Part II, The (1974)               0.821773
Goodfellas (1990)                            0.664841
One Flew Over the Cuckoo's Nest (1975)       0.620536
Star Wars: Episode IV - A New Hope (1977)    0.595317
Fargo (1996)                                 0.588614
Name: Godfather, The (1972), dtype: float64