# 협업필터링(Collaborative Filtering)

## 사용자기반 협업 필터링(User Based Collaborative Filtering)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
ratings = pd.read_csv('./movie_data/ratings.csv')

In [4]:
ratings.shape

(100004, 4)

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
!dir movie_data

 D 드라이브의 볼륨에는 이름이 없습니다.
 볼륨 일련 번호: BC0D-FB82

 D:\Pywork\DataScience\04_AI\04_응용\02_추천시스템\01_협업필터링\movie_data 디렉터리

2023-01-31  오전 10:10    <DIR>          .
2023-01-31  오전 10:10    <DIR>          ..
2023-01-31  오전 09:57           458,390 movies.csv
2023-01-31  오전 09:57         2,438,266 ratings.csv
2023-01-31  오전 09:57           674,990 ratings.csv.zip
2023-01-31  오전 09:57         2,438,266 ratings_small.csv
               4개 파일           6,009,912 바이트
               2개 디렉터리  519,896,707,072 바이트 남음


In [10]:
movies = pd.read_csv('./movie_data/movies.csv')

In [11]:
movies.shape

(9125, 3)

In [12]:
ratings_movies = pd.merge(ratings, movies, on = 'movieId')

In [13]:
ratings_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,7,31,3.0,851868750,Dangerous Minds (1995),Drama
2,31,31,4.0,1273541953,Dangerous Minds (1995),Drama
3,32,31,4.0,834828440,Dangerous Minds (1995),Drama
4,36,31,3.0,847057202,Dangerous Minds (1995),Drama


In [18]:
ratings_matrix = ratings.pivot_table(index=['movieId'], columns = ['userId'], values = 'rating').reset_index(drop=True)
ratings_matrix.fillna(0, inplace = True)
ratings_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
movie_similarity = cosine_similarity(ratings_matrix)
np.fill_diagonal(movie_similarity, 0)
movie_similarity

array([[0.        , 0.39451145, 0.30651588, ..., 0.        , 0.        ,
        0.05582876],
       [0.39451145, 0.        , 0.21749153, ..., 0.        , 0.        ,
        0.        ],
       [0.30651588, 0.21749153, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.05582876, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [22]:
ratings_matrix = pd.DataFrame(movie_similarity)
ratings_matrix.head(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9056,9057,9058,9059,9060,9061,9062,9063,9064,9065
0,0.0,0.394511,0.306516,0.133614,0.245102,0.377086,0.278629,0.063031,0.117499,0.310689,...,0.055829,0.031902,0.079755,0.079755,0.079755,0.079755,0.079755,0.0,0.0,0.055829
1,0.394511,0.0,0.217492,0.164651,0.278476,0.222003,0.207299,0.223524,0.113669,0.418124,...,0.0,0.055038,0.068797,0.082557,0.082557,0.137594,0.068797,0.0,0.0,0.0
2,0.306516,0.217492,0.0,0.177012,0.370732,0.247499,0.435648,0.127574,0.306717,0.191255,...,0.0,0.0,0.0,0.116226,0.116226,0.0,0.0,0.0,0.0,0.0
3,0.133614,0.164651,0.177012,0.0,0.179556,0.072518,0.184626,0.501513,0.25463,0.111447,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.245102,0.278476,0.370732,0.179556,0.0,0.272645,0.388476,0.194113,0.367941,0.246846,...,0.0,0.176845,0.0,0.117897,0.117897,0.0,0.0,0.0,0.0,0.0
5,0.377086,0.222003,0.247499,0.072518,0.272645,0.0,0.278855,0.097561,0.248155,0.307948,...,0.061724,0.098758,0.111103,0.0,0.0,0.0,0.111103,0.0,0.0,0.061724
6,0.278629,0.207299,0.435648,0.184626,0.388476,0.278855,0.0,0.196091,0.349827,0.177425,...,0.079399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079399
7,0.063031,0.223524,0.127574,0.501513,0.194113,0.097561,0.196091,0.0,0.264477,0.042169,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.117499,0.113669,0.306717,0.25463,0.367941,0.248155,0.349827,0.264477,0.0,0.130475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.310689,0.418124,0.191255,0.111447,0.246846,0.307948,0.177425,0.042169,0.130475,0.0,...,0.0,0.076835,0.076835,0.102446,0.102446,0.0,0.076835,0.0,0.0,0.0


In [24]:
try:
    user_inp = 'Jumanji (1995)'
    inp = movies[movies['title'] == user_inp].index.tolist()
    inp = inp[0]
    
    movies['similarity'] = ratings_matrix.iloc[inp]
    movies.head()

except:
    print('Sorry, the movie is not in database!')

In [25]:
print("Recommended movies based on your choice of ", user_inp, ": \n",
      movies.sort_values(['similarity'], ascending = False)[1:10])

Recommended movies based on your choice of  Jumanji (1995) : 
      movieId                              title  \
328      364              Lion King, The (1994)   
283      317           Santa Clause, The (1994)   
331      367                   Mask, The (1994)   
527      595        Beauty and the Beast (1991)   
521      588                     Aladdin (1992)   
309      344  Ace Ventura: Pet Detective (1994)   
520      587                       Ghost (1990)   
427      480               Jurassic Park (1993)   
341      377                       Speed (1994)   

                                              genres  similarity  
328  Adventure|Animation|Children|Drama|Musical|IMAX    0.530357  
283                             Comedy|Drama|Fantasy    0.505831  
331                      Action|Comedy|Crime|Fantasy    0.494605  
527  Animation|Children|Fantasy|Musical|Romance|IMAX    0.494124  
521      Adventure|Animation|Children|Comedy|Musical    0.493995  
309                     

# 그외 응용

## 아이템 기반 협업 필터링 (Item Based Collaborative Filtering)

### 협업기반 필터링 추천방식의 한계
 * 평가점수(또는 별점) 혹은 사용여부(시청)정보가 있어야 한다.
 * 신규 컨텐츠를 추천하기 어렵다 => Cold Start

---

# 컨텐츠기반 필터링 (Content Based Filtering)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import codecs
import pickle
import pandas as pd

In [30]:
movie_info_file = 'u.item'
f = open(movie_info_file, 'r', encoding = 'latin-1')
lines = f.readlines()
movie_title_list = []
for line in lines:
    movie_title = line.split(sep = '|', maxsplit = 2)
    movie_title_list.append(movie_title[1])
    
f.close()

In [31]:
len(movie_title_list)

1682

In [32]:
movie_title_list[:10]

['Toy Story (1995)',
 'GoldenEye (1995)',
 'Four Rooms (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Twelve Monkeys (1995)',
 'Babe (1995)',
 'Dead Man Walking (1995)',
 'Richard III (1995)']

In [33]:
movie_plot_file = 'ml-100k-plot.txt'

f = open(movie_plot_file, 'r', encoding = 'latin-1')
lines = f.readlines()
movie_plot_list = []
for line in lines:
    movie_plot = line.split(sep = '|', maxsplit = 1)
    movie_plot_list.append(movie_plot[1])
    
f.close()

len(movie_plot_list)

1682

In [34]:
movie_plot_list[:3]

['A little boy named Andy loves to be in his room, playing with his toys, especially his doll named "Woody". But, what do the toys do when Andy is not with them, they come to life. Woody believes that he has life (as a toy) good. However, he must worry about Andy\'s family moving, and what Woody does not know is about Andy\'s birthday party. Woody does not realize that Andy\'s mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a toy, and quickly becomes Andy\'s new favorite toy. Woody, who is now consumed with jealousy, tries to get rid of Buzz. Then, both Woody and Buzz are now lost. They must find a way to get back to Andy before he moves without them, but they will have to pass through a ruthless toy killer, Sid Phillips.\n',
 'When a deadly satellite weapon system falls into the wrong hands, only Agent 007 can save the world from certain disaster. Armed with his license to kill, Bond races to Russia in search of the stolen access codes for "Go

In [35]:
vectorizer = TfidfVectorizer(min_df = 2, stop_words=['of','is','this','that','which'])

In [36]:
X = vectorizer.fit_transform(movie_plot_list)
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['000', '007', '10', ..., 'zombies', 'zone', 'zoo'], dtype=object)

In [37]:
movie_sim = cosine_similarity(X)
movie_sim[:3]

array([[1.        , 0.0214585 , 0.05025098, ..., 0.04759486, 0.02297467,
        0.03758619],
       [0.0214585 , 1.        , 0.04526604, ..., 0.04246676, 0.0256056 ,
        0.06447316],
       [0.05025098, 0.04526604, 1.        , ..., 0.04910487, 0.07723813,
        0.10361089]])

In [None]:
def similar_recomend_by_movie_id(movie_id, n_recommand):
    movie_index = movie_id - 1
    # enumerate 함수로 [(리스트 인덱스 0, 유사도 0), (리스트 인덱스 1, 유사도 1)...]의
      # 리스트를 만듭니다. 그 후 각 튜플의 두 번째 항목, 즉 유사도를 이용하여 내림차순 정렬합니다.
      # 이렇게 만든 리스트의 가장 앞 튜플의 첫 번째 항목이 영화 ID가 됩니다.
    similar_movies = sorted(list(enumerate(movie_sim[movie_index])), key = lambda x:x[1], reverse = True)
    recommended = 1
    print