# Cosine Similarity

### ![image.png](attachment:image.png)

## 1. 코사인 유사도

In [1]:
from numpy import dot
from numpy.linalg import norm
import numpy as np

def cos_sim(A, B):
    """Return the cosine similarity of two 1-D vectors.

    The result is dot(A, B) divided by the product of the two Euclidean norms.

    Parameters
    ----------
    A, B : array-like
        Vectors of equal length.

    Returns
    -------
    float
        The cosine of the angle between A and B. When either vector has zero
        norm the ratio would be 0/0 (nan plus a RuntimeWarning), so 0.0 is
        returned instead.
    """
    scale = norm(A) * norm(B)
    # A zero vector has no direction; report "no similarity" rather than nan.
    if scale == 0:
        return 0.0
    return dot(A, B) / scale

In [2]:
# Three toy 4-dimensional vectors (presumably per-document term counts).
doc1 = np.array([0, 1, 1, 1])
doc2 = np.array([1, 0, 1, 1])
# doc3 doubles every entry of doc2: same direction, larger magnitude.
doc3 = 2 * doc2

In [3]:
# Pairwise similarities of the three toy vectors. doc3 equals 2 * doc2, so the
# last value is 1 up to floating-point rounding: magnitude does not matter.
cos_sim(doc1,doc2), cos_sim(doc1,doc3), cos_sim(doc2,doc3)

(0.6666666666666667, 0.6666666666666667, 1.0000000000000002)

## 2. 코사인 유사도를 이용한 추천 시스템 구현

In [4]:
import pandas as pd

In [6]:
# Load the movie metadata table from a CSV in the working directory.
# low_memory=False makes pandas parse the whole file at once instead of in
# chunks, which avoids mixed-dtype inference on columns.
movie = pd.read_csv('movies_metadata.csv', low_memory=False)

In [7]:
movie.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [9]:
movie.shape

(45466, 24)

In [10]:
# Keep only the first 20000 rows; the pairwise similarity matrix computed
# further down is dense and grows quadratically with the row count.
# .copy() makes this an independent frame, so assigning to one of its columns
# later does not write into a slice of the full table (SettingWithCopyWarning).
movie = movie.head(20000).copy()

In [12]:
# overview of the movie ( text data )
movie['overview'].isnull().sum()

135

In [13]:
# Replace missing overviews with a single space so that every row holds a
# string before the text is vectorized.
movie['overview'] = movie['overview'].fillna(' ')

### TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# Build a TF-IDF vectorizer that drops scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from the overviews and encode them: one row per movie,
# one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(movie['overview'])

In [19]:
# 47487 words are used!
tfidf_matrix.shape

(20000, 47487)

### Cosine Similarity

In [22]:
from sklearn.metrics.pairwise import linear_kernel

In [23]:
# TfidfVectorizer L2-normalises each row by default (norm='l2'), so the plain
# dot product computed by linear_kernel equals the cosine similarity.
# NOTE(review): the result is a dense (n_movies x n_movies) array, roughly
# 3.2 GB for 20000 rows if the dtype is float64 — confirm the memory budget
# before raising the row count.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [30]:
print(cosine_sim.shape)
cosine_sim

(20000, 20000)


array([[1.        , 0.01575748, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01575748, 1.        , 0.04907345, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.04907345, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.08375766],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.08375766, 0.        ,
        1.        ]])

In [39]:
# Similarity between movie 0 (Toy Story) and each of the 20000 movies
# (including itself, hence the leading 1.0).
cosine_sim[0]

array([1.        , 0.01575748, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [40]:
# Pair each similarity score in row 0 with its column position, giving
# (movie index, score) tuples.
# NOTE(review): this displays one tuple per movie (20000 here); consider slicing
# the row first, e.g. cosine_sim[0][:10], to keep the cell output short.
list(enumerate(cosine_sim[0]))

[(0, 1.0),
 (1, 0.01575747731678539),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.04113995020790759),
 (18, 0.0),
 (19, 0.0),
 (20, 0.009940909498428007),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.019830795451996588),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0063477440216163795),
 (43, 0.0),
 (44, 0.0),
 (45, 0.00932055532825944),
 (46, 0.0),
 (47, 0.0),
 (48, 0.0),
 (49, 0.013821954288971787),
 (50, 0.00985720337272072),
 (51, 0.010960042528831586),
 (52, 0.0),
 (53, 0.0),
 (54, 0.019958825373755495),
 (55, 0.0),
 (56, 0.02534679843095959),
 (57, 0.020764246377724643),
 (58, 0.0),
 (59, 0.03424230329305141),
 (60, 0.0),
 (61, 0.0),
 (62, 0.00860978874084069),
 (63, 0.0),
 (64, 0.01

In [38]:
# Cosine similarity between movie 0 and movie 1.
cosine_sim[0][1]

0.01575747731678539

### Recommendation

In [24]:
# Lookup table: movie title -> row index in `movie`.
# Deduplicate on the title labels (keeping the first occurrence) so that
# indices[title] is always a single integer. Series.drop_duplicates() would
# compare the values instead, and the row indices are already unique, so it
# would leave repeated titles in place.
indices = pd.Series(movie.index, index=movie['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [25]:
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [41]:
def get_rec(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to `title`.

    Parameters
    ----------
    title : str
        A title present in the `indices` lookup table.
    cosine_sim : 2-D array-like
        Square matrix of pairwise similarities; row i holds the scores of
        movie i against every movie. Defaults to the matrix computed above.

    Returns
    -------
    pandas.Series
        Up to 10 titles from `movie['title']`, ordered from most to least
        similar, never including the queried movie itself.

    Raises
    ------
    KeyError
        If `title` is not in `indices`.
    """
    idx = indices[title]
    # A title that appears more than once makes the lookup return a Series;
    # fall back to its first match so a single row is selected below.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx])
    # Stable descending order: equal scores keep ascending movie-index order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the queried movie explicitly instead of assuming it sorts first;
    # a tie at the top score or an all-zero row would break that assumption.
    ranked = ranked[ranked != idx]
    return movie['title'].iloc[ranked[:10]]

In [42]:
get_rec('The Dark Knight Rises')

12481                            The Dark Knight
150                               Batman Forever
1328                              Batman Returns
15511                 Batman: Under the Red Hood
585                                       Batman
9230          Batman Beyond: Return of the Joker
18035                           Batman: Year One
19792    Batman: The Dark Knight Returns, Part 1
3095                Batman: Mask of the Phantasm
10122                              Batman Begins
Name: title, dtype: object