In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os

In [2]:
base_path = "data/movie/"
os.listdir(base_path)

['links.csv',
 '.DS_Store',
 'tags.csv',
 'movies_metadata.csv',
 'ratings.csv',
 'README.txt',
 'movies.csv']

## Data Load

In [3]:
# movies_metadata.csv 파일 불러오기
df = pd.read_csv(f"{base_path}movies_metadata.csv")
df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(45466, 24)

## Valid Data

In [4]:
# overview 가 결측치가 아닌 것만 사용하기
df = df[df['overview'].notnull()].copy()
df.shape

(44512, 24)

## Sampling

In [5]:
# "title", "overview" 컬럼만 사용하며, 일부만 샘플링하여 사용
df = df.sample(10000, random_state=42).copy()
df.shape

(10000, 24)

## TF-IDF

TF-IDF(Term Frequency - Inverse Document Frequency)

정보 검색과 텍스트 마이닝에서 이용하는 가중치로, 여러 문서로 이루어진 문서군이 있을 때 어떤 단어가 특정 문서 내에서 얼마나 중요한 것인지를 나타내는 통계적 수치이다. 문서의 핵심어를 추출하거나, 검색 엔진에서 검색 결과의 순위를 결정하거나, 문서들 사이의 비슷한 정도를 구하는 등의 용도로 사용할 수 있다.

TF(단어 빈도, term frequency)는 특정한 단어가 문서 내에 얼마나 자주 등장하는지를 나타내는 값으로, 이 값이 높을수록 문서에서 중요하다고 생각할 수 있다. 하지만 단어 자체가 문서군 내에서 자주 사용 되는 경우, 이것은 그 단어가 흔하게 등장한다는 것을 의미한다. 이것을 DF(문서 빈도, document frequency)라고 하며, 이 값의 역수를 IDF(역문서 빈도, inverse document frequency)라고 한다. TF-IDF는 TF와 IDF를 곱한 값이다.

IDF 값은 문서군의 성격에 따라 결정된다. 예를 들어 '원자'라는 낱말은 일반적인 문서들 사이에서는 잘 나오지 않기 때문에 IDF 값이 높아지고 문서의 핵심어가 될 수 있지만, 원자에 대한 문서를 모아놓은 문서군의 경우 이 낱말은 상투어가 되어 각 문서들을 세분화하여 구분할 수 있는 다른 낱말들이 높은 가중치를 얻게 된다.

* 출처 : [tf-idf - 위키백과, 우리 모두의 백과사전](https://ko.wikipedia.org/wiki/Tf-idf)



## TfidfVectorizer

* API documentation:https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer

* norm='l2' 각 문서의 피처 벡터를 어떻게 벡터 정규화 할지 정한다. 
    - L2 : 벡터의 각 원소의 제곱의 합이 1이 되도록 만드는 것이고 기본 값
    - L1 : 벡터의 각 원소의 절댓값의 합이 1이 되도록 크기를 조절
* smooth_idf=False
    - 피처를 만들 때 0으로 나오는 항목에 대해 작은 값을 더해서(스무딩을 해서) 피처를 만들지 아니면 그냥 생성할지를 결정
* sublinear_tf=False
* use_idf=True
    - TF-IDF를 사용해 피처를 만들 것인지 아니면 단어 빈도 자체를 사용할 것인지 여부


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfvec = TfidfVectorizer(max_features=20000)

In [7]:
tfidf = tfidfvec.fit_transform(df['overview'])
tfidf.shape

(10000, 20000)

In [8]:
tfidfvec.get_feature_names_out()

array(['00', '000', '007', ..., 'он', 'русский', 'совсем'], dtype=object)

In [9]:
df_tfidf = pd.DataFrame(tfidf.toarray(), columns=tfidfvec.get_feature_names_out())
df_tfidf

Unnamed: 0,00,000,007,10,100,1000,100th,101,105,108,...,που,τους,до,живёт,знает,на,не,он,русский,совсем
0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.21363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_matrix = cosine_similarity(df_tfidf)
cosine_matrix

array([[1.        , 0.00951212, 0.00616198, ..., 0.05623646, 0.00299544,
        0.00337578],
       [0.00951212, 1.        , 0.04365479, ..., 0.01941736, 0.04122309,
        0.02272898],
       [0.00616198, 0.04365479, 1.        , ..., 0.00715548, 0.03143502,
        0.03037689],
       ...,
       [0.05623646, 0.01941736, 0.00715548, ..., 1.        , 0.01956266,
        0.02204657],
       [0.00299544, 0.04122309, 0.03143502, ..., 0.01956266, 1.        ,
        0.05520794],
       [0.00337578, 0.02272898, 0.03037689, ..., 0.02204657, 0.05520794,
        1.        ]])

In [11]:
df['title'].unique()

array(["All Tomorrow's Parties", 'Troy', 'Variety', ..., 'Zachariah',
       'Winged Creatures', 'Delirious'], dtype=object)

In [12]:
df_cosine = pd.DataFrame(cosine_matrix, index=df.index, columns=df.index)
df_cosine

Unnamed: 0,15605,7291,14582,34668,44660,11163,40205,26168,19389,25067,...,628,42006,38052,21731,31711,25543,30517,3118,13950,5203
15605,1.000000,0.009512,0.006162,0.014754,0.011717,0.005990,0.014085,0.000000,0.015787,0.010965,...,0.015698,0.014510,0.006771,0.011267,0.015716,0.006496,0.015819,0.056236,0.002995,0.003376
7291,0.009512,1.000000,0.043655,0.057248,0.024488,0.032627,0.030594,0.006685,0.082402,0.092587,...,0.041464,0.085786,0.058484,0.106627,0.079446,0.012103,0.056601,0.019417,0.041223,0.022729
14582,0.006162,0.043655,1.000000,0.024484,0.052140,0.019180,0.009830,0.000000,0.062427,0.029193,...,0.028234,0.044538,0.051459,0.017560,0.040642,0.010608,0.036926,0.007155,0.031435,0.030377
34668,0.014754,0.057248,0.024484,1.000000,0.025402,0.024705,0.044855,0.003937,0.092864,0.068365,...,0.054923,0.063937,0.031423,0.054493,0.060886,0.021111,0.072243,0.030188,0.021597,0.015682
44660,0.011717,0.024488,0.052140,0.025402,1.000000,0.017620,0.027838,0.000000,0.040503,0.020547,...,0.022486,0.014360,0.032374,0.019397,0.028991,0.011184,0.027810,0.003801,0.011666,0.013147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25543,0.006496,0.012103,0.010608,0.021111,0.011184,0.008838,0.016894,0.000000,0.024271,0.014568,...,0.017791,0.006747,0.011079,0.014528,0.017474,1.000000,0.038149,0.007411,0.013326,0.012428
30517,0.015819,0.056601,0.036926,0.072243,0.027810,0.048428,0.033647,0.008375,0.115699,0.077837,...,0.066056,0.044153,0.059011,0.094128,0.061841,0.038149,1.000000,0.029435,0.055237,0.041783
3118,0.056236,0.019417,0.007155,0.030188,0.003801,0.030627,0.007556,0.000000,0.086111,0.012663,...,0.019917,0.009068,0.010319,0.005713,0.015047,0.007411,0.029435,1.000000,0.019563,0.022047
13950,0.002995,0.041223,0.031435,0.021597,0.011666,0.035360,0.011569,0.004576,0.041608,0.018522,...,0.026894,0.027884,0.041187,0.030594,0.043764,0.013326,0.055237,0.019563,1.000000,0.055208


In [13]:
title = "Harry Potter"
df.loc[(df['title'].str.contains(title)) & (df['title'].notnull()), "title"]

16128    Harry Potter and the Deathly Hallows: Part 1
5678          Harry Potter and the Chamber of Secrets
7725         Harry Potter and the Prisoner of Azkaban
Name: title, dtype: object

In [14]:
sim_cos = df_cosine[16128]

In [15]:
df_sim_cos = pd.DataFrame({"title": df["title"], "overview": df["overview"], "similarity": sim_cos}).sort_values("similarity", ascending=False)[:10]
df_sim_cos

Unnamed: 0,title,overview,similarity
16128,Harry Potter and the Deathly Hallows: Part 1,"Harry, Ron and Hermione walk away from their l...",1.0
5678,Harry Potter and the Chamber of Secrets,"Ignoring threats to his life, Harry returns to...",0.31585
7725,Harry Potter and the Prisoner of Azkaban,"Harry, Ron and Hermione return to Hogwarts for...",0.311968
18622,A Very Potter Sequel,Harry and his pals are back for more adventure...,0.22376
31304,"Cherry, Harry & Raquel!",Harry (a corrupt sheriff) and his Chicano depu...,0.195887
3270,Let's Get Harry,Harry Burck has been kidnapped by South Americ...,0.174236
24750,It's a Small World,Harry Musk is one in a million. That means tha...,0.170054
41184,Harry & Snowman,"Dutch immigrant, Harry deLeyer, journeyed to t...",0.166933
4831,Porn Star: The Legend of Ron Jeremy,Porn Star: The Legend of Ron Jeremy offers aud...,0.162652
42952,Penny Points to Paradise,When Harry and Spike visit Bristol to spend th...,0.153549


In [16]:
def find_movie(title, sim_matrix, df):
    try:
        title_idx = \
            df.loc[(df['title'].str.contains(title)) & (df['title'].notnull()), "title"].index[0]
        sim_cos = pd.DataFrame(sim_matrix, index=df.index, columns=df.index)[title_idx]
        
        return pd.DataFrame({"title": df["title"], "overview": df["overview"], "similarity": sim_cos})\
            .sort_values("similarity", ascending=False)[:10]
    except:
        return "Can't find the movie"

find_movie("Troy", cosine_matrix, df)

Unnamed: 0,title,overview,similarity
7291,Troy,"In year 1250 B.C. during the late Bronze age, ...",1.0
7756,Helen of Troy,"Prince Paris of Troy, shipwrecked on a mission...",0.353629
14045,Christopher Columbus: The Discovery,Genoan navigator Christopher Columbus has a dr...,0.145888
43905,Hector,Hector's parents sent him to an orphanage in t...,0.143693
43973,The Scarlet Flower,"Before going on an overseas journey, a merchan...",0.142244
23621,August,"August tells the story of two former lovers, T...",0.138235
6071,Trouble Bound,"Upon getting out of prison, a man who took the...",0.136867
24399,A Modern Coed,Eric Rohmer directs this short documentary tha...,0.136528
25399,Beyond Justice,"A woman's ex-husband, who is the son of an Ara...",0.134644
34991,Johnny Hamlet,"On his way back from the Civil War, Johnny Ham...",0.13324


In [17]:
find_movie("aslkdjlk", cosine_matrix, df)

"Can't find the movie"

In [18]:
from sklearn.metrics.pairwise import euclidean_distances

ed_matrix = 1 / euclidean_distances(df_tfidf, df_tfidf)

  """


In [19]:
ed_matrix

array([[           inf, 7.10494006e-01, 7.09295491e-01, ...,
        7.27869303e-01, 7.08168216e-01, 7.08303331e-01],
       [7.10494006e-01, 6.71088640e+07, 7.23065531e-01, ...,
        7.14073476e-01, 7.22148011e-01, 7.15282324e-01],
       [7.09295491e-01, 7.23065531e-01,            inf, ...,
        7.09650283e-01, 7.18489822e-01, 7.18097679e-01],
       ...,
       [7.27869303e-01, 7.14073476e-01, 7.09650283e-01, ...,
        6.71088640e+07, 7.14126387e-01, 7.15032721e-01],
       [7.08168216e-01, 7.22148011e-01, 7.18489822e-01, ...,
        7.14126387e-01,            inf, 7.27473009e-01],
       [7.08303331e-01, 7.15282324e-01, 7.18097679e-01, ...,
        7.15032721e-01, 7.27473009e-01,            inf]])

In [20]:
# find_movie
find_movie("Star Wars", ed_matrix, df)

Unnamed: 0,title,overview,similarity
5244,Star Wars: Episode II - Attack of the Clones,"Ten years after the invasion of Naboo, the gal...",47453130.0
13346,Game Over,...,1.0
36785,Wojaczek,x,1.0
4538,Slaves of New York,,1.0
2514,Star Wars: Episode I - The Phantom Menace,"Anakin Skywalker, a young slave strong with th...",0.8594775
12885,Star Wars: The Clone Wars,Set between Episode II and III the Clone Wars ...,0.8344442
42981,The Invisible Guardian,When the naked body of a teenage girl is found...,0.765653
12924,Oh! What a Lovely War,A movie about the First World War based on a s...,0.7650689
15377,The Ashes,"Set in the time of Napoleon wars, shows how th...",0.7599538
40660,The Brain Eaters,"Strange things are happening in Riverdale, Ill...",0.757952


## 피어슨 유사도

- 두 벡터가 주어졌을 때의 상관관계를 계산하며 각 벡터의 표본평균으로 정규화하고, 코사인 유사도를 산출하면 피어슨 유사도
- 피어슨 유사도가 1이면 양의 상관관계, -1이면 음의 상관관계, 0이면 상관관계가 없음(독립)을 의미
- 장점: 양적 변수들 사이의 선형관계를 확인하기 쉬움. 코사인 유사도의 평점 부분에 각 유저의 평균값을 뺀 값 확인 가능 
- 단점: 코사인 유사도와 마찬가지로 벡터가 지닌 스칼라들의 값의 크기에 대한 고려를 하지 않음


In [21]:
pearson_sim = np.corrcoef(df_tfidf)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [22]:
# find_movie
find_movie("Star Wars", pearson_sim, df)

Unnamed: 0,title,overview,similarity
5244,Star Wars: Episode II - Attack of the Clones,"Ten years after the invasion of Naboo, the gal...",1.0
2514,Star Wars: Episode I - The Phantom Menace,"Anakin Skywalker, a young slave strong with th...",0.322232
12885,Star Wars: The Clone Wars,Set between Episode II and III the Clone Wars ...,0.280518
42981,The Invisible Guardian,When the naked body of a teenage girl is found...,0.144663
12924,Oh! What a Lovely War,A movie about the First World War based on a s...,0.143501
15377,The Ashes,"Set in the time of Napoleon wars, shows how th...",0.132747
40660,The Brain Eaters,"Strange things are happening in Riverdale, Ill...",0.127644
32490,The Fall of the Essex Boys,The rise and fall of the Essex Boys gang - the...,0.127457
19678,This Ain’t California,This Ain't California is a celebration of the ...,0.126044
21236,Gagarin: First in Space,The film is dedicated to the first steps of ma...,0.126031
