## 입력 데이터 로딩

*   영화 정보는 tmdb_5000_movies.csv라는 파일에 있다. 이 파일만 사용할 예정


In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv("data/tmdb_5000_movies.csv")

In [3]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [4]:
movies.shape

(4803, 20)

In [5]:
# 샘플 DataFrame 생성
df_example = pd.DataFrame(
    {"Math": [82, 90, 76], "Science": [88, 92, 94], "English": [78, 85, 88]}
)

In [6]:
df_example.head()

Unnamed: 0,Math,Science,English
0,82,88,78
1,90,92,85
2,76,94,88


In [7]:
# 각 열의 최대값 찾기
max_values_per_column = df_example.apply(max, axis=0)

# 각 행의 평균 계산하기
average_values_per_row = df_example.apply(lambda row: row.mean(), axis=1)

print(max_values_per_column, average_values_per_row)

Math       90
Science    94
English    88
dtype: int64 0    82.666667
1    89.000000
2    86.000000
dtype: float64


In [8]:
import json


def f(j):
    genres = []
    ar = json.loads(j)
    for a in ar:
        genres.append(a.get("name"))
    return " ".join(sorted(genres))


# apply 함수를 사용할 때 axis를 1로 주어야 한 레코드내의 필드들을 액세스 가능
#                               0으로 주면
movies["genres_name"] = movies.apply(lambda x: f(x.genres), axis=1)

In [9]:
movies[["genres_name"]].head()  # vs. movies['genres_name'].head()

Unnamed: 0,genres_name
0,Action Adventure Fantasy Science Fiction
1,Action Adventure Fantasy
2,Action Adventure Crime
3,Action Crime Drama Thriller
4,Action Adventure Science Fiction


In [10]:
movies["genres_name"].nunique()

638

In [11]:
movies.groupby("genres_name").size()

genres_name
                                            28
Action                                      21
Action Adventure                            15
Action Adventure Animation                   1
Action Adventure Animation Comedy Family     5
                                            ..
Science Fiction Thriller                    12
Thriller                                    23
War                                          2
War Western                                  1
Western                                     18
Length: 638, dtype: int64

## 여러 텍스트 필드들을 모아서 텍스트 유사도에 사용할 텍스트 필드 하나를 생성

In [12]:
for f in ["original_title", "overview", "genres_name"]:
    movies[f] = movies[f].fillna("")

In [13]:
def combine_features(row):
    try:
        return row["original_title"] + " " + row["overview"] + " " + row["genres_name"]
    except:
        print("Error:", row)

In [14]:
movies["combined_features"] = movies.apply(combine_features, axis=1)

In [15]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,genres_name,combined_features
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Action Adventure Fantasy Science Fiction,"Avatar In the 22nd century, a paraplegic Marin..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Action Adventure Fantasy,Pirates of the Caribbean: At World's End Capta...
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Action Adventure Crime,Spectre A cryptic message from Bond’s past sen...
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Action Crime Drama Thriller,The Dark Knight Rises Following the death of D...
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Action Adventure Science Fiction,"John Carter John Carter is a war-weary, former..."


In [16]:
movies = movies.reset_index()  # index라는 필드를 만들어줌
movies.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,genres_name,combined_features
0,0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Action Adventure Fantasy Science Fiction,"Avatar In the 22nd century, a paraplegic Marin..."
1,1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Action Adventure Fantasy,Pirates of the Caribbean: At World's End Capta...
2,2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Action Adventure Crime,Spectre A cryptic message from Bond’s past sen...
3,3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Action Crime Drama Thriller,The Dark Knight Rises Following the death of D...
4,4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Action Adventure Science Fiction,"John Carter John Carter is a war-weary, former..."


In [17]:
movies["combined_features"].head()

0    Avatar In the 22nd century, a paraplegic Marin...
1    Pirates of the Caribbean: At World's End Capta...
2    Spectre A cryptic message from Bond’s past sen...
3    The Dark Knight Rises Following the death of D...
4    John Carter John Carter is a war-weary, former...
Name: combined_features, dtype: object

## TF-IDF 기반 벡터 생성 후 코사인 유사도로 영화들간의 유사도 계산

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
tfidfvectorizer = TfidfVectorizer(analyzer="word", stop_words="english", norm="l2")

In [20]:
tfidf_matrix = tfidfvectorizer.fit_transform(movies["combined_features"])

In [21]:
tfidf_matrix.shape  #  min_df 파라미터!!!

(4803, 22179)

In [22]:
cosine_sim = cosine_similarity(
    tfidf_matrix
)  # linear_kernel을 사용해도 동일함. tfidf 벡터가 생성될 때 L2 normalization이 되었기 때문

In [23]:
df_cosine_sim = pd.DataFrame(data=cosine_sim)
df_cosine_sim.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4793,4794,4795,4796,4797,4798,4799,4800,4801,4802
0,1.0,0.034109,0.013909,0.026299,0.029273,0.046984,0.003855,0.068451,0.020614,0.020293,...,0.0,0.0,0.026697,0.057602,0.0,0.003941,0.0,0.0,0.0,0.0
1,0.034109,1.0,0.014576,0.004304,0.047903,0.023093,0.00404,0.039814,0.021603,0.029597,...,0.016368,0.044292,0.006,0.0,0.0,0.00413,0.0,0.022717,0.016449,0.0
2,0.013909,0.014576,1.0,0.008018,0.009303,0.009417,0.003084,0.041851,0.023712,0.008672,...,0.022647,0.0,0.0,0.0,0.015928,0.007694,0.0,0.011636,0.0,0.0
3,0.026299,0.004304,0.008018,1.0,0.011748,0.007161,0.014069,0.027766,0.028148,0.158541,...,0.002655,0.00302,0.001183,0.004516,0.001678,0.007343,0.0,0.029067,0.038273,0.019549
4,0.029273,0.047903,0.009303,0.011748,1.0,0.007872,0.011027,0.054503,0.005454,0.019947,...,0.010447,0.0,0.0,0.01319,0.0,0.002636,0.0,0.004744,0.0,0.0


## 컨텐츠 기반 추천 함수 만들기

In [24]:
def get_title_from_index(df, index):
    return df[df.index == index]["original_title"].values[0]


def get_index_from_title(df, title):
    return df[df.original_title == title]["index"].values[0]

In [25]:
cosine_sim[0]

array([1.        , 0.03410854, 0.01390903, ..., 0.        , 0.        ,
       0.        ])

In [26]:
for cs in enumerate(cosine_sim[0]):
    print(cs)

(0, np.float64(1.0))
(1, np.float64(0.03410853782660228))
(2, np.float64(0.013909027833476929))
(3, np.float64(0.026299406427582793))
(4, np.float64(0.029272658943387594))
(5, np.float64(0.04698429164961723))
(6, np.float64(0.0038547145246063327))
(7, np.float64(0.06845111489143473))
(8, np.float64(0.020614257715725205))
(9, np.float64(0.020292514366589152))
(10, np.float64(0.04234078479474402))
(11, np.float64(0.027006676853799445))
(12, np.float64(0.03161857450875023))
(13, np.float64(0.01012938358302969))
(14, np.float64(0.06041309047742963))
(15, np.float64(0.013328113123785307))
(16, np.float64(0.036370965857215116))
(17, np.float64(0.028506554034230434))
(18, np.float64(0.02162195731499623))
(19, np.float64(0.01754390776383305))
(20, np.float64(0.015890840282328227))
(21, np.float64(0.013010542711000998))
(22, np.float64(0.01988359152099039))
(23, np.float64(0.02889593817447863))
(24, np.float64(0.01455452908886643))
(25, np.float64(0.0))
(26, np.float64(0.06238914032771081))
(27

In [27]:
def reco_top_similar_movies(movie_title, n=10):
    movie_index = get_index_from_title(movies, movie_title)
    similar_movies = enumerate(cosine_sim[movie_index])
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)

    ret_movies = []
    i = 0
    for element in sorted_similar_movies:
        title = get_title_from_index(movies, element[0])
        ret_movies.append(title)
        i = i + 1
        if i >= n:
            break
    return ret_movies

In [28]:
print(reco_top_similar_movies("Avatar", 5))

['Avatar', 'Apollo 18', 'The American', 'Obitaemyy Ostrov', 'The Matrix']


In [29]:
print(reco_top_similar_movies("Minions", 5))

['Minions', 'Despicable Me 2', 'Stuart Little 2', 'Stuart Little', 'Austin Powers: The Spy Who Shagged Me']


In [30]:
print(reco_top_similar_movies("Harry Potter and the Half-Blood Prince", 5))

['Harry Potter and the Half-Blood Prince', 'Harry Potter and the Goblet of Fire', 'Harry Potter and the Order of the Phoenix', 'Harry Potter and the Chamber of Secrets', 'Harry Potter and the Prisoner of Azkaban']
