In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV loaded below is read from under this path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the movie keyword dataset from the mounted Drive and preview the first rows.
data_keyword = pd.read_csv('/content/drive/MyDrive/4조/코드/Dataset/data_keyword.csv')
data_keyword.head()

Unnamed: 0,id,cast_list,character_list,director,title,genres,overview,production_companies,release_date,keyword_list,keyword_list_no_space
0,862,TomHanks TimAllen DonRickles JimVarney Wallace...,Woody(voice) BuzzLightyear(voice) Mr.PotatoHea...,JohnLasseter,Toy Story,Animation Comedy Family,"Led by Woody, Andy's toys live happily in his ...",Pixar Animation Studios,1990.0,jealousy toy boy friendship friends rivalry bo...,jealousy toy boy friendship friends rivalry bo...
1,8844,RobinWilliams JonathanHyde KirstenDunst Bradle...,AlanParrish SamuelAlanParrish VanPelt JudyShep...,JoeJohnston,Jumanji,Adventure Fantasy Family,When siblings Judy and Peter discover an encha...,TriStar Pictures Teitler Film Interscope Commu...,1990.0,board game disappearance new home recluse gian...,boardgame disappearance newhome recluse gianti...
2,15602,WalterMatthau JackLemmon Ann-MargretAnn-Margre...,MaxGoldman JohnGustafson ArielGustafson MariaS...,HowardDeutch,Grumpier Old Men,Romance Comedy,A family wedding reignites the ancient feud be...,Warner Bros. Lancaster Gate,1990.0,fishing best friend duringcreditsstinger old men,fishing bestfriend duringcreditsstinger oldmen
3,31357,WhitneyHouston AngelaBassett LorettaDevine Lel...,RobinStokes MarvinKing KennethDawkins JohnHarr...,ForestWhitaker,Waiting to Exhale,Comedy Drama Romance,"Cheated on, mistreated and stepped on, the wom...",Twentieth Century Fox Film Corporation,1990.0,based on novel interracial relationship single...,basedonnovel interracialrelationship singlemot...
4,11862,SteveMartin DianeKeaton MartinShort KimberlyWi...,GeorgeBanks NinaBanks FranckEggelhoffer AnnieB...,CharlesShyer,Father of the Bride Part II,Comedy,Just when George Banks has recovered from his ...,Sandollar Productions Touchstone Pictures,1990.0,baby midlife crisis confidence aging daughter ...,baby midlifecrisis confidence aging daughter m...


In [None]:
# Replace every missing value with an empty string (presumably so the text
# columns can be concatenated and vectorized later — NaN would break both).
# Reassign rather than use inplace=True: same result, but the frame is not
# mutated behind the back of earlier cells and the call stays chainable.
data_keyword = data_keyword.fillna('')

In [None]:
def movie_to_id(df):
  """Build title <-> position lookup dictionaries from ``df['title']``.

  Positions are the 0-based enumeration order of the ``title`` values, not the
  values of any ``id`` column.

  Args:
    df: Object whose ``'title'`` item is an iterable of movie titles.

  Returns:
    Tuple ``(movie2id, id2movie)``: title -> position and position -> title.
    When a title occurs more than once, ``movie2id`` keeps the last position.
  """
  # position -> title, in column order
  index_to_title = dict(enumerate(df['title']))
  # title -> position; a repeated title is overwritten by its later position
  title_to_index = {title: position for position, title in index_to_title.items()}
  return title_to_index, index_to_title

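# movie_to_id(df)[0]: dictionary with movie title as key and row position (0-based index in df['title']) as value
# movie_to_id(df)[1]: dictionary with row position (0-based index in df['title']) as key and movie title as value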

In [None]:
# Build both lookup tables in one pass: (title -> row position, row position -> title).
movie2id_dict, id2movie_dict = movie_to_id(data_keyword)

In [None]:
def tf_idf_cos_sim_matrix(*args, data, matrix_size):
  """Compute a pairwise TF-IDF cosine-similarity matrix over text columns.

  The named columns are joined row-wise with a single space, restricted to the
  first ``matrix_size`` rows, vectorized with ``TfidfVectorizer`` (English stop
  words removed) and compared with ``cosine_similarity``.

  Args:
    *args: One or more column names (str) of ``data`` to merge and vectorize.
    data: pandas DataFrame holding the columns; it is not modified.
    matrix_size: Number of leading rows of ``data`` to use (int).

  Returns:
    The result of ``cosine_similarity(tfidf_matrix, tfidf_matrix)``; row and
    column ``i`` correspond to the ``i``-th row of ``data``.

  Raises:
    ValueError: If no column name is given.
  """
  if not args:
    raise ValueError("at least one column name is required")

  # Slice by position. The previous ``.loc[0:matrix_size]`` is label-based and
  # end-inclusive, so it produced matrix_size + 1 rows.
  head = data.iloc[:matrix_size]

  # Build a new Series with ``+`` rather than ``+=``: the in-place operator can
  # write through to the caller's DataFrame column and corrupt later matrices.
  merged_data = head[args[0]]
  for column in args[1:]:
    merged_data = merged_data + " " + head[column]
  merged_data = merged_data.reset_index(drop=True)

  tfidf = TfidfVectorizer(stop_words='english')
  tfidf_matrix = tfidf.fit_transform(merged_data)

  return cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
def top_k_recommender(movie_name, k, cosine_matrix, movie2id_dictionary, id2movie_dictionary):
    """Return the ``k`` movies most similar to ``movie_name``.

    Args:
        movie_name: Title of the movie to base the recommendations on.
        k: Number of recommendations to return.
        cosine_matrix: Pairwise similarity matrix; ``cosine_matrix[i]`` holds the
            scores of row ``i`` against every row.
        movie2id_dictionary: Mapping of movie title -> row index.
        id2movie_dictionary: Mapping of row index -> movie title.

    Returns:
        List of ``(title, score)`` tuples ordered by descending score, with
        scores rounded to 4 decimals and the query movie itself excluded.

    Raises:
        KeyError: If ``movie_name`` is not a key of ``movie2id_dictionary``.
        IndexError: If the movie's row index lies outside ``cosine_matrix``.
    """
    query_idx = movie2id_dictionary[movie_name]  # e.g. 'Toy Story' -> 0
    # Pair every other row index with its similarity to the query row.
    sim_scores = [(i, score) for i, score in enumerate(cosine_matrix[query_idx]) if i != query_idx]
    # Highest similarity first; the sort is stable, so ties keep index order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    # Resolve titles through the mapping passed in, not a notebook-level global.
    return [(id2movie_dictionary[i], round(score, 4)) for i, score in sim_scores[:k]]

In [None]:
def is_movie_in(title):
  """Unimplemented placeholder; returns None for any ``title``."""
  # TODO(review): stub only — the intended behavior is not defined in this notebook.
  return None

def get_movie_id(title):
  """Unimplemented placeholder; returns None for any ``title``."""
  # TODO(review): stub only — the intended behavior is not defined in this notebook.
  return None

def get_movie_title(id):
  """Unimplemented placeholder; returns None for any ``id``."""
  # TODO(review): stub only — the intended behavior is not defined in this notebook.
  return None

# TF-IDF 1 feature Recommend System
- matrix_over : 'overview'
- matrix_cast : 'cast_list'
- matrix_char : 'character_list'

- 주의: 코사인 유사도 matrix의 크기가 매우 커서 메모리 부족으로 런타임(세션)이 초기화될 수 있습니다.

# 1. Feature : 'overview'

In [None]:
# Build the cosine-similarity matrix using 'overview' as the only feature.
matrix_over = tf_idf_cos_sim_matrix(
    'overview',
    data=data_keyword,
    matrix_size=25000,
)

In [None]:
# Top 10 titles most similar to 'Toy Story' according to the overview matrix.
top_k_recommender(movie_name='Toy Story', k=10, cosine_matrix=matrix_over,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Toy Story 3', 0.5261),
 ('Toy Story 2', 0.4648),
 ('The 40 Year Old Virgin', 0.2751),
 ('Small Fry', 0.2716),
 ("Andy Hardy's Blonde Trouble", 0.2378),
 ('Rebel Without a Cause', 0.1838),
 ('Life Begins for Andy Hardy', 0.1807),
 ("You're Only Young Once", 0.1646),
 ("Andy Hardy's Private Secretary", 0.1623),
 ('Condorman', 0.1572)]

In [None]:
# Top 10 titles most similar to 'Rocky' according to the overview matrix.
top_k_recommender(movie_name='Rocky', k=10, cosine_matrix=matrix_over,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Creed', 0.4284),
 ('Rocky IV', 0.2788),
 ('Rocky Balboa', 0.2529),
 ('Rocky II', 0.2504),
 ('Rocky V', 0.2455),
 ('Rocky III', 0.2127),
 ('The Prizefighter and the Lady', 0.1754),
 ('Angels with Dirty Faces', 0.174),
 ('Somebody Up There Likes Me', 0.1655),
 ('Cain and Mabel', 0.1633)]

In [None]:
# Top 10 titles most similar to 'Inception' according to the overview matrix.
top_k_recommender(movie_name='Inception', k=10, cosine_matrix=matrix_over,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[("The Farmer's Wife", 0.1199),
 ('House', 0.1159),
 ('What Ever Happened to Baby Jane?', 0.1116),
 ('Stone Cold', 0.1029),
 ('Cobb', 0.1021),
 ('Cypher', 0.1008),
 ('Mission: Impossible - Rogue Nation', 0.0985),
 ('Dear Murderer', 0.0971),
 ('Shadow Man', 0.0961),
 ('Crumb', 0.0951)]

In [None]:
# Top 10 titles most similar to 'Minions' according to the overview matrix.
top_k_recommender(movie_name='Minions', k=10, cosine_matrix=matrix_over,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Despicable Me 2', 0.2305),
 ('The Invisible Boy', 0.1444),
 ('Madam Satan', 0.1442),
 ('Soul Assassin', 0.1394),
 ('Sherlock Holmes and the Secret Weapon', 0.1331),
 ('The Scarlet Letter', 0.1318),
 ('10 Items or Less', 0.1254),
 ('Casualties', 0.1227),
 ('Lambada', 0.1223),
 ('Freedom Downtime', 0.1184)]

# 2. Feature : 'cast_list'

In [None]:
# Build the cosine-similarity matrix using 'cast_list' as the only feature.
matrix_cast = tf_idf_cos_sim_matrix(
    'cast_list',
    data=data_keyword,
    matrix_size=30000,
)

In [None]:
# Top 10 titles most similar to 'Toy Story' according to the cast_list matrix.
top_k_recommender(movie_name='Toy Story', k=10, cosine_matrix=matrix_cast,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Toy Story 2', 0.5836),
 ('Toy Story That Time Forgot', 0.5507),
 ('Toy Story of Terror!', 0.5386),
 ('Partysaurus Rex', 0.5326),
 ('Ernest Goes to School', 0.4746),
 ('Dr. Otto and the Riddle of the Gloom Beam', 0.4746),
 ('Chimpanzee', 0.431),
 ('Toy Story 3', 0.367),
 ('Hawaiian Vacation', 0.3315),
 ('Casino', 0.2404)]

In [None]:
# Top 10 titles most similar to 'Rocky' according to the cast_list matrix.
top_k_recommender(movie_name='Rocky', k=10, cosine_matrix=matrix_cast,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Rocky II', 1.0),
 ('Rocky III', 1.0),
 ('Rocky IV', 0.7868),
 ('Rocky V', 0.7334),
 ('Rocky Balboa', 0.3324),
 ('Death Hunt', 0.2275),
 ('Toy Story of Terror!', 0.2249),
 ('The Godfather: Part III', 0.218),
 ('Action Jackson', 0.2145),
 ('The Defiant Ones', 0.2111)]

In [None]:
# Top 10 titles most similar to 'Inception' according to the cast_list matrix.
top_k_recommender(movie_name='Inception', k=10, cosine_matrix=matrix_cast,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Vanishing of the Bees', 0.4107),
 ('Hubble 3D', 0.3952),
 ('Killshot', 0.3249),
 ('The Revenant', 0.3244),
 ('The Juror', 0.3234),
 ('Sin City: A Dame to Kill For', 0.323),
 ('Looper', 0.3154),
 ('G.I. Joe: The Rise of Cobra', 0.3142),
 ('The Lookout', 0.3139),
 ('50/50', 0.3127)]

In [None]:
# Top 10 titles most similar to 'Minions' according to the cast_list matrix.
top_k_recommender(movie_name='Minions', k=10, cosine_matrix=matrix_cast,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Binky Nelson Unpacified', 0.3517),
 ('The Congress', 0.2308),
 ('Tropic Thunder', 0.2194),
 ('The Town', 0.2181),
 ('The Ten', 0.2158),
 ('Stolen', 0.2146),
 ('Keeping Up with the Joneses', 0.2124),
 ('Friends with Kids', 0.2119),
 ('The Object of My Affection', 0.2116),
 ('Howl', 0.2112)]

# 3. Feature : 'character_list'

In [None]:
# Build the cosine-similarity matrix using 'character_list' as the only feature.
matrix_char = tf_idf_cos_sim_matrix(
    'character_list',
    data=data_keyword,
    matrix_size=30000,
)

In [None]:
# Top 10 titles most similar to 'Toy Story' according to the character_list matrix.
top_k_recommender(movie_name='Toy Story', k=10, cosine_matrix=matrix_char,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Toy Story 2', 0.8252),
 ('Hawaiian Vacation', 0.8162),
 ('Toy Story That Time Forgot', 0.7926),
 ('Paradise Lost: The Child Murders at Robin Hood Hills', 0.7165),
 ('Land Without Bread', 0.7165),
 ('Waltz with Bashir', 0.7165),
 ('9', 0.7165),
 ('Rocks', 0.7165),
 ('Halo Legends', 0.7165),
 ('Ballad of the Little Soldier', 0.7165)]

In [None]:
# Top 10 titles most similar to 'Rocky' according to the character_list matrix.
top_k_recommender(movie_name='Rocky', k=10, cosine_matrix=matrix_char,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Rocky III', 0.8208),
 ('Rocky II', 0.8093),
 ('Rocky IV', 0.6108),
 ('Rocky V', 0.6108),
 ('Rocky Balboa', 0.3101),
 ('33 Scenes from Life', 0.2209),
 ('Chocolate City', 0.1756),
 ("April's Shower", 0.1742),
 ("What Doesn't Kill You", 0.1613),
 ('The Pope of Greenwich Village', 0.1573)]

In [None]:
# Top 10 titles most similar to 'Inception' according to the character_list matrix.
top_k_recommender(movie_name='Inception', k=10, cosine_matrix=matrix_char,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Seventh Code', 0.211),
 ('Pompeii', 0.2054),
 ('Hercules', 0.2028),
 ('Ghost in the Shell: Stand Alone Complex - The Laughing Man', 0.1799),
 ('Ghost in the Shell Arise - Border 2: Ghost Whispers', 0.1789),
 ('In Their Sleep', 0.1755),
 ('Creep', 0.1564),
 ('Song for Marion', 0.1518),
 ("I'm Not There.", 0.1507),
 ('Bloodknot', 0.1506)]

In [None]:
# Top 10 titles most similar to 'Minions' according to the character_list matrix.
top_k_recommender(movie_name='Minions', k=10, cosine_matrix=matrix_char,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Paradise Lost: The Child Murders at Robin Hood Hills', 0.6448),
 ('Land Without Bread', 0.6448),
 ('Waltz with Bashir', 0.6448),
 ('9', 0.6448),
 ('Rocks', 0.6448),
 ('Halo Legends', 0.6448),
 ('Ballad of the Little Soldier', 0.6448),
 ('The Desert of Forbidden Art', 0.6448),
 ('The Painting', 0.6448),
 ('The Old Lady and the Pigeons', 0.6448)]

In [None]:
# Top 10 titles most similar to 'The Avengers' according to the character_list matrix.
top_k_recommender(movie_name='The Avengers', k=10, cosine_matrix=matrix_char,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Avengers: Age of Ultron', 0.8815),
 ('Marvel Studios: Assembling a Universe', 0.6116),
 ('Captain America: Civil War', 0.5875),
 ('Ultimate Avengers', 0.5621),
 ('Team Thor', 0.4909),
 ('Captain America: The Winter Soldier', 0.4383),
 ('Iron Man & Captain America: Heroes United', 0.4335),
 ('Ultimate Avengers 2', 0.4),
 ('Iron Man 2', 0.3328),
 ('Iron Man & Hulk: Heroes United', 0.2411)]

# TF-IDF 2, 3 features Recommend System

- Matrix-One: 'genres', 'keyword_list'
- Matrix-Two: 'production_companies','director'
- Matrix-Three: 'cast_list','director','production_companies'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV loaded below is read from under this path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the movie keyword dataset from the mounted Drive and preview the first rows.
data_keyword = pd.read_csv('/content/drive/MyDrive/4조/코드/Dataset/data_keyword.csv')
data_keyword.head()

Unnamed: 0.1,Unnamed: 0,id,cast_list,character_list,director,title,genres,overview,production_companies,release_date,keyword_list,keyword_list_no_space
0,0,862,TomHanks TimAllen DonRickles JimVarney Wallace...,Woody(voice) BuzzLightyear(voice) Mr.PotatoHea...,JohnLasseter,Toy Story,Animation Comedy Family,"Led by Woody, Andy's toys live happily in his ...",Pixar Animation Studios,1990.0,jealousy toy boy friendship friends rivalry bo...,jealousy toy boy friendship friends rivalry bo...
1,1,8844,RobinWilliams JonathanHyde KirstenDunst Bradle...,AlanParrish SamuelAlanParrish VanPelt JudyShep...,JoeJohnston,Jumanji,Adventure Fantasy Family,When siblings Judy and Peter discover an encha...,TriStar Pictures Teitler Film Interscope Commu...,1990.0,board game disappearance new home recluse gian...,boardgame disappearance newhome recluse gianti...
2,2,15602,WalterMatthau JackLemmon Ann-MargretAnn-Margre...,MaxGoldman JohnGustafson ArielGustafson MariaS...,HowardDeutch,Grumpier Old Men,Romance Comedy,A family wedding reignites the ancient feud be...,Warner Bros. Lancaster Gate,1990.0,fishing best friend duringcreditsstinger old men,fishing bestfriend duringcreditsstinger oldmen
3,3,31357,WhitneyHouston AngelaBassett LorettaDevine Lel...,RobinStokes MarvinKing KennethDawkins JohnHarr...,ForestWhitaker,Waiting to Exhale,Comedy Drama Romance,"Cheated on, mistreated and stepped on, the wom...",Twentieth Century Fox Film Corporation,1990.0,based on novel interracial relationship single...,basedonnovel interracialrelationship singlemot...
4,4,11862,SteveMartin DianeKeaton MartinShort KimberlyWi...,GeorgeBanks NinaBanks FranckEggelhoffer AnnieB...,CharlesShyer,Father of the Bride Part II,Comedy,Just when George Banks has recovered from his ...,Sandollar Productions Touchstone Pictures,1990.0,baby midlife crisis confidence aging daughter ...,baby midlifecrisis confidence aging daughter m...


In [None]:
# Replace every missing value with an empty string (presumably so the text
# columns can be concatenated and vectorized later — NaN would break both).
# Reassign rather than use inplace=True: same result, but the frame is not
# mutated behind the back of earlier cells and the call stays chainable.
data_keyword = data_keyword.fillna('')

In [None]:
def movie_to_id(df):
  """Build title <-> position lookup dictionaries from ``df['title']``.

  Positions are the 0-based enumeration order of the ``title`` values, not the
  values of any ``id`` column.

  Args:
    df: Object whose ``'title'`` item is an iterable of movie titles.

  Returns:
    Tuple ``(movie2id, id2movie)``: title -> position and position -> title.
    When a title occurs more than once, ``movie2id`` keeps the last position.
  """
  # position -> title, in column order
  index_to_title = dict(enumerate(df['title']))
  # title -> position; a repeated title is overwritten by its later position
  title_to_index = {title: position for position, title in index_to_title.items()}
  return title_to_index, index_to_title

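# movie_to_id(df)[0]: dictionary with movie title as key and row position (0-based index in df['title']) as value
# movie_to_id(df)[1]: dictionary with row position (0-based index in df['title']) as key and movie title as value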

In [None]:
# Build both lookup tables in one pass: (title -> row position, row position -> title).
movie2id_dict, id2movie_dict = movie_to_id(data_keyword)

In [None]:
def tf_idf_cos_sim_matrix(*args, data, matrix_size):
  """Compute a pairwise TF-IDF cosine-similarity matrix over text columns.

  The named columns are joined row-wise with a single space, restricted to the
  first ``matrix_size`` rows, vectorized with ``TfidfVectorizer`` (English stop
  words removed) and compared with ``cosine_similarity``.

  Args:
    *args: One or more column names (str) of ``data`` to merge and vectorize.
    data: pandas DataFrame holding the columns; it is not modified.
    matrix_size: Number of leading rows of ``data`` to use (int).

  Returns:
    The result of ``cosine_similarity(tfidf_matrix, tfidf_matrix)``; row and
    column ``i`` correspond to the ``i``-th row of ``data``.

  Raises:
    ValueError: If no column name is given.
  """
  if not args:
    raise ValueError("at least one column name is required")

  # Slice by position. The previous ``.loc[0:matrix_size]`` is label-based and
  # end-inclusive, so it produced matrix_size + 1 rows.
  head = data.iloc[:matrix_size]

  # Build a new Series with ``+`` rather than ``+=``: the in-place operator can
  # write through to the caller's DataFrame column and corrupt later matrices
  # (this section reuses 'director' and 'production_companies' across calls).
  merged_data = head[args[0]]
  for column in args[1:]:
    merged_data = merged_data + " " + head[column]
  merged_data = merged_data.reset_index(drop=True)

  tfidf = TfidfVectorizer(stop_words='english')
  tfidf_matrix = tfidf.fit_transform(merged_data)

  return cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
def top_k_recommender(movie_name, k, cosine_matrix, movie2id_dictionary, id2movie_dictionary):
    """Return the ``k`` movies most similar to ``movie_name``.

    Args:
        movie_name: Title of the movie to base the recommendations on.
        k: Number of recommendations to return.
        cosine_matrix: Pairwise similarity matrix; ``cosine_matrix[i]`` holds the
            scores of row ``i`` against every row.
        movie2id_dictionary: Mapping of movie title -> row index.
        id2movie_dictionary: Mapping of row index -> movie title.

    Returns:
        List of ``(title, score)`` tuples ordered by descending score, with
        scores rounded to 4 decimals and the query movie itself excluded.

    Raises:
        KeyError: If ``movie_name`` is not a key of ``movie2id_dictionary``.
        IndexError: If the movie's row index lies outside ``cosine_matrix``.
    """
    query_idx = movie2id_dictionary[movie_name]  # e.g. 'Toy Story' -> 0
    # Pair every other row index with its similarity to the query row.
    sim_scores = [(i, score) for i, score in enumerate(cosine_matrix[query_idx]) if i != query_idx]
    # Highest similarity first; the sort is stable, so ties keep index order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    # Resolve titles through the mapping passed in, not a notebook-level global.
    return [(id2movie_dictionary[i], round(score, 4)) for i, score in sim_scores[:k]]

## 1. Feature: 'genres', 'keyword_list'

In [None]:
# Build the cosine-similarity matrix from the 'genres' and 'keyword_list' features.
matrix_one = tf_idf_cos_sim_matrix(
    'genres',
    'keyword_list',
    data=data_keyword,
    matrix_size=5000,
)

In [None]:
# Top 10 titles most similar to 'Toy Story' according to the genres + keyword_list matrix.
top_k_recommender(movie_name='Toy Story', k=10, cosine_matrix=matrix_one,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Small Soldiers', 0.5631),
 ('Toys', 0.4537),
 ('The Transformers: The Movie', 0.4494),
 ("Child's Play", 0.4481),
 ("Child's Play 2", 0.4052),
 ('The Indian in the Cupboard', 0.3862),
 ('Home Alone 3', 0.382),
 ('Pinocchio', 0.3533),
 ('Toy Story 2', 0.2943),
 ("Child's Play 3", 0.2712)]

In [None]:
# Top 10 titles most similar to 'Rocky' according to the genres + keyword_list matrix.
top_k_recommender(movie_name='Rocky', k=10, cosine_matrix=matrix_one,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Rocky II', 0.5408),
 ('Rocky III', 0.4206),
 ('Rocky V', 0.381),
 ('The Hurricane', 0.326),
 ('Rocky IV', 0.3106),
 ('The Champ', 0.3075),
 ('Ali', 0.215),
 ('Body and Soul', 0.2109),
 ('They Made Me a Criminal', 0.2008),
 ('Snake Eyes', 0.1964)]

## 2. Feature: 'production_companies', 'director'

In [None]:
# Build the cosine-similarity matrix from the 'production_companies' and 'director' features.
matrix_two = tf_idf_cos_sim_matrix(
    'production_companies',
    'director',
    data=data_keyword,
    matrix_size=5000,
)

In [None]:
# Top 10 titles most similar to 'Toy Story' according to the production_companies + director matrix.
top_k_recommender(movie_name='Toy Story', k=10, cosine_matrix=matrix_two,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Toy Story 2', 1.0),
 ("A Bug's Life", 0.8992),
 ('Monsters, Inc.', 0.5756),
 ('The Swan Princess', 0.3262),
 ('Return to Never Land', 0.2657),
 ('Jimmy Neutron: Boy Genius', 0.2542),
 ('Titan A.E.', 0.2444),
 ('The Powerpuff Girls Movie', 0.2394),
 ('Beauty and the Beast', 0.2199),
 ('Ice Age', 0.2168)]

In [None]:
# Top 10 titles most similar to 'Rocky' according to the production_companies + director matrix.
top_k_recommender(movie_name='Rocky', k=10, cosine_matrix=matrix_two,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Rocky V', 1.0),
 ('Joe', 0.7323),
 ('Neighbors', 0.6288),
 ('For Keeps', 0.5954),
 ('8 Seconds', 0.5053),
 ('Annie Hall', 0.4948),
 ('Everything You Always Wanted to Know About Sex *But Were Afraid to Ask',
  0.4948),
 ('Interiors', 0.4948),
 ('Love and Death', 0.4948),
 ('Carrie', 0.4741)]

## 3. Feature: 'cast_list', 'director', 'production_companies'

In [None]:
# Build the cosine-similarity matrix from the 'cast_list', 'director' and 'production_companies' features.
matrix_three = tf_idf_cos_sim_matrix(
    'cast_list',
    'director',
    'production_companies',
    data=data_keyword,
    matrix_size=5000,
)

In [None]:
# Top 10 titles most similar to 'Toy Story' according to the cast_list + director + production_companies matrix.
top_k_recommender(movie_name='Toy Story', k=10, cosine_matrix=matrix_three,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Toy Story 2', 0.8251),
 ("A Bug's Life", 0.5401),
 ('Monsters, Inc.', 0.1818),
 ('That Thing You Do!', 0.1513),
 ('Return to Never Land', 0.1193),
 ('My Favorite Martian', 0.0989),
 ('The Beverly Hillbillies', 0.0941),
 ('The Swan Princess', 0.0914),
 ('Simon', 0.0908),
 ('Titan A.E.', 0.0894)]

In [None]:
# Top 10 titles most similar to 'Rocky' according to the cast_list + director + production_companies matrix.
top_k_recommender(movie_name='Rocky', k=10, cosine_matrix=matrix_three,
                  movie2id_dictionary=movie2id_dict, id2movie_dictionary=id2movie_dict)

[('Rocky V', 0.8673),
 ('Rocky II', 0.6788),
 ('Rocky III', 0.6788),
 ('Rocky IV', 0.579),
 ('Joe', 0.3952),
 ('Neighbors', 0.3903),
 ('For Keeps', 0.3825),
 ('8 Seconds', 0.3737),
 ('The Karate Kid, Part II', 0.364),
 ('The Karate Kid, Part III', 0.354)]