In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import contractions
import json

In [2]:
movieDF = pd.read_csv('../data/processed/tmdb_4796.csv')

In [3]:
movieDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4796 entries, 0 to 4795
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    4796 non-null   int64  
 1   title                 4796 non-null   object 
 2   release_date          4796 non-null   object 
 3   original_language     4796 non-null   object 
 4   overview              4795 non-null   object 
 5   cast                  4796 non-null   object 
 6   crew                  4796 non-null   object 
 7   production_countries  4707 non-null   object 
 8   keywords              4523 non-null   object 
 9   tagline               4222 non-null   object 
 10  genres                4789 non-null   object 
 11  vote_average          4796 non-null   float64
 12  vote_count            4796 non-null   int64  
 13  popularity            4796 non-null   float64
 14  runtime               4796 non-null   int64  
 15  backdrop_path        

In [4]:
movieDF.head(2)

Unnamed: 0,id,title,release_date,original_language,overview,cast,crew,production_countries,keywords,tagline,genres,vote_average,vote_count,popularity,runtime,backdrop_path,poster_path,homepage
0,19995,Avatar,2009-12-15,en,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","United States of America, United Kingdom","future, society, culture clash, space travel, ...",Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction",7.573,29815,79.932,162,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,https://www.avatar.com/movies/avatar
1,285,Pirates of the Caribbean: At World's End,2007-05-19,en,"Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",United States of America,"east india company, exotic island, strong woma...","At the end of the world, the adventure begins.","Adventure, Fantasy, Action",7.239,13345,81.883,169,/kPcHuPYqzkSo4bmPHtH82GaeEgX.jpg,/jGWpG4YhpQwVmjyHEGkxEkeRf0S.jpg,https://movies.disney.com/pirates-of-the-carib...


In [5]:
### keywords, tagline 에 결측치가 많음
movieDF.isnull().sum()

id                         0
title                      0
release_date               0
original_language          0
overview                   1
cast                       0
crew                       0
production_countries      89
keywords                 273
tagline                  574
genres                     7
vote_average               0
vote_count                 0
popularity                 0
runtime                    0
backdrop_path            181
poster_path               25
homepage                3024
dtype: int64

In [6]:
movieDF[movieDF['genres'].isnull()]

Unnamed: 0,id,title,release_date,original_language,overview,cast,crew,production_countries,keywords,tagline,genres,vote_average,vote_count,popularity,runtime,backdrop_path,poster_path,homepage
4307,137955,Crowsnest,2012-09-07,en,"In late summer of 2011, five young friends on ...",[],[],Canada,found footage,,,4.984,31,3.659,84,,/g2swX9YGaIq1dpGr7gva8DGnWe4.jpg,
4406,335874,Childless,2015-04-03,en,Katherine is a typical teenager. Today's her f...,"[{""cast_id"": 1, ""character"": """", ""credit_id"": ...","[{""credit_id"": ""55575bb49251411e62005cfd"", ""de...",United States of America,,,,4.5,2,0.939,90,/wzP9z4BCdb6sNkntTQ85688DVXw.jpg,/A3xnS8K4II7puaFO8Cwc5MubuRU.jpg,http://www.sealionfilms.com/childless
4497,331493,Light from the Darkroom,2014-01-01,en,Light in the Darkroom is the story of two best...,[],[],"Panama, United States of America","faith, massacre",,,0.0,0,1.192,90,/exRCs4v31yh5CENBUo45g4fa9eX.jpg,/1bWVwd3MIgMhLgAVKEPptzms29i.jpg,
4564,328307,Rise of the Entrepreneur: The Search for a Bet...,2014-11-20,en,The world is changing faster than ever. Techno...,[],[],,,,,8.0,1,0.6,0,,,
4650,320435,UnDivided,2013-02-01,en,UnDivided documents the true story of how a su...,[],[],United States of America,,,,0.0,0,0.6,0,,,
4667,194588,Short Cut to Nirvana: Kumbh Mela,2004-05-11,en,Every 12 years over 70 million pilgrims gather...,[],[],,,,,0.0,0,0.6,85,,,
4709,38786,The Blood of My Brother: A Story of Death in Iraq,2005-11-25,en,THE BLOOD OF MY BROTHER goes behind the scenes...,[],[],,,,,5.7,3,0.6,90,,,


### (1) 줄거리(overview) 기반 코사인 유사도 행렬 생성

##### 결측치 전처리

In [7]:
### overview 컬럼의 결측치 유무 조사 (1개 존재)
movieDF['overview'].isnull().sum()

1

In [8]:
### 결측값을 빈 문자열 값으로 대체
movieDF['overview'] = movieDF['overview'].fillna('')

In [9]:
### 결측치 제거 확인
movieDF['overview'].isnull().sum()

0

##### 영어 축약어 전처리

In [10]:
### 예시 문장
movieDF.loc[6, 'overview']

"When the kingdom's most wanted-and most charming-bandit Flynn Rider hides out in a mysterious tower, he's taken hostage by Rapunzel, a beautiful and feisty tower-bound teen with 70 feet of magical, golden hair. Flynn's curious captor, who's looking for her ticket out of the tower where she's been locked away for years, strikes a deal with the handsome thief and the unlikely duo sets off on an action-packed escapade, complete with a super-cop horse, an over-protective chameleon and a gruff gang of pub thugs."

In [11]:
### contractions 라이브러리를 사용해 축약어 확장 실행
movieDF['overview'] = movieDF['overview'].apply(lambda x: contractions.fix(x))

In [12]:
### 전처리 결과 확인
movieDF.loc[6, 'overview']

"When the kingdom's most wanted-and most charming-bandit Flynn Rider hides out in a mysterious tower, he is taken hostage by Rapunzel, a beautiful and feisty tower-bound teen with 70 feet of magical, golden hair. Flynn's curious captor, who is looking for her ticket out of the tower where she is been locked away for years, strikes a deal with the handsome thief and the unlikely duo sets off on an action-packed escapade, complete with a super-cop horse, an over-protective chameleon and a gruff gang of pub thugs."

In [13]:
### TF-IDF 행렬 생성 (=> 19823 단어 존재)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movieDF['overview']) # scipy.sparse._csr.csr_matrix
print(f'TF-IDF 행렬의 크기(shape) : {tfidf_matrix.shape}')

TF-IDF 행렬의 크기(shape) : (4796, 19823)


In [14]:
### 메모리 부족 주의 !!! (MemoryError 주의)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)  # ndarray
print(f'코사인 유사도 연산 결과 : {cosine_sim.shape}')

코사인 유사도 연산 결과 : (4796, 4796)


In [15]:
### 영화제목 -> 해당 인덱스 (딕셔너리)
title_to_index = dict(zip(movieDF['title'], movieDF.index))

In [16]:
def get_recommendations(title, cosine_sim, get_indices=False):
    ### 선택한 영화의 제목으로부터 해당 영화의 인덱스를 받아온다.
    idx = title_to_index[title]
    
    ### 해당 영화와 모든 영화와의 유사도를 가져온다.
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    ### 유사도에 따라 영화들을 정렬한다.
    sim_scores.sort(key=lambda x: x[1], reverse=True)
    
    ### 가장 유사한 10개의 영화를 추출한다.
    sim_scores = sim_scores[1:11]
    
    ### 가장 유사한 10개 영화의 인덱스를 얻는다.
    movie_indices = [x[0] for x in sim_scores]
    
    if get_indices:
        ### 가장 유사한 10개 영화의 인덱스를 반환한다.
        return movie_indices
    else: 
        ### 가장 유사한 10개 영화의 제목을 리턴한다.
        return movieDF['title'].iloc[movie_indices]

In [17]:
get_recommendations('The Dark Knight Rises', cosine_sim)

65                              The Dark Knight
210                              Batman & Robin
299                              Batman Forever
1359                                     Batman
3848    Batman: The Dark Knight Returns, Part 2
119                               Batman Begins
2507                                  Slow Burn
9            Batman v Superman: Dawn of Justice
1181                                        JFK
1349                      Ghosts of Mississippi
Name: title, dtype: object

In [18]:
### 코사인 유사도 행렬(ndarray)을 csv 파일로 저장 (548MB)
# np.savetxt('../data/processed/cosine_similarity1_4796.csv', cosine_sim, delimiter=',')

In [22]:
### 코사인 유사도 행렬(ndarray)을 npy 파일로 저장 (175MB)
np.save('../data/processed/cosine_similarity1_4796.npy', cosine_sim)

### (2) 출연진, 연출진 기반 코사인 유사도 행렬 생성
- 일반적으로 고려하는 대상 : 출연진(최대 3명), 감독(모두), 각본가(모두), 제작자(최대 3명)
- 출연진 정보는 중요도 순으로 저장되어 있음

In [23]:
### 각 딕셔너리 리스트를 DataFrame으로 바꾸는 방법 (ast.literal_eval 함수도 비슷한 기능)
def str_to_df(input_str):
    """ 문자열으로 저장된 딕셔너리 리스트를 DataFrame으로 변환한 결과를 반환 """
    dict_list = json.loads(input_str)   # 속성명이 큰따옴표("")로 둘러싸여 있을 때만 작동
    castDF = pd.DataFrame(dict_list)
    return castDF

In [24]:
### 출연진 정보 예시
str_to_df(movieDF.loc[0, 'cast'])

Unnamed: 0,cast_id,character,credit_id,gender,id,name,order
0,242,Jake Sully,5602a8a7c3a3685532001c9a,2,65731,Sam Worthington,0
1,3,Neytiri,52fe48009251416c750ac9cb,1,8691,Zoe Saldana,1
2,25,Dr. Grace Augustine,52fe48009251416c750aca39,1,10205,Sigourney Weaver,2
3,4,Col. Quaritch,52fe48009251416c750ac9cf,2,32747,Stephen Lang,3
4,5,Trudy Chacon,52fe48009251416c750ac9d3,1,17647,Michelle Rodriguez,4
...,...,...,...,...,...,...,...
78,98,Female Marine (uncredited),52fe48019251416c750acb5f,0,1207291,Jodie Taylor,78
79,99,Ikran Clan Leader (uncredited),52fe48019251416c750acb63,1,1186027,Alicia Vela-Bailey,79
80,100,Geologist (uncredited),52fe48019251416c750acb67,0,1207292,Richard Whiteside,80
81,101,Na'vi (uncredited),52fe48019251416c750acb6b,0,103259,Nikie Zambo,81


In [25]:
### cast 정보가 3명도 안 되는 영화가 있는지 확인 (=> 꽤 존재)
for i in range(4796):
    s = str_to_df(movieDF.loc[i, 'cast']).shape
    if s[0] < 3:
        print(f'{i} 번째 cast shape : {s}')

1730 번째 cast shape : (2, 7)
2601 번째 cast shape : (0, 0)
3222 번째 cast shape : (1, 7)
3388 번째 cast shape : (2, 7)
3467 번째 cast shape : (2, 7)
3665 번째 cast shape : (0, 0)
3763 번째 cast shape : (1, 7)
3891 번째 cast shape : (1, 7)
3985 번째 cast shape : (0, 0)
4002 번째 cast shape : (0, 0)
4061 번째 cast shape : (0, 0)
4098 번째 cast shape : (2, 7)
4111 번째 cast shape : (0, 0)
4133 번째 cast shape : (1, 7)
4240 번째 cast shape : (0, 0)
4298 번째 cast shape : (0, 0)
4307 번째 cast shape : (0, 0)
4315 번째 cast shape : (0, 0)
4321 번째 cast shape : (1, 7)
4380 번째 cast shape : (2, 7)
4394 번째 cast shape : (0, 0)
4424 번째 cast shape : (0, 0)
4451 번째 cast shape : (0, 0)
4461 번째 cast shape : (1, 7)
4464 번째 cast shape : (2, 7)
4484 번째 cast shape : (0, 0)
4497 번째 cast shape : (0, 0)
4501 번째 cast shape : (0, 0)
4510 번째 cast shape : (0, 0)
4519 번째 cast shape : (1, 7)
4533 번째 cast shape : (2, 7)
4543 번째 cast shape : (0, 0)
4546 번째 cast shape : (0, 0)
4555 번째 cast shape : (0, 0)
4557 번째 cast shape : (0, 0)
4559 번째 cast shape :

In [26]:
def get_casts(x: pd.Series):
    ### x는 movieDF의 한 행(Series)
    if not isinstance(x, pd.Series):
        return ""
    
    castDF = str_to_df(x['cast'])
    castnum = castDF.shape[0]
    
    if castnum >= 3:
        nameList = [name.lower() for name in castDF['name'].head(3)]
        nameList = [name.replace(" ", "") for name in nameList]
        return " ".join(nameList)
    elif castnum > 0:
        nameList = [name.lower() for name in castDF['name']]
        nameList = [name.replace(" ", "") for name in nameList]
        return " ".join(nameList)
    return ""

In [27]:
get_casts(movieDF.iloc[0])

'samworthington zoesaldana sigourneyweaver'

In [28]:
get_casts(movieDF.iloc[1730])

'philippelabro jacquesperrin'

In [29]:
### 연출진 정보 예시
str_to_df(movieDF.loc[0, 'crew'])

Unnamed: 0,credit_id,department,gender,id,job,name
0,52fe48009251416c750aca23,Editing,0,1721,Editor,Stephen E. Rivkin
1,539c47ecc3a36810e3001f87,Art,2,496,Production Design,Rick Carter
2,54491c89c3a3680fb4001cf7,Sound,0,900,Sound Designer,Christopher Boyes
3,54491cb70e0a267480001bd0,Sound,0,900,Supervising Sound Editor,Christopher Boyes
4,539c4a4cc3a36810c9002101,Production,1,1262,Casting,Mali Finn
...,...,...,...,...,...,...
148,5592b2c3c3a36869e800003c,Crew,0,1483231,CG Supervisor,Philippe Rebours
149,5592b317c3a36877470012af,Crew,0,1483232,CG Supervisor,Michael Takarangi
150,5592b345c3a36877470012bb,Crew,0,1483233,CG Supervisor,David Weitzberg
151,5592b37cc3a368775100113b,Crew,0,1483234,CG Supervisor,Ben White


In [30]:
### 단계 : writer, script -> screenplay
### 추출대상 : Director, Writer, Screenplay, Producer, Executive Producer
### (Director of Photography, Original Music Composer, Production Design)
temp = str_to_df(movieDF.loc[0, 'crew'])
temp['job'].unique()

array(['Editor', 'Production Design', 'Sound Designer',
       'Supervising Sound Editor', 'Casting', 'Original Music Composer',
       'Director', 'Writer', 'Producer', 'Screenplay', 'Art Direction',
       'Visual Effects Producer', 'Supervising Art Director',
       'Music Editor', 'Sound Effects Editor', 'Foley', 'Costume Design',
       'Set Decoration', 'Set Designer', 'Executive Producer',
       'Director of Photography', 'Stunts', 'Makeup Artist',
       'Hairstylist', 'Camera Operator', 'Visual Effects Supervisor',
       'Visual Effects Editor', 'Dialect Coach', 'Motion Capture Artist',
       'Stunt Coordinator', 'Steadicam Operator',
       'Makeup Department Head', 'Post Production Supervisor',
       'Costume Supervisor', 'Dialogue Editor',
       'Transportation Coordinator', 'Art Department Coordinator',
       'Assistant Art Director', 'Construction Coordinator',
       'Sound Re-Recording Mixer', 'Choreographer', 'CG Supervisor',
       'Digital Intermediate', 'Produ

In [31]:
### crew 정보가 3명도 안 되는 영화가 있는지 확인 (=> 꽤 존재)
for i in range(4796):
    s = str_to_df(movieDF.loc[i, 'crew']).shape
    if s[0] < 3:
        print(f'{i} 번째 crew shape : {s}')

761 번째 crew shape : (2, 6)
967 번째 crew shape : (2, 6)
1011 번째 crew shape : (1, 6)
1128 번째 crew shape : (2, 6)
1247 번째 crew shape : (2, 6)
1306 번째 crew shape : (2, 6)
1349 번째 crew shape : (2, 6)
1360 번째 crew shape : (2, 6)
1403 번째 crew shape : (2, 6)
1600 번째 crew shape : (2, 6)
1624 번째 crew shape : (1, 6)
1636 번째 crew shape : (2, 6)
1706 번째 crew shape : (2, 6)
1721 번째 crew shape : (2, 6)
1734 번째 crew shape : (2, 6)
1746 번째 crew shape : (2, 6)
1747 번째 crew shape : (2, 6)
1760 번째 crew shape : (2, 6)
1797 번째 crew shape : (2, 6)
1807 번째 crew shape : (2, 6)
1863 번째 crew shape : (2, 6)
1937 번째 crew shape : (1, 6)
1939 번째 crew shape : (2, 6)
2001 번째 crew shape : (2, 6)
2005 번째 crew shape : (1, 6)
2020 번째 crew shape : (2, 6)
2044 번째 crew shape : (1, 6)
2058 번째 crew shape : (2, 6)
2095 번째 crew shape : (2, 6)
2107 번째 crew shape : (2, 6)
2173 번째 crew shape : (2, 6)
2174 번째 crew shape : (1, 6)
2176 번째 crew shape : (2, 6)
2191 번째 crew shape : (2, 6)
2209 번째 crew shape : (2, 6)
2222 번째 crew shape : (

In [32]:
def get_crews(x: pd.Series):
    ## x는 movieDF의 한 행(Series)
    if not isinstance(x, pd.Series):
        return ""
    
    crewDF = str_to_df(x['crew'])
    if crewDF.shape[0] == 0:
        ## 연출진 정보가 없다면 빈 문자열 반환
        return ""
    
    mask1 = crewDF['job'] == 'Director'
    directorList = [director for director in crewDF[mask1].name]
    
    mask2 = crewDF['job'].isin(['Writer', 'Producer', 'Screenplay', 'Executive Producer'])
    mask3 = crewDF['name'].isin(directorList)
    crewDF = crewDF[mask2 & ~mask3]             # 감독을 제외한 나머지 연출진들
    crewDF.drop_duplicates(subset=['name'], inplace=True)   # 중복된 인물 제거
    
    crew_num = len(directorList) + crewDF.shape[0]
    
    if crew_num <= 3:
        ## 다 합쳐서 3명 이하일 경우 전체 반환
        directorList = [director.lower().replace(" ", "") for director in directorList]
        crewList = [name.lower().replace(" ", "") for name in crewDF['name']]
        return " ".join(directorList + crewList)
    
    ## 다 합쳐서 3명 보다 많을 경우 (감독 + 작가 1~2명 + 제작자 1~2명)
    crewDF.drop_duplicates(subset=['job'], inplace=True)
    directorList = [director.lower().replace(" ", "") for director in directorList]
    crewList = [name.lower().replace(" ", "") for name in crewDF['name']]
    return " ".join(directorList + crewList)

In [33]:
str_to_df(movieDF.loc[4661, 'crew'])

Unnamed: 0,credit_id,department,gender,id,job,name
0,52fe47fbc3a368484e0e3c1f,Directing,0,558559,Director,Al Silliman Jr.


In [34]:
get_crews(movieDF.iloc[4661])

'alsillimanjr.'

In [35]:
### 주요 출연진, 연출진 목록 저장
movieDF['credit'] = movieDF.apply(lambda x: get_casts(x) + ' ' + get_crews(x), axis=1)

In [36]:
movieDF.loc[3, 'credit']

'christianbale michaelcaine garyoldman christophernolan charlesroven jonathannolan michaeluslan'

In [37]:
### TF-IDF 행렬 생성 (=> 12642 단어[=인물명] 존재)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix2 = tfidf.fit_transform(movieDF['credit']) # scipy.sparse._csr.csr_matrix
print(f'TF-IDF 행렬의 크기(shape) : {tfidf_matrix2.shape}')

TF-IDF 행렬의 크기(shape) : (4796, 12642)


In [38]:
### 메모리 부족 주의 !!! (MemoryError 주의)
cosine_sim2 = cosine_similarity(tfidf_matrix2, tfidf_matrix2)  # ndarray
print(f'코사인 유사도 연산 결과 : {cosine_sim2.shape}')

코사인 유사도 연산 결과 : (4796, 4796)


In [39]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

65                         The Dark Knight
1196                          The Prestige
119                          Batman Begins
95                            Interstellar
9       Batman v Superman: Dawn of Justice
1149                       American Hustle
3171                         The Contender
1600                      Secondhand Lions
60                       A Christmas Carol
299                         Batman Forever
Name: title, dtype: object

In [40]:
get_recommendations('The Dark Knight Rises', cosine_sim2, True)

[65, 1196, 119, 95, 9, 1149, 3171, 1600, 60, 299]

In [41]:
### 코사인 유사도 행렬(ndarray)을 csv 파일로 저장 (548MB)
# np.savetxt('../data/processed/cosine_similarity2_4796.csv', cosine_sim2, delimiter=',')

In [42]:
### 코사인 유사도 행렬(ndarray)을 npy 파일로 저장 (175MB)
np.save('../data/processed/cosine_similarity2_4796.npy', cosine_sim2)