In [3]:
# Surprise 설치
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 27.5 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619408 sha256=1c6ae3e413e62154327824d9bf8e342764e27f3f3d0eef0b6382b00f79b5b2d8
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [5]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, accuracy

import warnings; warnings.simplefilter('ignore')

In [6]:
# 단순 추천 시스템(Simple Recommender)

In [7]:
md=pd.read_csv("./drive/MyDrive/data-files/movies_metadata.csv")
print(md.shape)
md.head(1)

(45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [8]:
# null 값 체크
md.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [9]:
# 컬럼 길이 100으로 세팅
pd.set_option('max_colwidth', 100)
md[['genres']][:1]

Unnamed: 0,genres
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"


In [10]:
# apply()에 literal_eval 함수를 적용해 문자열을 객체로 변경
md['genres']=md['genres'].apply(literal_eval)
md.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear o...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [11]:
# apply lambda를 이용하여 리스트 내 여러 개의 딕셔너리의 'name' 키 찾아 리스트 객체로 변환.
md['genres']=md['genres'].apply(lambda x : [ y['name'] for y in x])
md[['genres']][:1]

Unnamed: 0,genres
0,"[Animation, Comedy, Family]"


In [12]:
md[['genres']]

Unnamed: 0,genres
0,"[Animation, Comedy, Family]"
1,"[Adventure, Fantasy, Family]"
2,"[Romance, Comedy]"
3,"[Comedy, Drama, Romance]"
4,[Comedy]
...,...
45461,"[Drama, Family]"
45462,[Drama]
45463,"[Action, Drama, Thriller]"
45464,[]


In [13]:
print('vote ::: \n', md[['vote_count', 'vote_average']].head())
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

vote ::: 
    vote_count  vote_average
0      5415.0           7.7
1      2413.0           6.9
2        92.0           6.5
3        34.0           6.1
4       173.0           5.7


5.244896612406511

In [14]:
# 총 45460개의 영화 중 상위 5%는 2273번째
print(vote_counts.sort_values(ascending=False)[2273:2274])

# quantile는 데이터를 크기대로 정렬하였을 때 분위수를 구하는 함수. quantile(0.95)는 상위 5%에 해당하는 값을 찾는 것
m = vote_counts.quantile(0.95)
m

11561    434
Name: vote_count, dtype: int64


434.0

In [15]:
print('release_date ::: \n', md['release_date'].head())

# pd.to_datetime
# errors : {‘ignore’, ‘raise’, ‘coerce’}, default ‘raise’
# If ‘raise’, then invalid parsing will raise an exception
# If ‘coerce’, then invalid parsing will be set as NaT
# If ‘ignore’, then invalid parsing will return the input

# 'release_date'를 split해서 year만 추출
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

print('year ::: \n', md['year'].head())

release_date ::: 
 0    1995-10-30
1    1995-12-15
2    1995-12-22
3    1995-12-22
4    1995-02-10
Name: release_date, dtype: object
year ::: 
 0    1995
1    1995
2    1995
3    1995
4    1995
Name: year, dtype: object


In [16]:
# 평가 수가 상위 5%인(434보다 큰) 데이터 추출
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [17]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [18]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [19]:
# Weighted Rating 상위 250개의 영화 
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [20]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, Adventure]",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,"[Adventure, Fantasy, Action]",7.851924


In [21]:
# stack() : stack이 (위에서 아래로 길게, 높게) 쌓는 것이면, unstack은 쌓은 것을 옆으로 늘어놓는것(왼쪽에서 오른쪽으로 넓게) 라고 연상이 될 것
# reset_index() : 기존의 행 인덱스를 제거하고 인덱스를 데이터 열로 추가
s = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
print(s.head(10))

gen_md = md.drop('genres', axis=1).join(s)
print(gen_md.head(10))

0    Animation
0       Comedy
0       Family
1    Adventure
1      Fantasy
1       Family
2      Romance
2       Comedy
3       Comedy
3        Drama
Name: genre, dtype: object
   adult  ...      genre
0  False  ...  Animation
0  False  ...     Comedy
0  False  ...     Family
1  False  ...  Adventure
1  False  ...    Fantasy
1  False  ...     Family
2  False  ...    Romance
2  False  ...     Comedy
3  False  ...     Comedy
3  False  ...      Drama

[10 rows x 25 columns]


In [22]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title','year','vote_count','vote_average','popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [23]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.565285
351,Forrest Gump,1994,8147,8,48.3072,7.971357
876,Vertigo,1958,1162,8,18.2082,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.8451,7.745154
1132,Cinema Paradiso,1988,834,8,14.177,7.744878
19901,Paperman,2012,734,8,7.19863,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.9943,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


In [None]:
# 콘텐츠 기반 추천(Content Based Recommender)

In [24]:
links_small = pd.read_csv('./drive/MyDrive/data-files/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
links_small.head()

0      862
1     8844
2    15602
3    31357
4    11862
Name: tmdbId, dtype: int64

In [25]:
# Drop a row by index : 19730, 29503, 33587 행은 이상한 데이터들(md.iloc[19730], md.iloc[29503], md.iloc[33587])
md = md.drop([19730, 29503, 35587])

In [26]:
#Check EDA Notebook for how and why I got these indices.
md['id'] = md['id'].astype('int')

In [30]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

In [31]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

smd['description'].head()

0    Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear o...
1    When siblings Judy and Peter discover an enchanted board game that opens the door to a magical w...
2    A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John...
3    Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusi...
4    Just when George Banks has recovered from his daughter's wedding, he receives the news that she'...
Name: description, dtype: object

In [None]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [None]:
print(tfidf_matrix[10])

  (0, 237880)	0.14118434571637967
  (0, 264587)	0.15239448296082492
  (0, 149397)	0.1346268358004416
  (0, 184592)	0.13757548758984955
  (0, 192607)	0.15239448296082492
  (0, 12515)	0.15239448296082492
  (0, 59410)	0.15239448296082492
  (0, 204261)	0.15239448296082492
  (0, 260369)	0.15239448296082492
  (0, 221301)	0.15239448296082492
  (0, 51585)	0.15239448296082492
  (0, 15732)	0.15239448296082492
  (0, 213978)	0.15239448296082492
  (0, 142130)	0.15239448296082492
  (0, 256956)	0.14583697304488685
  (0, 255009)	0.15239448296082492
  (0, 73255)	0.15239448296082492
  (0, 232769)	0.15239448296082492
  (0, 51809)	0.15239448296082492
  (0, 255952)	0.15239448296082492
  (0, 154810)	0.14583697304488685
  (0, 184597)	0.14583697304488685
  (0, 264425)	0.13213378623513553
  (0, 213982)	0.15239448296082492
  (0, 10509)	0.15239448296082492
  :	:
  (0, 259727)	0.15239448296082492
  (0, 237762)	0.08073572734141646
  (0, 192601)	0.13213378623513553
  (0, 12513)	0.13757548758984955
  (0, 59409)	0.14

In [None]:
tfidf_matrix.shape

(9099, 268124)

In [None]:
# linear_kernel는 두 벡터의 dot product 이다.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [None]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

print(titles.head(), indices.head())

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64


In [None]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [None]:
get_recommendations('The Godfather').head(10)

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29               Shanghai Triad
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                    8 Women
2159              Summer of Sam
Name: title, dtype: object

In [None]:
# The Dark Knight
get_recommendations('Inception').head(10)

5239                              Cypher
141                                Crumb
6398                         Renaissance
653                            Lone Star
1703                               House
4739                    The Pink Panther
319                                 Cobb
2828    What Ever Happened to Baby Jane?
8867                     Pitch Perfect 2
979          Once Upon a Time in America
Name: title, dtype: object

In [None]:
credits = pd.read_csv('./drive/MyDrive/data-files/credits.csv')
keywords = pd.read_csv('./drive/MyDrive/data-files/keywords.csv')

In [None]:
credits['crew'][0]

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

In [None]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [None]:
md.shape

(45463, 25)

In [None]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [None]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

In [None]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [None]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [None]:
smd['director'] = smd['crew'].apply(get_director)

In [None]:
# 출연진 중 상위에 노출되는 3명만 추출
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)

In [None]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [None]:
# 출연진의 이름에서 공백 삭제
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [None]:
# 감독의 이름에서 공백 삭제 및 3번 언급?
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

In [None]:
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [None]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [None]:
# 2번 이상 등장한 키워드만 추출
s = s[s > 1]

In [None]:
# 어근 추출을 통해 동일 의미&다른 형태의 단어(dogs&dog, imaging&image 등)를 동일한 단어로 인식
stemmer = SnowballStemmer('english')
print("dogs의 어근 : ", stemmer.stem('dogs'))
print("dog의 어근 : ", stemmer.stem('dog'))

dogs의 어근 :  dog
dog의 어근 :  dog


In [None]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [None]:
# 키워드의 어근을 찾아서 공백 제거 후 세팅
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [None]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [None]:
count = CountVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [None]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [None]:
get_recommendations('The Dark Knight').head(10)

8031         The Dark Knight Rises
6218                 Batman Begins
6623                  The Prestige
2085                     Following
7648                     Inception
4145                      Insomnia
3381                       Memento
8613                  Interstellar
7659    Batman: Under the Red Hood
1134                Batman Returns
Name: title, dtype: object

In [None]:
get_recommendations('Mean Girls').head(10)

3319               Head Over Heels
4763                 Freaky Friday
1329              The House of Yes
6277              Just Like Heaven
7905         Mr. Popper's Penguins
7332    Ghosts of Girlfriends Past
6959     The Spiderwick Chronicles
8883                      The DUFF
6698         It's a Boy Girl Thing
7377       I Love You, Beth Cooper
Name: title, dtype: object

In [None]:
def improved_recommendations(title):
    print(title)
    idx = indices[title]
    print(idx)
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    print(movie_indices)

    movies = smd.iloc[movie_indices][['title','vote_count','vote_average','year']]
#     print(movies)
    
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull())]
#     print(qualified)
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    print(qualified)
    return qualified

In [None]:
improved_recommendations('The Dark Knight')

The Dark Knight
6981
[8031, 6218, 6623, 2085, 7648, 4145, 3381, 8613, 7659, 1134, 8927, 5943, 1260, 9024, 4021, 5809, 7362, 7561, 7582, 8001, 2754, 132, 2131, 2448, 5098]
                                   title  vote_count  ...  year        wr
7648                           Inception       14075  ...  2010  8.014597
8613                        Interstellar       11187  ...  2014  7.993373
3381                             Memento        4168  ...  2000  7.830744
6623                        The Prestige        4510  ...  2006  7.758148
8031               The Dark Knight Rises        9263  ...  2012  7.494595
6218                       Batman Begins        7511  ...  2005  7.376814
1134                      Batman Returns        1706  ...  1992  6.325180
9024  Batman v Superman: Dawn of Justice        7189  ...  2016  5.674090
132                       Batman Forever        1529  ...  1995  5.209926
1260                      Batman & Robin        1447  ...  1997  4.441087

[10 rows x 5 c

Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8.1,2010,8.014597
8613,Interstellar,11187,8.1,2014,7.993373
3381,Memento,4168,8.1,2000,7.830744
6623,The Prestige,4510,8.0,2006,7.758148
8031,The Dark Knight Rises,9263,7.6,2012,7.494595
6218,Batman Begins,7511,7.5,2005,7.376814
1134,Batman Returns,1706,6.6,1992,6.32518
9024,Batman v Superman: Dawn of Justice,7189,5.7,2016,5.67409
132,Batman Forever,1529,5.2,1995,5.209926
1260,Batman & Robin,1447,4.2,1997,4.441087


In [None]:
improved_recommendations('Mean Girls')

Mean Girls
5207
[3319, 4763, 1329, 6277, 7905, 7332, 6959, 8883, 6698, 7377, 3712, 7494, 5542, 5163, 5092, 1547, 2005, 8844, 5152, 7084, 7436, 7688, 4996, 6449, 390]
                                        title  vote_count  ...  year        wr
1547                       The Breakfast Club        2189  ...  1985  7.377234
390                        Dazed and Confused         588  ...  1993  6.484819
8883                                 The DUFF        1372  ...  2015  6.426293
3712                     The Princess Diaries        1063  ...  2001  6.136129
6277                         Just Like Heaven         595  ...  2005  5.970637
6959                The Spiderwick Chronicles         593  ...  2008  5.854124
4763                            Freaky Friday         919  ...  2003  5.757786
7905                    Mr. Popper's Penguins         775  ...  2011  5.536630
7332               Ghosts of Girlfriends Past         716  ...  2009  5.465987
7494  American Pie Presents: The Book of Lov

Unnamed: 0,title,vote_count,vote_average,year,wr
1547,The Breakfast Club,2189,7.8,1985,7.377234
390,Dazed and Confused,588,7.4,1993,6.484819
8883,The DUFF,1372,6.8,2015,6.426293
3712,The Princess Diaries,1063,6.5,2001,6.136129
6277,Just Like Heaven,595,6.5,2005,5.970637
6959,The Spiderwick Chronicles,593,6.3,2008,5.854124
4763,Freaky Friday,919,6.0,2003,5.757786
7905,Mr. Popper's Penguins,775,5.7,2011,5.53663
7332,Ghosts of Girlfriends Past,716,5.6,2009,5.465987
7494,American Pie Presents: The Book of Love,454,5.1,2009,5.170817


In [None]:
# 협업 필터링(Collaborative Filtering)

In [None]:
# surprise 라이브러리의 Reader
reader = Reader()

In [None]:
ratings = pd.read_csv('./drive/MyDrive/data-files/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [None]:
data = Dataset.load_from_df(ratings[['userId', 'movieId','rating']], reader)
# data.split(n_folds=5)

trainset = data.build_full_trainset()
testset = trainset.build_testset()

In [None]:
type(trainset), type(testset)

(surprise.trainset.Trainset, list)

In [None]:
testset[:5]

[(1, 31, 2.5), (1, 1029, 3.0), (1, 1061, 3.0), (1, 1129, 2.0), (1, 1172, 4.0)]

In [None]:
trainset.n_users, trainset.n_items, trainset.n_ratings, trainset.n_users * trainset.n_items

(671, 9066, 100004, 6083286)

In [None]:
trainset.all_users(), trainset.all_items()

(range(0, 671), range(0, 9066))

In [None]:
[rating for rating in trainset.all_ratings()][:10]

[(0, 0, 2.5),
 (0, 1, 3.0),
 (0, 2, 3.0),
 (0, 3, 2.0),
 (0, 4, 4.0),
 (0, 5, 2.0),
 (0, 6, 2.0),
 (0, 7, 2.0),
 (0, 8, 3.5),
 (0, 9, 2.0)]

In [None]:
from surprise import SVD, KNNBasic

svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f67cc541a50>

In [None]:
predictions = svd.test(testset)

In [None]:
type(predictions)
predictions[:5]

[Prediction(uid=1, iid=31, r_ui=2.5, est=2.3666665905859703, details={'was_impossible': False}),
 Prediction(uid=1, iid=1029, r_ui=3.0, est=2.871774154447293, details={'was_impossible': False}),
 Prediction(uid=1, iid=1061, r_ui=3.0, est=2.579851877131141, details={'was_impossible': False}),
 Prediction(uid=1, iid=1129, r_ui=2.0, est=2.3509526107718712, details={'was_impossible': False}),
 Prediction(uid=1, iid=1172, r_ui=4.0, est=3.581638499079425, details={'was_impossible': False})]

In [None]:
testset[:5]

[(1, 31, 2.5), (1, 1029, 3.0), (1, 1061, 3.0), (1, 1129, 2.0), (1, 1172, 4.0)]

In [None]:
from surprise import accuracy

print( accuracy.rmse(predictions) )
print( accuracy.mae(predictions) )

RMSE: 0.6430
0.6430457395890116
MAE:  0.4975
0.49754456434820526


In [None]:
svd.predict('690', '431')

Prediction(uid='690', iid='431', r_ui=None, est=3.543608255669773, details={'was_impossible': False})

In [None]:
ratings = pd.read_csv('./drive/MyDrive/data-files/ratings1.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [None]:
ratings["rating"].describe()

count    199999.000000
mean          3.566413
std           1.046466
min           0.500000
25%           3.000000
50%           4.000000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [None]:
ratings.to_csv('./drive/MyDrive/data-files/ratings-noh.csv', header=False, index=False)

In [None]:
from surprise import Reader

reader = Reader(line_format="user item rating timestamp", sep=",", rating_scale=(0.5, 5))
data2 = Dataset.load_from_file('./drive/MyDrive/data-files/ratings-noh.csv', reader)

In [None]:
data2

<surprise.dataset.DatasetAutoFolds at 0x7f67cc523fd0>

In [None]:
train_set2, test_set2 = train_test_split(data2, test_size=0.2, random_state=42)

In [None]:
type(train_set2), type(test_set2)

(surprise.trainset.Trainset, list)

In [None]:
train_set2.n_users, train_set2.n_items

(1409, 11923)

In [None]:
svd2 = SVD(n_factors=50, random_state=42)

svd2.fit(train_set2)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f67dfd14c10>

In [None]:
predictions2 = svd2.test(test_set2)

In [None]:
predictions2[:5]

[Prediction(uid='309', iid='587', r_ui=4.0, est=3.759808404039317, details={'was_impossible': False}),
 Prediction(uid='1277', iid='1089', r_ui=4.0, est=4.4615038442561, details={'was_impossible': False}),
 Prediction(uid='847', iid='1088', r_ui=1.0, est=3.1898635272661795, details={'was_impossible': False}),
 Prediction(uid='715', iid='68157', r_ui=5.0, est=3.9024484596387725, details={'was_impossible': False}),
 Prediction(uid='168', iid='165', r_ui=4.0, est=3.5179064473225607, details={'was_impossible': False})]

In [None]:
accuracy.rmse(predictions2), accuracy.mae(predictions2)

RMSE: 0.8687
MAE:  0.6664


(0.8687000057890459, 0.6663514499848416)

In [None]:
svd2.predict(str(150), str(5000))

Prediction(uid='150', iid='5000', r_ui=None, est=3.9193257691920818, details={'was_impossible': False})

In [None]:
movies = pd.read_csv('./drive/MyDrive/data-files/moives.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
all_movies = movies["movieId"].values # 전체 영화 목록
rated_movies = ratings[ratings["userId"] == 1]["movieId"].values # 1번 사용자가 평가한 영화 목록

not_rated_movies = [ movie for movie in all_movies if movie not in rated_movies ] # 1번 사용자가 평가하지 않은 영화 목록

In [None]:
len(all_movies), len(rated_movies), len(not_rated_movies)

(62423, 70, 62353)

In [None]:
predictions3 = [ svd2.predict(str(1), str(movie)) for movie in not_rated_movies ]

In [None]:
predictions3[:5]

[Prediction(uid='1', iid='1', r_ui=None, est=3.8971332966415853, details={'was_impossible': False}),
 Prediction(uid='1', iid='2', r_ui=None, est=3.348081946505535, details={'was_impossible': False}),
 Prediction(uid='1', iid='3', r_ui=None, est=3.195987181858515, details={'was_impossible': False}),
 Prediction(uid='1', iid='4', r_ui=None, est=3.1863771565983634, details={'was_impossible': False}),
 Prediction(uid='1', iid='5', r_ui=None, est=3.175170888991397, details={'was_impossible': False})]

In [None]:
predictions3[0].est

3.8971332966415853

In [None]:
predictions3.sort(key=lambda p: p.est, reverse=True)

In [None]:
import pandas as pd
import numpy as np

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor
from surprise import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import BaselineOnly, CoClustering
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy

In [None]:
top_10_rated_predictions = predictions3[:10]

In [None]:
recommendations = [ (p.est, movies[movies["movieId"] == int(p.iid)]["title"].values[0]) for p in top_10_rated_predictions ]
recommendations

[(4.595704903858456, 'Shawshank Redemption, The (1994)'),
 (4.574950740087123, "One Flew Over the Cuckoo's Nest (1975)"),
 (4.537284595861249, 'Usual Suspects, The (1995)'),
 (4.519412310732018, 'Princess Mononoke (Mononoke-hime) (1997)'),
 (4.512932172433042, 'Goodfellas (1990)'),
 (4.492102949337958, 'When We Were Kings (1996)'),
 (4.480883192471522, 'Fargo (1996)'),
 (4.479734928816546, '12 Angry Men (1957)'),
 (4.469674008385341, 'Life Is Beautiful (La Vita è bella) (1997)'),
 (4.467763776180294, 'Planet Earth II (2016)')]

In [None]:
# 하이브리드 추천 시스템(Hybrid Recommender)

In [None]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
65,1,27193,3.0,1147879774
66,1,27266,4.5,1147879365
67,1,27721,3.0,1147869115
68,1,31956,3.5,1147877610


In [None]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.5898351099477734, details={'was_impossible': False})

In [None]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [None]:
id_map = pd.read_csv('./drive/MyDrive/data-files/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [None]:
indices_map = id_map.set_index('id')

In [None]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title','vote_count','vote_average','year','id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [None]:
hybrid(1, 'Avatar')

Unnamed: 0,title,vote_count,vote_average,year,id,est
974,Aliens,3282.0,7.7,1986,679,3.112912
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,3.073724
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,3.004243
1011,The Terminator,4208.0,7.4,1984,218,2.987423
1621,Darby O'Gill and the Little People,35.0,6.7,1959,18887,2.972054
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,2.807093
922,The Abyss,822.0,7.1,1989,2756,2.785464
1668,Return from Witch Mountain,38.0,5.6,1978,14822,2.752883
2014,Fantastic Planet,140.0,7.6,1973,16306,2.724975
7265,Dragonball Evolution,475.0,2.9,2009,14164,2.608125


In [None]:
hybrid(500, 'Avatar')

Unnamed: 0,title,vote_count,vote_average,year,id,est
974,Aliens,3282.0,7.7,1986,679,3.578726
1011,The Terminator,4208.0,7.4,1984,218,3.447153
2014,Fantastic Planet,140.0,7.6,1973,16306,3.296441
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,3.255346
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,3.146035
1376,Titanic,7770.0,7.5,1997,597,3.102066
344,True Lies,1138.0,6.8,1994,36955,3.05487
1621,Darby O'Gill and the Little People,35.0,6.7,1959,18887,3.040007
831,Escape to Witch Mountain,60.0,6.5,1975,14821,3.017434
7265,Dragonball Evolution,475.0,2.9,2009,14164,3.014873
