# 1. CSV 데이터 가져오기

* movie_df : 영화에 대한 정보
* genre_df : 영화 별 장르에 대한 정보
* rating_df : 영화에 대한 별점 정보
* 데이터 출처 - grouplens movielens dataset

In [None]:
$pip install tensorflow
$pip install matplotlib
$pip install pandas

In [11]:
# 필요한 라이브러리 가져오기
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from tensorflow.keras.utils import get_file

def display_posters(movieId_list, columns=5):
    """Download the poster of each movie and show the posters in a grid.

    Args:
        movieId_list: Iterable of movie ids (list, tuple, pandas Series, ...).
            Each id is substituted into the poster URL and used as the title
            of its subplot.
        columns: Number of posters per row. Defaults to 5.

    Returns:
        None. The posters are drawn on a new matplotlib figure. Nothing is
        drawn when ``movieId_list`` is empty.

    Raises:
        ValueError: If ``columns`` is smaller than 1.
    """
    if columns < 1:
        raise ValueError("columns must be at least 1")

    # Materialize once: any iterable is accepted, and positions in the grid
    # never go through label-based indexing (e.g. on a pandas Series).
    movie_ids = list(movieId_list)
    if not movie_ids:
        # Nothing to draw, and a grid with zero rows cannot be created.
        return

    import requests
    from io import BytesIO
    from PIL import Image

    def get_poster(movieId):
        """Return the poster as an array, or a black placeholder on failure."""
        url = "https://pai-datasets.s3.ap-northeast-2.amazonaws.com/recommender_systems/movielens/img/POSTER_20M_FULL/{}.jpg".format(movieId)
        try:
            # The timeout keeps one unreachable poster from hanging the cell.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            img = np.asarray(Image.open(BytesIO(response.content)))
        except Exception:
            # Best effort: a missing or unreadable poster must not abort the
            # whole grid. Unlike a bare `except`, Ctrl-C still interrupts.
            img = np.zeros((200,100,3))
        return img

    images = [get_poster(movie_id) for movie_id in movie_ids]

    # Ceiling division: the number of rows needed to hold every poster.
    rows = (len(movie_ids) - 1) // columns + 1
    # squeeze=False always yields a 2-D array of axes, even for a 1x1 grid.
    fig, axes = plt.subplots(rows, columns, squeeze=False)
    axes = axes.flatten()
    fig.set_size_inches((20, rows * 5))

    for ax, movie_id, img in zip(axes, movie_ids, images):
        ax.set_title(movie_id)
        ax.imshow(img)

    # Hide the cells of the last row that received no poster.
    for ax in axes[len(images):]:
        ax.axis("off")

    plt.tight_layout()

In [12]:
# Base URL of the MovieLens CSV files used in this notebook.
ROOT_URL = "https://pai-datasets.s3.ap-northeast-2.amazonaws.com/recommender_systems/movielens/datasets"

# URLs always use "/" as the separator, so they are built by plain string
# concatenation; os.path.join would insert the OS path separator ("\" on
# Windows). Each get_file result is handed to pd.read_csv as the file to load.
movie_path = get_file("movies.csv", ROOT_URL + "/movies.csv")
movie_df = pd.read_csv(movie_path)

genre_path = get_file("genres.csv", ROOT_URL + "/genres.csv")
genre_df = pd.read_csv(genre_path)

rating_path = get_file("ratings.csv", ROOT_URL + "/ratings.csv")
rating_df = pd.read_csv(rating_path)

Downloading data from https://pai-datasets.s3.ap-northeast-2.amazonaws.com/recommender_systems/movielens/datasets/movies.csv
Downloading data from https://pai-datasets.s3.ap-northeast-2.amazonaws.com/recommender_systems/movielens/datasets/genres.csv
Downloading data from https://pai-datasets.s3.ap-northeast-2.amazonaws.com/recommender_systems/movielens/datasets/ratings.csv


# 2. Movie-Lens 데이터베이스의 테이블 확인하기

## (1) movie dataframe 확인하기

데이터 일부 가져오기

In [13]:
# Show the movie table (the notebook renders a truncated preview).
movie_df

Unnamed: 0,id,title,release_year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995
...,...,...,...
27273,131254,Kein Bund für's Leben,2007
27274,131256,"Feuer, Eis & Dosenbier",2002
27275,131258,The Pirates,2014
27276,131260,Rentun Ruusu,2001


전체 영화 개수(고유 id 수) 확인하기

In [14]:
# Number of distinct movie ids.
movie_df["id"].nunique()

27278

In [19]:
# Number of distinct movie titles.
movie_df["title"].nunique()

26213

In [21]:
# Per-row flag: True when the same title already appeared in an earlier row,
# False otherwise (with pandas' default, the first occurrence stays False).
movie_df["title"].duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
27273    False
27274    False
27275    False
27276    False
27277     True
Name: title, Length: 27278, dtype: bool

In [24]:
# Every movie whose title occurs more than once (keep=False flags all
# copies, including the first), ordered by title.
movie_df.loc[movie_df["title"].duplicated(keep=False)].sort_values(by="title")

Unnamed: 0,id,title,release_year
26816,128878,1,2014
23829,113214,1,2013
1178,1203,12 Angry Men,1957
15253,77846,12 Angry Men,1997
13045,62383,"20,000 Leagues Under the Sea",1916
...,...,...,...
22698,108692,Yu-Gi-Oh!,1999
11692,51540,Zodiac,2007
26347,126579,Zodiac,2014
5800,5899,Zulu,1964


In [28]:
# Earliest release year in the movie table.
movie_df["release_year"].min()

1891

In [29]:
# Latest release year in the movie table.
movie_df["release_year"].max()

2015

## (2) rating dataframe 확인하기

In [30]:
# Show the rating table (the notebook renders a truncated preview).
rating_df

Unnamed: 0,user_id,movie_id,rating,rated_at
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
...,...,...,...,...
20000258,138493,68954,4.5,1258126920
20000259,138493,69526,4.5,1259865108
20000260,138493,69644,3.0,1260209457
20000261,138493,70286,5.0,1258126944


In [32]:
# Number of distinct users in the rating table.
rating_df["user_id"].nunique()

138493

In [33]:
# Number of distinct movies that appear in the rating table.
rating_df["movie_id"].nunique()

26744

In [34]:
# Lowest rating value in the rating table.
rating_df["rating"].min()

0.5

In [35]:
# Highest rating value in the rating table.
rating_df["rating"].max()

5.0

## (3) genre dataframe 확인하기

In [36]:
# Show the genre table (the notebook renders a truncated preview).
genre_df

Unnamed: 0,movie_id,genre
0,1,Adventure
1,1,Animation
2,1,Children
3,1,Comedy
4,1,Fantasy
...,...,...
54401,131258,Adventure
54402,131260,(no genres listed)
54403,131262,Adventure
54404,131262,Fantasy


In [38]:
# Distinct genre labels present in the genre table.
genre_df["genre"].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'War', 'Musical',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [39]:
# Number of distinct genre labels.
genre_df["genre"].nunique()

20

In [41]:
# Number of rows per genre, most frequent genre first.
genre_df["genre"].value_counts()

Drama                 13344
Comedy                 8374
Thriller               4178
Romance                4127
Action                 3520
Crime                  2939
Horror                 2611
Documentary            2471
Adventure              2329
Sci-Fi                 1743
Mystery                1514
Fantasy                1412
War                    1194
Children               1139
Musical                1036
Animation              1027
Western                 676
Film-Noir               330
(no genres listed)      246
IMAX                    196
Name: genre, dtype: int64

# 3. 데이터 탐색