# 01. Data Load

- MovieLens 데이터셋을 다운로드하고 전처리한 후, 데이터프레임을 CSV 파일로 저장

> Load the MovieLens data

In [2]:
# 기본 import
import pandas as pd

In [2]:
# Download the MovieLens 100k archive and unpack it into the working directory.
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
# Open the archive in a `with` block so the underlying file handle is closed
# once extraction and the summary read are done (the handle used to stay open).
with zipfile.ZipFile('movielens.zip', "r") as zip_ref:
    zip_ref.extractall()
    print("Done. Dataset contains:")
    # read() returns the raw bytes of ml-100k/u.info; they are printed as-is.
    print(zip_ref.read('ml-100k/u.info'))

Downloading movielens data...
Done. Dataset contains:
b'943 users\n1682 items\n100000 ratings\n'


In [4]:
# Load the user table: a pipe-delimited file whose column names are supplied here.
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users_read_options = dict(sep='|', names=users_cols, encoding='latin-1')
users = pd.read_csv('ml-100k/u.user', **users_read_options)
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
# Load the ratings table: a tab-delimited file whose column names are supplied here.
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_read_options = dict(sep='\t', names=ratings_cols, encoding='latin-1')
ratings = pd.read_csv('ml-100k/u.data', **ratings_read_options)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
# Load the movie table: five descriptive fields followed by one column per genre.
genre_cols = [
    "genre_unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

# Full column list = descriptive fields + genre columns, in file order.
movies_cols = ['movie_id', 'title', 'release_date', "video_release_date", "imdb_url", *genre_cols]

movies_read_options = dict(sep='|', names=movies_cols, encoding='latin-1')
movies = pd.read_csv('ml-100k/u.item', **movies_read_options)

movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


> 데이터 전처리

- users

In [7]:
# Inspect column dtypes and non-null counts of the user table.
users.info()

# No null values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     943 non-null    int64 
 1   age         943 non-null    int64 
 2   sex         943 non-null    object
 3   occupation  943 non-null    object
 4   zip_code    943 non-null    object
dtypes: int64(2), object(3)
memory usage: 37.0+ KB


In [8]:
# Summary statistics for the numeric columns of the user table.
users.describe()

Unnamed: 0,user_id,age
count,943.0,943.0
mean,472.0,34.051962
std,272.364951,12.19274
min,1.0,7.0
25%,236.5,25.0
50%,472.0,31.0
75%,707.5,43.0
max,943.0,73.0


In [9]:
# Number of users per value of the 'sex' column.
users['sex'].value_counts()

M    670
F    273
Name: sex, dtype: int64

In [10]:
# Number of users per occupation.
users['occupation'].value_counts()

student          196
other            105
educator          95
administrator     79
engineer          67
programmer        66
librarian         51
writer            45
executive         32
scientist         31
artist            28
technician        27
marketing         26
entertainment     18
healthcare        16
retired           14
lawyer            12
salesman          12
none               9
homemaker          7
doctor             7
Name: occupation, dtype: int64

In [11]:
# Drop the zip_code column.
# Reassigning the result (instead of inplace=True) and passing errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
users = users.drop(columns='zip_code', errors='ignore')
users.head()

Unnamed: 0,user_id,age,sex,occupation
0,1,24,M,technician
1,2,53,F,other
2,3,23,M,writer
3,4,24,M,technician
4,5,33,F,other


- ratings

In [12]:
# Inspect column dtypes and non-null counts of the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   user_id         100000 non-null  int64
 1   movie_id        100000 non-null  int64
 2   rating          100000 non-null  int64
 3   unix_timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [13]:
# Summary statistics for the ratings table.
ratings.describe()

# rating: values range from 1 to 5.

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [14]:
# Drop the unix_timestamp column.
# Reassigning the result (instead of inplace=True) and passing errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
ratings = ratings.drop(columns='unix_timestamp', errors='ignore')
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [15]:
# Number of ratings per rating value.
ratings['rating'].value_counts()

4    34174
3    27145
5    21201
2    11370
1     6110
Name: rating, dtype: int64

- movies

In [16]:
# Inspect column dtypes and non-null counts of the movie table.
movies.info()

# Plan: drop the video_release_date column.
# Plan: build an all_genres column from the individual genre columns, then drop those columns.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            1682 non-null   int64  
 1   title               1682 non-null   object 
 2   release_date        1681 non-null   object 
 3   video_release_date  0 non-null      float64
 4   imdb_url            1679 non-null   object 
 5   genre_unknown       1682 non-null   int64  
 6   Action              1682 non-null   int64  
 7   Adventure           1682 non-null   int64  
 8   Animation           1682 non-null   int64  
 9   Children            1682 non-null   int64  
 10  Comedy              1682 non-null   int64  
 11  Crime               1682 non-null   int64  
 12  Documentary         1682 non-null   int64  
 13  Drama               1682 non-null   int64  
 14  Fantasy             1682 non-null   int64  
 15  Film-Noir           1682 non-null   int64  
 16  Horror

In [17]:
# Summary statistics for the numeric columns of the movie table.
movies.describe()

Unnamed: 0,movie_id,video_release_date,genre_unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
count,1682.0,0.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,...,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0
mean,841.5,,0.001189,0.149227,0.080262,0.02497,0.072533,0.300238,0.064804,0.029727,...,0.01308,0.014269,0.054697,0.033294,0.036266,0.146849,0.060048,0.149227,0.042212,0.016052
std,485.695893,,0.034473,0.356418,0.271779,0.156081,0.259445,0.458498,0.246253,0.169882,...,0.11365,0.118632,0.227455,0.179456,0.187008,0.354061,0.237646,0.356418,0.201131,0.125714
min,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,421.25,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,841.5,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1261.75,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1682.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
# Drop the video_release_date column.
# Reassigning the result (instead of inplace=True) and passing errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
movies = movies.drop(columns='video_release_date', errors='ignore')
movies.head(2)

Unnamed: 0,movie_id,title,release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [19]:
# Build the all_genres column.
def get_all_genres(gs):
    """Join the names of the genres flagged with 1 into a '-'-separated string.

    `gs` is iterated in parallel with the module-level `genre_cols`; a genre
    name is kept when its paired flag equals 1. Returns 'Other' when no flag
    is set.
    """
    selected = []
    for name, flag in zip(genre_cols, gs):
        if flag == 1:
            selected.append(name)
    return '-'.join(selected) if selected else 'Other'

# Walk the genre columns in parallel so each step yields one movie's flags
# (in genre_cols order), and label that movie with get_all_genres.
genre_flag_columns = [movies[name] for name in genre_cols]
all_genre_labels = []
for movie_flags in zip(*genre_flag_columns):
    all_genre_labels.append(get_all_genres(movie_flags))
movies['all_genres'] = all_genre_labels
movies.head(2)

Unnamed: 0,movie_id,title,release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,all_genres
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,Animation-Children-Comedy
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,Action-Adventure-Thriller


In [20]:
# Shift every ID down by one so IDs start at 0, storing them as strings,
# and cast the rating column to float.
# NOTE: not re-runnable as written — after the first run the ID columns hold
# strings, so subtracting 1 from them again fails.
users["user_id"] = (users["user_id"] - 1).astype(str)
movies["movie_id"] = (movies["movie_id"] - 1).astype(str)
ratings["movie_id"] = (ratings["movie_id"] - 1).astype(str)
ratings["user_id"] = (ratings["user_id"] - 1).astype(str)
ratings["rating"] = ratings["rating"].astype(float)

- Create movielens df

In [21]:
# Create one merged DataFrame containing all the movielens data.
# validate='many_to_one' makes pandas raise MergeError if movie_id (in movies)
# or user_id (in users) is not unique, instead of silently duplicating rating rows.
movielens = (
    ratings
    .merge(movies, on='movie_id', validate='many_to_one')
    .merge(users, on='user_id', validate='many_to_one')
)
movielens.head()

Unnamed: 0,user_id,movie_id,rating,title,release_date,imdb_url,genre_unknown,Action,Adventure,Animation,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,all_genres,age,sex,occupation
0,195,241,3.0,Kolya (1996),24-Jan-1997,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,0,0,...,0,0,0,0,0,0,Comedy,49,M,writer
1,195,256,2.0,Men in Black (1997),04-Jul-1997,http://us.imdb.com/M/title-exact?Men+in+Black+...,0,1,1,0,...,0,0,1,0,0,0,Action-Adventure-Comedy-Sci-Fi,49,M,writer
2,195,110,4.0,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,http://us.imdb.com/M/title-exact?Truth%20About...,0,0,0,0,...,0,1,0,0,0,0,Comedy-Romance,49,M,writer
3,195,24,4.0,"Birdcage, The (1996)",08-Mar-1996,"http://us.imdb.com/M/title-exact?Birdcage,%20T...",0,0,0,0,...,0,0,0,0,0,0,Comedy,49,M,writer
4,195,381,4.0,"Adventures of Priscilla, Queen of the Desert, ...",01-Jan-1994,http://us.imdb.com/M/title-exact?Adventures%20...,0,0,0,0,...,0,0,0,0,0,0,Comedy-Drama,49,M,writer


In [22]:
# Show the first movie row to check the table after the ID shift and merge.
movies.head(1)

Unnamed: 0,movie_id,title,release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,all_genres
0,0,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,Animation-Children-Comedy


In [25]:
# Drop the genre columns from movies: every column outside the keep-list
# below is removed in place.
keep_cols = ['movie_id', 'title', 'release_date', 'imdb_url', 'all_genres']
drop_cols = [col for col in movies.columns if col not in keep_cols]
movies.drop(columns=drop_cols, inplace=True)

> csv 저장

In [27]:
# Save the four DataFrames as CSV files under ../data (row index omitted).
# Create the output directory first: to_csv does not create missing parent
# directories, so the writes would fail if ../data did not exist yet.
import os
os.makedirs('../data', exist_ok=True)
users.to_csv('../data/users.csv', index=False)
ratings.to_csv('../data/ratings.csv', index=False)
movies.to_csv('../data/movies.csv', index=False)
movielens.to_csv('../data/movielens.csv', index=False)

In [None]:
# Verify the saved CSV: reload users from disk and inspect its schema.
users = pd.read_csv('../data/users.csv')
users.info()

In [2]:
# pandas is imported again here; this is only needed when the cell is run in a
# kernel where the import cell at the top of the notebook has not been executed.
import pandas as pd
# Reload ratings from disk and list the distinct rating values.
ratings = pd.read_csv('../data/ratings.csv')
ratings['rating'].unique()

array([3., 1., 2., 4., 5.])

In [30]:
# Reload movies from disk and inspect its schema.
# read_csv infers dtypes from the file contents, so columns may come back with
# a different dtype than they had when saved.
movies = pd.read_csv('../data/movies.csv')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movie_id      1682 non-null   int64 
 1   title         1682 non-null   object
 2   release_date  1681 non-null   object
 3   imdb_url      1679 non-null   object
 4   all_genres    1682 non-null   object
dtypes: int64(1), object(4)
memory usage: 65.8+ KB


In [31]:
# Reload the merged movielens table from disk and inspect its schema.
movielens = pd.read_csv('../data/movielens.csv')
movielens.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 29 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   user_id        100000 non-null  int64  
 1   movie_id       100000 non-null  int64  
 2   rating         100000 non-null  float64
 3   title          100000 non-null  object 
 4   release_date   99991 non-null   object 
 5   imdb_url       99987 non-null   object 
 6   genre_unknown  100000 non-null  int64  
 7   Action         100000 non-null  int64  
 8   Adventure      100000 non-null  int64  
 9   Animation      100000 non-null  int64  
 10  Children       100000 non-null  int64  
 11  Comedy         100000 non-null  int64  
 12  Crime          100000 non-null  int64  
 13  Documentary    100000 non-null  int64  
 14  Drama          100000 non-null  int64  
 15  Fantasy        100000 non-null  int64  
 16  Film-Noir      100000 non-null  int64  
 17  Horror         100000 non-null

---