# 0. 데이터 불러오기
- 컨텐츠
    - 데이터: animes (16214개) = anime.csv (17562개) + anime_with_synopsis.csv (16214개) 
- 유저
    - 데이터: animelist.csv (유저 325770명, 애니메이션 17562개, 평가 62M개)

In [None]:
INPUT_DIR = '/kaggle/input/anime-recommendation-database-2020'
#!ls {INPUT_DIR}

import pandas as pd
import warnings

# Both filters are registered; the net effect is that warnings are ignored.
warnings.filterwarnings("always")
warnings.filterwarnings(action='ignore')

# 1) Anime dataset: anime.csv joined with its synopsis text on MAL_ID.
anime = pd.read_csv(f'{INPUT_DIR}/anime.csv')
anime_with_synopsis = pd.read_csv(
    f'{INPUT_DIR}/anime_with_synopsis.csv',
    usecols=["MAL_ID", "sypnopsis"],
)
animes = anime.merge(anime_with_synopsis, on='MAL_ID')
print('number of animes: ', len(animes.MAL_ID.unique()))

# 2) User rating dataset (rating is 0 if the user didn't assign a score).
animelist = pd.read_csv(
    f'{INPUT_DIR}/animelist.csv',
    usecols=["user_id", "anime_id", "rating"],
)
# Keep only users with at least 5000 entries in animelist.csv.
n_ratings = animelist['user_id'].value_counts()
heavy_users = n_ratings[n_ratings >= 5000]
rating_df = animelist[animelist['user_id'].isin(heavy_users.index)]
print('number of users: ', len(rating_df.user_id.unique()))

# Missing values: drop every row that contains a NaN.
animes = animes.dropna()
rating_df = rating_df.dropna()

""" 중복값 처리 """

def remove_duplicated_rows(df):
    """Drop fully duplicated rows from ``df`` in place and report the counts.

    The previous version rebound the local name ``df`` to a filtered frame
    and returned nothing, so the caller's DataFrame was never changed.
    Duplicates are now removed from the passed object itself, which makes
    call sites that ignore the return value effective as well.

    Args:
        df: pandas DataFrame to de-duplicate. It is modified in place.

    Returns:
        The same DataFrame object, so the result can also be reassigned.
    """
    print('dataframe')
    n_duplicates = df.duplicated().sum()
    if n_duplicates > 0:
        print('> {} duplicates'.format(n_duplicates))
        # Keeps the first occurrence of each row, like ``df[~df.duplicated()]``.
        df.drop_duplicates(inplace=True)
    print('> {} duplicates'.format(df.duplicated().sum()))
    return df

# Run the duplicate check on both tables. The return value is not captured,
# so any lasting effect depends on remove_duplicated_rows changing its
# argument.
for frame in (animes, rating_df):
    remove_duplicated_rows(frame)

# 0. 영호형 데이터 불러오기

In [None]:
"""import pandas as pd
import warnings; warnings.filterwarnings("always"); warnings.filterwarnings(action='ignore')

animes = pd.read_feather('../input/anime-data-save-as-feather/animes.feather')
animes = animes.rename({'anime_id': 'MAL_ID'}, axis='columns')
print(animes.shape)

rating_df = pd.read_feather('../input/anime-data-save-as-feather/rating_df.feather')
print(rating_df.shape)

animes.sample()"""

# 1. 데이터셋 생성

## 1-1. feature matrix (ani_df)

In [None]:
""" 사용할 feature 선택 """

# Columns kept as candidate features. 'Genders' and 'sypnopsis' are the
# column names as they appear in the source tables.
feature_columns = [
    'MAL_ID', 'Name', 'Genders', 'Type', 'Episodes', 'Aired',
    'Source', 'Duration', 'Rating', 'Ranked', 'sypnopsis',
]
# Take an explicit copy: columns of ani_df are assigned later on, and doing
# that on a selection of `animes` is ambiguous (chained assignment).
ani_df = animes[feature_columns].copy()
print(ani_df.shape)
ani_df.sample(3)

### 1-1-1. 결측치('Unknown') 처리
- 사용 feature: 모두
    - sypnopsis: "No synopsis information ..." & "No synopsis has ..." 삭제
    - Source: 최빈값 대치
    - 나머지: 삭제

In [None]:
""" 결측치 갯수 """
# Per-column number of cells equal to the placeholder string 'Unknown'.
(ani_df == 'Unknown').sum()

In [None]:
"""
1) sypnopsis 
"""     
# Drop rows whose synopsis contains the "No synopsis" placeholder text.
# The filter used to be evaluated without assigning the result, so no row
# was ever removed. The copy makes the column assignment below unambiguous.
has_synopsis = ~ani_df['sypnopsis'].str.contains("No synopsis", regex=False, na=False)
ani_df = ani_df[has_synopsis].copy()

# 2) Source: replace the 'Unknown' placeholder with 'Original'.
# Author's note: 'Original' already has ~5000 rows, so this imputation may be
# too aggressive.
ani_df['Source'] = ani_df['Source'].replace("Unknown", 'Original')

# 3) Everything else: drop rows that still hold 'Unknown' in any of these
# columns. 'Source' needs no check because it was imputed just above.
required_columns = ['Genders', 'Type', 'Episodes', 'Aired', 'Duration', 'Rating', 'Ranked']
ani_df = ani_df[(ani_df[required_columns] != 'Unknown').all(axis=1)]
print('remaining: ', ani_df.shape)

# Renumber the rows from 0; with the default drop=False the previous index is
# kept as a column named 'index'.
ani_df = ani_df.reset_index()

In [None]:
""" 결측치 갯수 """
# Re-check: number of remaining 'Unknown' placeholders in each column.
ani_df.eq('Unknown').sum()

### 1-1-2. 전처리 (numeric)
- 사용 feature: Aired, Duration, Ranked

In [None]:
""" 
1) Aired: 날짜 형식 통일 (e.g., Spe 1, 2001 => 2001) 
"""

def _first_year(aired):
    """Return the first 4-digit, all-numeric token of ``aired`` as a float.

    Tokens are whitespace-separated (e.g. 'Sep 1, 2001' -> 2001.0). When no
    such token exists 0.0 is returned, the same value the 'Unknown' year
    placeholder was previously mapped to.
    """
    for token in aired.split():
        if len(token) == 4 and token.isdigit():
            return float(token)
    return 0.0


# Minutes represented by one unit of each marker token in a Duration string.
# 'sec.' used to be ignored, which turned sub-minute durations into 0.
# NOTE(review): assumes seconds are spelled 'sec.' in the data - confirm.
_MINUTES_PER_UNIT = {'hr.': 60.0, 'min.': 1.0, 'sec.': 1.0 / 60.0}


def _duration_in_minutes(duration):
    """Convert a Duration string to minutes (e.g. '1 hr. 55 min.' -> 115.0).

    For every known unit marker the token right before its first occurrence
    is read as the amount. Units that do not occur contribute 0.

    Raises:
        ValueError: if the token in front of a unit marker is not a number.
    """
    tokens = duration.split()  # e.g. ['24', 'min.', 'per', 'ep.']
    total = 0.0
    for unit, minutes in _MINUTES_PER_UNIT.items():
        if unit in tokens:
            position = tokens.index(unit)
            # A marker in first position has no amount in front of it; the
            # old index arithmetic wrapped around to the last token here.
            if position > 0:
                total += float(tokens[position - 1]) * minutes
    return total


# 1) Aired: keep only the year (e.g. 'Sep 1, 2001' => 2001) as 'date'.
# The month was parsed before as well but dropped again without being used.
ani_df['date'] = ani_df['Aired'].map(_first_year)

# 2) Duration: express every duration in minutes.
ani_df['duration'] = ani_df['Duration'].map(_duration_in_minutes)

# 3) Ranked: string -> float. A value that cannot be parsed raises here
# instead of being swallowed by a bare except.
ani_df['Ranked'] = pd.to_numeric(ani_df['Ranked'].replace("Unknown", 0)).astype(float)

# Output: the raw text columns are no longer needed.
ani_df = ani_df.drop(['Aired', 'Duration'], axis=1)
print(ani_df.shape)
ani_df.head()

### 1-1-3. 전처리 (text) 
- 사용 feature: Genders, sypnopsis, Type, Source, Rating
    - BoW: Genders
    - TF-IDF: sypnopsis
    - OneHotEncoding: Type, Source, Rating

In [None]:
""" 
1) BoW: Genders 
"""

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the 'Genders' strings using the vectorizer's
# default tokenization.
count_vector = CountVectorizer()
doc_array = count_vector.fit_transform(ani_df['Genders']).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older installations.
if hasattr(count_vector, 'get_feature_names_out'):
    genre_tokens = count_vector.get_feature_names_out()
else:
    genre_tokens = count_vector.get_feature_names()

frequency_matrix = pd.DataFrame(doc_array, columns=genre_tokens)
print('BoW of Genders shape: ', frequency_matrix.shape)

""" 
2) tf-idf: sypnopsis 
"""

#ani_df['sypnopsis'] = ani_df['sypnopsis'].map(lambda x: ','.join(x))

def process_multilabel(series):
    """Split a string on single spaces and drop the first 'Unknown' token.

    Args:
        series: the text to split (a single string, despite the name).

    Returns:
        The list of tokens. Only the first token equal to "Unknown" is
        removed; later occurrences are kept.
    """
    tokens = series.split(" ")
    try:
        tokens.remove("Unknown")
    except ValueError:
        # No "Unknown" token present: nothing to drop.
        pass
    return tokens
# Tokenize each synopsis; the column holds lists of tokens from here on.
ani_df["sypnopsis"] = ani_df["sypnopsis"].map(process_multilabel)

from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(min_df=3,  max_features=100,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')
# Rebuild plain text from the token lists. Converting the lists with
# astype(str) fed the vectorizer the Python list repr, so escape sequences
# (e.g. a literal backslash-n for a line break) leaked into the vocabulary.
sypnopsis_original = ani_df['sypnopsis'].map(' '.join)
sypnopsis_vector_tf_idf = tfv.fit_transform(sypnopsis_original)
sypnopsis_matrix = pd.DataFrame(sypnopsis_vector_tf_idf.toarray())
print("tf-idf of sypnopsis shape:", sypnopsis_vector_tf_idf.shape)

""" 
3) OneHotEncoding: Type, Source, Rating 
"""

from sklearn.preprocessing import OneHotEncoder


def _one_hot_frame(column):
    """One-hot encode a single column and return the result as a DataFrame.

    Args:
        column: pandas Series holding the categorical values.

    Returns:
        DataFrame with one indicator column per category. Column names are
        the encoder's generated feature names (prefix 'x0_' because the
        encoder is fitted on a single unnamed feature).
    """
    values = column.values.reshape(-1, 1)
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(values).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on installations that predate get_feature_names_out().
    if hasattr(encoder, 'get_feature_names_out'):
        names = encoder.get_feature_names_out()
    else:
        names = encoder.get_feature_names()
    return pd.DataFrame(encoded, columns=names)


type_ohe = _one_hot_frame(ani_df['Type'])
source_ohe = _one_hot_frame(ani_df['Source'])
rating_ohe = _one_hot_frame(ani_df['Rating'])


## 1-2. score matrix (score_df)

In [None]:
n_animes = len(rating_df.anime_id.unique())
n_users = len(rating_df.user_id.unique())

print('animes: ', n_animes)
print('users: ', n_users)
print('\n', '{0}명의 유저가 {1}개의 영화를 평가한 matrix 생성'.format(n_users, n_animes), '\n')

import numpy as np

# user x anime matrix of ratings; pairs that never occur stay NaN.
# The former inner joins with the per-user and per-anime rating counts were
# dropped: every row of rating_df has a match in both count tables, so they
# filtered nothing and only appended count columns (the second one under the
# already used label 'rating_r').
score_df = pd.crosstab(
    rating_df.user_id,
    rating_df.anime_id,
    rating_df.rating,
    aggfunc='sum',
)
score_df

## 1-3. user matrix (user_df)
- ani_df + score_df

In [None]:
all_users_df = []

user_id_list = list(score_df.index)

for user_id in user_id_list:
    # This user's row of the score matrix, reduced to animes with a score.
    # A score of 0 means "not rated", so those entries are removed as well.
    score_vector = score_df.loc[user_id].dropna()
    score_vector = score_vector[score_vector != 0]

    # Two-column frame: the user's score and the anime id it belongs to.
    score_vector_df = pd.DataFrame({
        'score_by_user_{}'.format(user_id): score_vector.values,
        'MAL_ID': score_vector.index,
    })

    # The inner merge keeps exactly the animes present on both sides, so the
    # separate isin() pre-filter of ani_df was redundant and is gone.
    user_df = pd.merge(ani_df, score_vector_df, how='inner', on='MAL_ID')

    all_users_df.append(user_df)

n_user_frames = len(all_users_df)
print('users: ', n_user_frames)
print('\n', '{0}명의 유저에 대해 아래 matrix를 각각 생성'.format(n_user_frames), '\n')
all_users_df[0]

In [None]:
""" 최종 사용할 데이터 수 """

# Number of per-user frames and the mean number of rows per frame.
n_user_frames = len(all_users_df)
print('유저 수: ', n_user_frames)

num = sum(len(user_frame) for user_frame in all_users_df)

print('유저 1명 당 평균 평가 갯수: ', round(num / n_user_frames, 1))

# 2. 데이터셋 저장

## 2-1. vector space model
- 사용 feature: Genders, sypnopsis

In [None]:
# Genre counts and synopsis tf-idf values side by side, aligned on the index.
combined_matrix = frequency_matrix.join(sypnopsis_matrix, how='left')

# Attach the features to ani_df, then remove the raw columns from 'Name'
# through 'duration' together with the 'index' column.
data = ani_df.join(combined_matrix, how='left')
raw_columns = list(data.loc[:, 'Name':'duration'].columns)
data = data.drop(columns=raw_columns + ['index'])
data

## 2-2. prediction model
- 사용 feature: Episodes, Ranked, date, duration, Type, Source, Rating

In [None]:
from sklearn.preprocessing import MinMaxScaler

numeric_columns = ['Episodes', 'Ranked', 'date', 'duration']

# Explicit copy: the columns are overwritten below, which must not be done on
# a selection of ani_df.
data = ani_df[['MAL_ID'] + numeric_columns].copy()

# Make sure every column is numeric before scaling (values read as text are
# converted; anything unparsable raises instead of passing through).
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)

# MinMaxScaler scales each column independently, so one scaler over all four
# columns gives the same result as one scaler per column.
data[numeric_columns] = MinMaxScaler().fit_transform(data[numeric_columns])

# DataFrame.rename ignores labels that are absent by default, so no
# try/except is needed here.
# NOTE(review): presumably the rename avoids a clash with an 'x0_Music' column
# in source_ohe, which would make the joins below fail on overlapping names -
# confirm.
type_ohe.rename(columns={'x0_Music': 'x0_Music_'}, inplace=True)

data = data.join(type_ohe, how='left')
data = data.join(source_ohe, how='left')
data = data.join(rating_ohe, how='left')
data