In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
import string
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# load all dataset
anime_df_merged = pd.read_csv('./preprocessed_dataset/anime_df_merged.csv')
anime_with_synopsis_df = pd.read_csv('./preprocessed_dataset/anime_with_synopsis.csv')
rating_df_lite_version = pd.read_csv('./preprocessed_dataset/rating_df_lite_version.csv')

# Data Preprocessing and Preparation

### Feature Engineering

#### Dataframe `anime_merged`
***
Feature engineering yang dilakukan pada dataframe **anime_merged** adalah: <br>
1. Encoding MAL_ID agar index integer lebih teratur
2. Ekstrak kolom **Premiered** menjadi dua informasi: **season_premiered** dan **year_premiered**
2. Ekstrak kolom **Aired** untuk mendapatkan informasi kapan anime tersebut mulai ditayangkan pertama kali (menjadi **start_aired**). Secara intuisi penonton mungkin menggemari anime pada tahun-tahun tertentu. Informasi mengenai kapan anime tersebut berakhir tidak dicatat karena asumsinya tidak terlalu diperhatikan penonton dalam pertimbangannya untuk menonton
2. Ekstrak kolom **Duration** untuk mendapatkan informasi durasi dalam jam-menit-detik
2. Ubah nilai pada kolom-kolom dengan tipe kategorikal menjadi bentuk one-hot-vector dengan TF-IDF ataupun OneHotEncoding. Pada project kali ini teknik One-hot Encoding dipilih dan digunakan dibandingkan TF-IDF karena lebih masuk akal secara logika dengan memperlihatkan suatu Genre masuk ke anime apa saja, daripada menghitung bobot kemunculan Genre tersebut yang lebih masuk akal untuk studi kasus Sentimen Analisis. Kolom-kolom yang akan dilakukan one-hot encoding yaitu tersebut yaitu:
    - **Genres**
    - **Type**
    - **Producers**
    - **Licensors**
    - **Studios**
    - **Source**
    - **Rating**
2. Normalisasi fitur bertipe numerikal dengan min-max normalization dari hasil fitur engineering sebelumnya

##### 1. Encoding MAL_ID

In [3]:
anime_ids = anime_df_merged['MAL_ID'].unique().tolist()
encoding_anime_ids = {x: i for i, x in enumerate(anime_ids)}

# encoding MAL_ID dengan data encoding
anime_df_merged['MAL_ID'] = anime_df_merged['MAL_ID'].map(encoding_anime_ids)

In [4]:
encoding_anime_ids

{1: 0,
 5: 1,
 6: 2,
 7: 3,
 8: 4,
 15: 5,
 16: 6,
 17: 7,
 18: 8,
 19: 9,
 20: 10,
 22: 11,
 23: 12,
 24: 13,
 25: 14,
 26: 15,
 27: 16,
 28: 17,
 29: 18,
 30: 19,
 31: 20,
 32: 21,
 33: 22,
 43: 23,
 44: 24,
 45: 25,
 46: 26,
 47: 27,
 48: 28,
 49: 29,
 50: 30,
 51: 31,
 52: 32,
 53: 33,
 54: 34,
 55: 35,
 56: 36,
 57: 37,
 58: 38,
 59: 39,
 60: 40,
 61: 41,
 62: 42,
 63: 43,
 64: 44,
 65: 45,
 66: 46,
 67: 47,
 68: 48,
 69: 49,
 71: 50,
 72: 51,
 73: 52,
 74: 53,
 75: 54,
 76: 55,
 77: 56,
 79: 57,
 80: 58,
 81: 59,
 82: 60,
 83: 61,
 84: 62,
 85: 63,
 86: 64,
 87: 65,
 88: 66,
 89: 67,
 90: 68,
 91: 69,
 92: 70,
 93: 71,
 94: 72,
 95: 73,
 96: 74,
 97: 75,
 98: 76,
 99: 77,
 100: 78,
 101: 79,
 102: 80,
 103: 81,
 104: 82,
 105: 83,
 106: 84,
 107: 85,
 108: 86,
 109: 87,
 110: 88,
 111: 89,
 112: 90,
 113: 91,
 114: 92,
 115: 93,
 116: 94,
 117: 95,
 118: 96,
 119: 97,
 120: 98,
 121: 99,
 122: 100,
 123: 101,
 124: 102,
 125: 103,
 126: 104,
 127: 105,
 128: 106,
 129: 107,
 130:

##### 2. Ekstrak informasi kolom `Premiered`

In [5]:
anime_df_merged[['season_premiered', 'year_premiered']] = anime_df_merged['Premiered'].str.split(' ', expand=True)

# drop column Premiered karena sudah dibutuhkan lagi
anime_df_merged.drop('Premiered', axis=1, inplace=True)

In [6]:
anime_df_merged.head()

Unnamed: 0,MAL_ID,Name,Genres,Type,Episodes,Aired,Producers,Licensors,Studios,Source,Duration,Rating,season_premiered,year_premiered
0,0,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,"Apr 3, 1998 to Apr 24, 1999",Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),Spring,1998
1,1,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,"Sep 1, 2001","Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),Fall,2016
2,2,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,"Apr 1, 1998 to Sep 30, 1998",Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,Spring,1998
3,3,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26,"Jul 2, 2002 to Dec 24, 2002","TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,Summer,2002
4,4,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",TV,52,"Sep 30, 2004 to Sep 29, 2005","TV Tokyo, Dentsu",Funimation,Toei Animation,Manga,23 min. per ep.,PG - Children,Fall,2004


##### 3. Ekstrak informasi kolom `Aired`

In [7]:
def extract_aired_information(value):
    try:
        # Split to get the starting aired date
        start_aired = value.split('to')[0].strip()
        date_month, year = start_aired.split(', ')
        month, date = date_month.split(' ')
        return date, month, year
    except ValueError:
        # Handle missing or malformed dates
        return None, None, None


In [8]:
anime_df_merged[
    ['start_date_aired', 'start_month_aired', 'start_year_aired']
] = anime_df_merged['Aired'].apply(func=extract_aired_information).apply(pd.Series)

# drop column Aired karena sudah dibutuhkan lagi
anime_df_merged.drop('Aired', axis=1, inplace=True)

In [9]:
anime_df_merged.head()

Unnamed: 0,MAL_ID,Name,Genres,Type,Episodes,Producers,Licensors,Studios,Source,Duration,Rating,season_premiered,year_premiered,start_date_aired,start_month_aired,start_year_aired
0,0,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),Spring,1998,3,Apr,1998
1,1,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),Fall,2016,1,Sep,2001
2,2,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,Spring,1998,1,Apr,1998
3,3,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,Summer,2002,2,Jul,2002
4,4,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",TV,52,"TV Tokyo, Dentsu",Funimation,Toei Animation,Manga,23 min. per ep.,PG - Children,Fall,2004,30,Sep,2004


##### 4. Ekstrak informasi kolom `Duration`

In [10]:
def extract_duration_information(duration):
    hours = 0
    minutes = 0
    seconds = 0
    
    # check for hours in the durations tring
    if 'hr.' in duration:
        hours = int(duration.split('hr.')[0].strip())
        duration = duration.split('hr.')[1]
    
    # Check for minutes in the duration string
    if 'min.' in duration:
        minutes = int(duration.split('min.')[0].strip())
        duration = duration.split('min.')[1]
    
    # check for seconds in the duration string
    if 'sec.' in duration:
        seconds = int(duration.split('sec.')[0].strip())
    
    return hours, minutes, seconds

In [11]:
anime_df_merged[['duration_hours', 'duration_minutes', 'duration_seconds']] = anime_df_merged['Duration'].apply(
    lambda x: pd.Series(extract_duration_information(x))
)
# drop column Duration karena sudah dibutuhkan lagi
anime_df_merged.drop('Duration', axis=1, inplace=True)

In [12]:
anime_df_merged.head()

Unnamed: 0,MAL_ID,Name,Genres,Type,Episodes,Producers,Licensors,Studios,Source,Rating,season_premiered,year_premiered,start_date_aired,start_month_aired,start_year_aired,duration_hours,duration_minutes,duration_seconds
0,0,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,R - 17+ (violence & profanity),Spring,1998,3,Apr,1998,0,24,0
1,1,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,R - 17+ (violence & profanity),Fall,2016,1,Sep,2001,1,55,0
2,2,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,PG-13 - Teens 13 or older,Spring,1998,1,Apr,1998,0,24,0
3,3,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,PG-13 - Teens 13 or older,Summer,2002,2,Jul,2002,0,25,0
4,4,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",TV,52,"TV Tokyo, Dentsu",Funimation,Toei Animation,Manga,PG - Children,Fall,2004,30,Sep,2004,0,23,0


##### 5. One-hot vector untuk kolom-kolom bertipe kategorikal

In [13]:
ohe_genres_df = anime_df_merged['Genres'].str.get_dummies(sep=', ').add_prefix('Genre_')
ohe_type_df = anime_df_merged['Type'].str.get_dummies(sep=', ').add_prefix('Type_')
ohe_producers_df = anime_df_merged['Producers'].str.get_dummies(sep=', ').add_prefix('Producers_')
ohe_licensors_df = anime_df_merged['Licensors'].str.get_dummies(sep=', ').add_prefix('Licensors_')
ohe_studios_df = anime_df_merged['Studios'].str.get_dummies(sep=', ').add_prefix('Studios_')
ohe_source_df = anime_df_merged['Source'].str.get_dummies(sep=', ').add_prefix('Source_')
ohe_rating_df = anime_df_merged['Rating'].str.get_dummies(sep=', ').add_prefix('Rating_')

##### 6. Normalisasi fitur numerikal dengan min-max normalization

In [14]:
# List of numerical columns to normalized
numerical_columns = [
    'Episodes',
    'year_premiered',
    'start_date_aired',
    'start_year_aired',
    'duration_hours',
    'duration_minutes',
    'duration_seconds'
]

# handle missing values first using mode statistic for each column
# Replace NaN values with the mode for each selected column
for column in numerical_columns:
    mode_value = anime_df_merged[column].mode()[0]  # Get the mode value (first mode in case of ties)
    anime_df_merged[column] = anime_df_merged[column].fillna(mode_value)  # Fill NaN with mode value

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Apply Min-Max normalization to the selected columns
numerical_scaled_feature_df = scaler.fit_transform(anime_df_merged[numerical_columns])
numerical_scaled_feature_df = pd.DataFrame(numerical_scaled_feature_df, columns=numerical_columns)

In [15]:
# tambahkan informasi MAL_ID pada numerical features
numerical_scaled_feature_df = pd.concat([
    anime_df_merged[['MAL_ID']],
    numerical_scaled_feature_df
],axis=1)

In [16]:
numerical_scaled_feature_df.head()

Unnamed: 0,MAL_ID,Episodes,year_premiered,start_date_aired,start_year_aired,duration_hours,duration_minutes,duration_seconds
0,0,0.008181,0.616667,0.066667,0.778846,0.0,0.40678,0.0
1,1,0.0,0.916667,0.0,0.807692,0.5,0.932203,0.0
2,2,0.008181,0.616667,0.0,0.778846,0.0,0.40678,0.0
3,3,0.008181,0.683333,0.033333,0.817308,0.0,0.423729,0.0
4,4,0.016688,0.716667,0.966667,0.836538,0.0,0.389831,0.0


#### Dataframe `anime_with_synopsis`
***
1. Encoding MAL_ID agar index integer lebih teratur dan sesuai dengan mapping pada dataframe **`anime_merged`**
2. Preprocessing teks pada kolom **synopsis** agar bisa digunakan sebagai fitur tambahan pada sistem rekomendasi berbasis Content-based filtering. 
3. Ekstraksi informasi dari kolom **synopsis** berupa TF-IDF

##### 1. Encoding MAL_ID

In [17]:
anime_ids = anime_with_synopsis_df['MAL_ID'].unique().tolist()

# ambil index terakhir MAL_ID dari hasil encoding anime_merged
last_index = max(encoding_anime_ids.values())+1
for anime_id in anime_ids:
    if anime_id not in encoding_anime_ids:
        encoding_anime_ids[anime_id] = last_index
        last_index+=1

In [18]:
# encoding MAL_ID dengan data encoding
anime_with_synopsis_df['MAL_ID'] = anime_with_synopsis_df['MAL_ID'].map(encoding_anime_ids)

In [19]:
anime_with_synopsis_df

Unnamed: 0,MAL_ID,sypnopsis
0,0,"In the year 2071, humanity has colonized sever..."
1,1,"other day, another bounty—such is the life of ..."
2,2,"Vash the Stampede is the man with a $$60,000,0..."
3,3,ches are individuals with special powers like ...
4,4,It is the dark century and the people are suff...
...,...,...
16201,17342,No synopsis information has been added to this...
16202,17343,ko is a typical high school student whose life...
16203,17344,Sequel to Higurashi no Naku Koro ni Gou .
16204,17345,New Yama no Susume anime.


##### 2. Preprocessing teks pada kolom 'sypnopsis'

In [20]:
def preprocess_text(text):
    # Remove non-ASCII characters
    text = ''.join(char for char in text if ord(char) < 128)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenization
    tokens = text.split()
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming (you can also use lemmatization)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Join tokens back into a single string
    return ' '.join(tokens)

In [21]:
# Apply preprocessing to the 'synopsis' column
anime_with_synopsis_df['processed_synopsis'] = anime_with_synopsis_df['sypnopsis'].apply(preprocess_text)

# drop sypnosis columns
anime_with_synopsis_df.drop('sypnopsis', axis=1, inplace=True)

##### 3. Ekstraksi fitur TF-IDF dari kolom 'sypnopsis'

In [22]:
# Initialize TfidfVectorizer
max_features = 1000  
tfidf_vectorizer = TfidfVectorizer(max_features=max_features, max_df=5)

# Fit and transform the processed synopsis column
tfidf_matrix = tfidf_vectorizer.fit_transform(anime_with_synopsis_df['processed_synopsis'])

# Convert TF-IDF matrix to DataFrame for better visualization (optional)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [23]:
# add MAL_ID information to the tfidf_df
tfidf_df = pd.concat([
    anime_with_synopsis_df[['MAL_ID']],
    tfidf_df
],axis=1)

In [24]:
tfidf_df

Unnamed: 0,MAL_ID,abel,abh,advent,aeon,agetar,ah,ahiru,ajin,akeno,...,yusaku,yuuhi,yuusei,yuyu,zack,zaizen,zhan,zhou,zhu,zoid
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16201,17342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16202,17343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16203,17344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16204,17345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Dataframe `rating_df_lite_version`
***
1. Oleh karena kolom **anime_id** pada dataframe `rating_df_lite_version` sama dengan **MAL_ID** pada dataframe `anime_merged` ataupun **anime_with_synopsis**, maka perlu juga dilakukan mapping yang sama dengan kedua dataframe tersebut
2. Encoding kolom **user_id** agar index integer lebih teratur dan memastikan terurut dan tidak acak kode unik yang diinputkan untuk setiap user_id
2. Merge dataframe `rating_df_lite_version` dengan numerical features dari dataframe content-based filtering sebagai fitur tambahan saat training model nantinya
2. Normalisasi kolom **rating** dengan min-max normalization

##### 1. Encoding anime_id = MAL_ID

In [25]:
# encoding MAL_ID dengan data encoding
rating_df_lite_version['anime_id'] = rating_df_lite_version['anime_id'].map(encoding_anime_ids)

In [26]:
rating_df_lite_version.head()

Unnamed: 0,user_id,anime_id,rating
0,1,5211.0,10
1,1,6556.0,10
2,1,6595.0,10
3,1,7245.0,10
4,2,6601.0,10


##### 2. Encoding user_id

In [27]:
user_ids = rating_df_lite_version['user_id'].unique().tolist()
encoding_user_ids = {x: i for i, x in enumerate(user_ids)}

# encoding user_id dengan data encoding
rating_df_lite_version['user_id'] = rating_df_lite_version['user_id'].map(encoding_user_ids)

In [28]:
rating_df_lite_version.head()

Unnamed: 0,user_id,anime_id,rating
0,0,5211.0,10
1,0,6556.0,10
2,0,6595.0,10
3,0,7245.0,10
4,1,6601.0,10


##### 3. Normalisasi kolom rating dengan min-max normalization

In [29]:
# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Apply Min-Max normalization to the selected columns
scaled_rating = scaler.fit_transform(rating_df_lite_version[['rating']])
rating_df_lite_version['rating'] = pd.DataFrame(scaled_rating, columns=['rating'])

In [30]:
rating_df_lite_version

Unnamed: 0,user_id,anime_id,rating
0,0,5211.0,1.000000
1,0,6556.0,1.000000
2,0,6595.0,1.000000
3,0,7245.0,1.000000
4,1,6601.0,1.000000
...,...,...,...
6337236,69598,7431.0,0.666667
6337237,69598,7583.0,0.888889
6337238,69598,8581.0,1.000000
6337239,69599,716.0,0.888889


### Penggabungan File

#### untuk Content-based Filtering
***
Hasil feature engineering dari dua dataframe, yaitu `anime_merged` dan `anime_with_synopsis` akan digabungkan untuk tujuan content-based filtering

In [36]:
numerical_scaled_feature_df

Unnamed: 0,MAL_ID,Episodes,year_premiered,start_date_aired,start_year_aired,duration_hours,duration_minutes,duration_seconds
0,0,0.008181,0.616667,0.066667,0.778846,0.0,0.406780,0.0
1,1,0.000000,0.916667,0.000000,0.807692,0.5,0.932203,0.0
2,2,0.008181,0.616667,0.000000,0.778846,0.0,0.406780,0.0
3,3,0.008181,0.683333,0.033333,0.817308,0.0,0.423729,0.0
4,4,0.016688,0.716667,0.966667,0.836538,0.0,0.389831,0.0
...,...,...,...,...,...,...,...,...
12074,12074,0.005563,0.916667,0.266667,0.951923,0.0,0.067797,0.0
12075,12075,0.002618,0.916667,0.800000,0.932692,0.0,0.067797,0.0
12076,12076,0.003599,0.933333,0.300000,0.961538,0.0,0.389831,0.0
12077,12077,0.003599,0.933333,0.266667,0.961538,0.0,0.389831,0.0


In [37]:
content_based_dataframe = pd.concat([
    anime_df_merged[['MAL_ID']],
    ohe_genres_df,
    ohe_licensors_df,
    ohe_producers_df,
    ohe_rating_df,
    ohe_source_df,
    ohe_studios_df,
    ohe_type_df,
    numerical_scaled_feature_df.drop('MAL_ID', axis=1)
],axis=1)

content_based_dataframe = pd.merge(content_based_dataframe, tfidf_df, on='MAL_ID', how='left')

In [38]:
content_based_dataframe

Unnamed: 0,MAL_ID,Genre_Action,Genre_Adventure,Genre_Cars,Genre_Comedy,Genre_Dementia,Genre_Demons,Genre_Drama,Genre_Ecchi,Genre_Fantasy,...,yusaku,yuuhi,yuusei,yuyu,zack,zaizen,zhan,zhou,zhu,zoid
0,0,1,1,0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,1,1,0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0,1,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,12074,1,1,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12075,12075,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12076,12076,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12077,12077,0,0,0,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Penggabungan hasil feature engineering pada dataframe `anime_merged` dengan TF-IDF dari sinopsis dataframe `anime_with_synopsis` menghasilkan 2.706 fitur.
- Tahap reduksi fitur mungkin diperlukan untuk mengurangi **curse of dimensionality** dan mengurangi kompleksitas model pelatihan apabila menggunakan model machine learning (unsupervised)

#### untuk Collaborative filtering
***
Numerical features hasil feature engineering dari dataframe `anime_merged` akan digabungkan dengan feature engineering dari dataframe `rating_df_lite_version`

In [39]:
collaborative_based_dataframe = pd.merge(
    rating_df_lite_version, 
    numerical_scaled_feature_df,
    left_on='anime_id',
    right_on='MAL_ID',
    how='left'
)

In [40]:
# remove anime_id columns bcs it is similar to MAL_ID
collaborative_based_dataframe.drop('anime_id', axis=1, inplace=True)

In [41]:
collaborative_based_dataframe

Unnamed: 0,user_id,rating,MAL_ID,Episodes,year_premiered,start_date_aired,start_year_aired,duration_hours,duration_minutes,duration_seconds
0,0,1.000000,5211.0,0.003599,0.816667,0.133333,0.894231,0.0,0.406780,0.0
1,0,1.000000,6556.0,0.003599,0.850000,0.166667,0.913462,0.0,0.406780,0.0
2,0,1.000000,6595.0,0.007853,0.850000,0.233333,0.913462,0.0,0.389831,0.0
3,0,1.000000,7245.0,0.003599,0.866667,0.200000,0.923077,0.0,0.406780,0.0
4,1,1.000000,6601.0,0.007853,0.850000,0.233333,0.913462,0.0,0.406780,0.0
...,...,...,...,...,...,...,...,...,...,...
6337236,69598,0.666667,7431.0,0.003927,0.866667,0.133333,0.923077,0.0,0.406780,0.0
6337237,69598,0.888889,7583.0,0.000000,0.916667,0.700000,0.923077,0.0,0.983051,0.0
6337238,69598,1.000000,8581.0,0.002945,0.883333,0.333333,0.932692,0.0,0.406780,0.0
6337239,69599,0.888889,716.0,0.007199,0.750000,0.800000,0.855769,0.0,0.423729,0.0


### Data duplicate and handling missing values if any?

#### Check and handling for Content-based dataframe

In [42]:
# check duplicated number of rows
content_based_dataframe[
    content_based_dataframe.duplicated()
]

Unnamed: 0,MAL_ID,Genre_Action,Genre_Adventure,Genre_Cars,Genre_Comedy,Genre_Dementia,Genre_Demons,Genre_Drama,Genre_Ecchi,Genre_Fantasy,...,yusaku,yuuhi,yuusei,yuyu,zack,zaizen,zhan,zhou,zhu,zoid


In [43]:
content_based_dataframe.isna().sum()

MAL_ID                0
Genre_Action          0
Genre_Adventure       0
Genre_Cars            0
Genre_Comedy          0
                   ... 
zaizen             1141
zhan               1141
zhou               1141
zhu                1141
zoid               1141
Length: 2707, dtype: int64

- masih terdapat kolom dengan missing values dari hasil merging kedua dataframe hasil feature engineering. 
- kolom-kolom yang memuat missing values akan dibuang, kurang lebih jumlahnya 1000 kolom (lumayan untuk mereduksi jumlah fitur)

In [44]:
# remove attributes/columns/features with missing values
content_based_dataframe.dropna(axis=1, how='any', inplace=True)

In [45]:
content_based_dataframe.isna().sum()

MAL_ID              0
Genre_Action        0
Genre_Adventure     0
Genre_Cars          0
Genre_Comedy        0
                   ..
start_date_aired    0
start_year_aired    0
duration_hours      0
duration_minutes    0
duration_seconds    0
Length: 1707, dtype: int64

Dari hasil pengecekan dan penanganan, sudah tidak ada data duplikat dan missing values

#### Check and handling for Collaborative-based dataframe

In [46]:
# check duplicate number of rows
collaborative_based_dataframe[
    collaborative_based_dataframe.duplicated()
]

Unnamed: 0,user_id,rating,MAL_ID,Episodes,year_premiered,start_date_aired,start_year_aired,duration_hours,duration_minutes,duration_seconds
44010,535,0.555556,,,,,,,,
226327,2761,0.555556,,,,,,,,
231409,2821,0.666667,,,,,,,,
345845,4165,0.666667,,,,,,,,
416740,4910,0.555556,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
6070206,66256,0.777778,,,,,,,,
6149692,67252,0.444444,,,,,,,,
6149695,67252,0.444444,,,,,,,,
6291366,69139,1.000000,,,,,,,,


- Ada 166 baris data duplikat
- Baris-baris yang demikian akan dihapus dan tidak digunakan dalam proses berikutnya

In [47]:
collaborative_based_dataframe.drop_duplicates(inplace=True)

In [48]:
# check duplicate number of rows
collaborative_based_dataframe[
    collaborative_based_dataframe.duplicated()
]

Unnamed: 0,user_id,rating,MAL_ID,Episodes,year_premiered,start_date_aired,start_year_aired,duration_hours,duration_minutes,duration_seconds


Sudah tidak ada duplikat

In [49]:
# check number of missing values
collaborative_based_dataframe.isna().sum()

user_id                0
rating                 0
MAL_ID              1451
Episodes            1451
year_premiered      1451
start_date_aired    1451
start_year_aired    1451
duration_hours      1451
duration_minutes    1451
duration_seconds    1451
dtype: int64

- Ada sebesar 1.451 baris data yang nilainya tidak ada (missing values)
- Missing values tersebut akan dihapus dari dataframe karena jumlah baris dataframe rating sekitar +-6.3 juta baris data dan dengan menghapus sekitar 1.400 baris tidak menghilangkan cukup banyak informasi

In [50]:
collaborative_based_dataframe.dropna(axis=0, inplace=True, how='any')

In [51]:
# check number of missing values
collaborative_based_dataframe.isna().sum()

user_id             0
rating              0
MAL_ID              0
Episodes            0
year_premiered      0
start_date_aired    0
start_year_aired    0
duration_hours      0
duration_minutes    0
duration_seconds    0
dtype: int64

Setelah dihapus sudah tidak ada lagi missing values

# Simpan Data Hasil Preprocessing dan Preparation

In [52]:
content_based_dataframe.to_csv('./preprocessed_dataset/content_based_dataframe.csv', index=False)
collaborative_based_dataframe.to_csv('./preprocessed_dataset/collaborative_based_dataframe.csv', index=False)