In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Activation,BatchNormalization,Input,Embedding,Dot,Dense
from tensorflow.keras.callbacks import ModelCheckpoint,LearningRateScheduler,TensorBoard,EarlyStopping

from wordcloud import WordCloud
%matplotlib inline

### Reading ANIMELIST.CSV

In [2]:
import os

In [3]:
# Directory holding the raw input files, relative to the notebook's folder.
INPUT_DIR=os.path.join(os.pardir,"artifacts","raw")

In [4]:
# Load only the three columns used below (user_id, anime_id, rating).
# The file path is built with os.path.join, matching how INPUT_DIR itself is
# built, instead of concatenating a hard-coded "/" separator.
rating_df=pd.read_csv(os.path.join(INPUT_DIR,"animelist.csv"),low_memory=True,usecols=["user_id","anime_id","rating"])

In [5]:
# Preview the first rows of the loaded ratings to sanity-check the columns.
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
3,0,4898,0
4,0,21,10


In [6]:
# (rows, columns) of the loaded ratings frame.
rating_df.shape

(7000000, 3)

### DATA PROCESSING

In [7]:
# Number of rating rows per user_id, ordered from most to fewest.
n_ratings=rating_df.loc[:,"user_id"].value_counts()

In [8]:
# Display the per-user rating counts (index: user_id, value: number of rows).
n_ratings

user_id
20807    17546
11100    14429
22022    13696
16869     8943
10255     8403
         ...  
13606        1
15566        1
9020         1
17067        1
26           1
Name: count, Length: 21103, dtype: int64

In [9]:
# Keep only users with more than MIN_RATINGS_PER_USER rating rows.
# The threshold is named so it can be found and tuned in one place.
MIN_RATINGS_PER_USER=400
active_user_ids=n_ratings[n_ratings>MIN_RATINGS_PER_USER].index
# .copy() gives an independent frame, so the column assignments in later
# cells write to this frame rather than to a slice of the original.
rating_df=rating_df[rating_df["user_id"].isin(active_user_ids)].copy()

In [10]:
# Inspect the filtered frame (only users with more than 400 ratings remain).
rating_df

Unnamed: 0,user_id,anime_id,rating
213,2,24833,0
214,2,235,10
215,2,36721,0
216,2,40956,0
217,2,31933,0
...,...,...,...
6999995,22944,4214,0
6999996,22944,2000,0
6999997,22944,64,0
6999998,22944,65,0


In [17]:
# Smallest value in the rating column, used as the lower bound for scaling.
# Series.min() is vectorized and skips NaN, whereas the builtin min() walks
# the Series element by element in Python and is order-dependent with NaN.
# float() keeps the result a plain Python scalar.
min_rating=float(rating_df["rating"].min())
min_rating

0.0

In [18]:
# Largest value in the rating column, used as the upper bound for scaling.
# Series.max() is vectorized and skips NaN, whereas the builtin max() walks
# the Series element by element in Python and is order-dependent with NaN.
# float() keeps the result a plain Python scalar.
max_rating=float(rating_df["rating"].max())
max_rating

1.0

In [19]:
# Mean of the rating column.
avg_rating=rating_df["rating"].mean()
avg_rating

np.float64(0.407261161184904)

In [None]:
# Min-max scale the rating column: (x - min_rating) / (max_rating - min_rating).
# The arithmetic is done on the whole column at once instead of calling a
# Python lambda for every row.
rating_range=max_rating-min_rating
if rating_range==0:
    # All ratings are identical, so min-max scaling is undefined; fail loudly
    # instead of silently filling the column with NaN/inf.
    raise ValueError("cannot min-max scale ratings: max_rating equals min_rating")
rating_df["rating"]=((rating_df["rating"]-min_rating)/rating_range).astype(np.float64)


In [16]:
# Mean of the rating column after scaling.
rating_df["rating"].mean()

np.float64(0.407261161184904)

In [20]:
# Count rows that exactly repeat an earlier row across all columns.
rating_df.duplicated(subset=None,keep="first").sum()

np.int64(0)

In [21]:
# Count missing values in each column.
rating_df.isna().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [None]:
# Encode raw user ids as consecutive integer indices for the model,
# e.g. user_id 10056 -> index 97.

user_ids=rating_df["user_id"].unique().tolist()
# Forward map: raw user_id -> index (order of first appearance).
user2user_encoded={raw_id : idx for idx,raw_id in enumerate(user_ids)}
# Reverse map: index -> raw user_id.
user2user_decoded=dict(enumerate(user_ids))
# Add a "user" column holding the encoded index of each row's user_id.
rating_df["user"]=rating_df["user_id"].map(user2user_encoded)

In [31]:
# Inspect the frame with the new "user" column of encoded user indices.
rating_df

Unnamed: 0,user_id,anime_id,rating,user
213,2,24833,0.0,0
214,2,235,1.0,0
215,2,36721,0.0,0
216,2,40956,0.0,0
217,2,31933,0.0,0
...,...,...,...,...
6999995,22944,4214,0.0,5828
6999996,22944,2000,0.0,5828
6999997,22944,64,0.0,5828
6999998,22944,65,0.0,5828


In [34]:
# Number of distinct users, i.e. the number of entries in the user encoding.
n_users=len(user2user_encoded)
n_users

5829

In [35]:
# Encode raw anime ids as consecutive integer indices, the same way as user ids.

anime_ids=rating_df["anime_id"].unique().tolist()
# Forward map: raw anime_id -> index (order of first appearance).
anime2anime_encoded={raw_id : idx for idx,raw_id in enumerate(anime_ids)}
# Reverse map: index -> raw anime_id.
anime2anime_decoded=dict(enumerate(anime_ids))
# Add an "anime" column holding the encoded index of each row's anime_id.
rating_df["anime"]=rating_df["anime_id"].map(anime2anime_encoded)

In [36]:
# Inspect the frame with the new "anime" column of encoded anime indices.
rating_df

Unnamed: 0,user_id,anime_id,rating,user,anime
213,2,24833,0.0,0,0
214,2,235,1.0,0,1
215,2,36721,0.0,0,2
216,2,40956,0.0,0,3
217,2,31933,0.0,0,4
...,...,...,...,...,...
6999995,22944,4214,0.0,5828,1574
6999996,22944,2000,0.0,5828,5417
6999997,22944,64,0.0,5828,727
6999998,22944,65,0.0,5828,730


In [39]:
# Number of distinct anime, i.e. the number of entries in the anime encoding.
n_anime=len(anime2anime_encoded)
n_anime

17553