In [111]:
import numpy as np
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.metrics.pairwise import linear_kernel



pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


import warnings
warnings.filterwarnings("ignore")


In [219]:
df = pd.read_csv('../Data/animelist.csv', usecols=["user_id", "anime_id", "rating"])
df = df[df['rating']>=1]
print(df.shape)
df.head()

(62397712, 3)


Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
4,0,21,10
5,0,24,9


In [220]:
f = ['count','mean']

df_movie_summary = df.groupby('anime_id')['rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)
movie_benchmark = round(df_movie_summary['count'].quantile(0.9),0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

print('Number of times an Anime must be rated to be considered is: {}'.format(movie_benchmark))

df_cust_summary = df.groupby('user_id')['rating'].agg(f)
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.7),0)
drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index

print('Number of times a user has to rate an anime to be considered: {}'.format(cust_benchmark))

Number of times an Anime must be rated to be considered is: 9161.0
Number of times a user has to rate an anime to be considered: 220.0


In [221]:
print('Original Shape: {}'.format(df.shape))
shape = df.shape
df = df[~df['anime_id'].isin(drop_movie_list)]
df = df[~df['user_id'].isin(drop_cust_list)]
print('New Dataframe shape after trimming: {}'.format(df.shape))
print("Size reduced by: {:.2f}%".format((1-(df.shape[0]/shape[0]))*100))

Original Shape: (62397712, 3)
New Dataframe shape after trimming: (31363358, 3)
Size reduced by: 49.74%


In [222]:
req_anime_df  =(pd.read_csv("../Data/first_anime_thumbnails_list.csv"))
req_anime = req_anime_df["Name"].values.tolist()

In [223]:
df_title = pd.read_csv("../Data/anime_with_synopsis.csv", usecols=["MAL_ID","Genres", "Name","sypnopsis"])
df_title.rename(columns={"sypnopsis":"synopsis"}, inplace=True)
df_title['Name'] = df_title['Name'].apply(lambda x: x.replace(":"," "))
df_title['Name'] = df_title['Name'].apply(lambda x: x.replace("/"," "))
df_title['Name'] = df_title['Name'].apply(lambda x: x.replace("?"," "))



df_title.head()

Unnamed: 0,MAL_ID,Name,Genres,synopsis
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [224]:
df_agg = df.groupby('anime_id').agg({
    'rating': ['mean', 'count', lambda x: x[x >= x.quantile(0.3)].mean()]
})

# Renaming the columns
df_agg.columns = ['average_rating', 'total_ratings', 'average_rating_1']

# Resetting the index
df_agg = df_agg.reset_index()

df_agg.head()


Unnamed: 0,anime_id,average_rating,total_ratings,average_rating_1
0,1,8.54649,43999,9.038492
1,5,8.239058,20334,8.745898
2,6,8.052525,29205,8.742811
3,7,7.141828,7777,7.866894
4,15,7.804818,10503,8.228562


In [225]:
total_df = df_agg.merge(df_title, left_on='anime_id', right_on='MAL_ID', how='left')
total_df.drop('MAL_ID', axis=1, inplace=True)
total_df = total_df[total_df['average_rating']>=1]
total_df.head()

Unnamed: 0,anime_id,average_rating,total_ratings,average_rating_1,Name,Genres,synopsis
0,1,8.54649,43999,9.038492,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,8.239058,20334,8.745898,Cowboy Bebop Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,8.052525,29205,8.742811,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,7.141828,7777,7.866894,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,15,7.804818,10503,8.228562,Eyeshield 21,"Action, Sports, Comedy, Shounen",Sena is like any other shy kid starting high s...


In [226]:
# drop_list = ['Code Geass: Hangyaku no Lelouch R2',
#  'Fullmetal Alchemist: Brotherhood',
#  'Monogatari Series: Second Season']

# dict = pd.read_pickle("web_app/home_page_anime_genre_2.pkl")

# for key in dict:
#     dict[key] = [value for value in dict[key] if value not in drop_list]

# print(dict)

# with open("web_app/home_page_anime_genre_2.pkl", "wb") as f:
#     pickle.dump(dict, f)

In [227]:
anime = pd.read_csv('../Data/anime.csv')    
anime['English name'] = anime['English name'].apply(lambda x: x.replace(":"," "))
anime['English name'] = anime['English name'].apply(lambda x: x.replace("/"," "))
anime['English name'] = anime['English name'].apply(lambda x: x.replace("?"," "))


print(anime.shape)
anime.head()

(17562, 35)


Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),28.0,39,1251960,61971,105808,718161,71513,26678,329800,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),159.0,518,273145,1174,4143,208333,1935,770,57964,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,266.0,201,558913,12944,29113,343492,25465,13925,146918,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,2481.0,1467,94683,587,4300,46165,5121,5378,33719,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,"TV Tokyo, Dentsu",Unknown,Toei Animation,Manga,23 min. per ep.,PG - Children,3710.0,4369,13224,18,642,7314,766,1108,3394,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [228]:

anime['Rating'].value_counts()

Rating
PG-13 - Teens 13 or older         6132
G - All Ages                      5782
PG - Children                     1461
Rx - Hentai                       1345
R - 17+ (violence & profanity)    1157
R+ - Mild Nudity                   997
Unknown                            688
Name: count, dtype: int64

In [229]:
total_df = total_df.merge(anime[["MAL_ID","Episodes","Members","Premiered","Favorites","Duration", "Type"]], left_on='anime_id', right_on='MAL_ID', how='left')
total_df.drop('MAL_ID', axis=1, inplace=True)
total_df['Episodes'].replace("Unknown", 0, inplace=True)
total_df['Episodes'] = total_df['Episodes'].astype(int)
total_df.head()

Unnamed: 0,anime_id,average_rating,total_ratings,average_rating_1,Name,Genres,synopsis,Episodes,Members,Premiered,Favorites,Duration,Type
0,1,8.54649,43999,9.038492,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",26,1251960,Spring 1998,61971,24 min. per ep.,TV
1,5,8.239058,20334,8.745898,Cowboy Bebop Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",1,273145,Unknown,1174,1 hr. 55 min.,Movie
2,6,8.052525,29205,8.742811,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",26,558913,Spring 1998,12944,24 min. per ep.,TV
3,7,7.141828,7777,7.866894,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,26,94683,Summer 2002,587,25 min. per ep.,TV
4,15,7.804818,10503,8.228562,Eyeshield 21,"Action, Sports, Comedy, Shounen",Sena is like any other shy kid starting high s...,145,148259,Spring 2005,2066,23 min. per ep.,TV
5,16,7.968301,15521,8.43097,Hachimitsu to Clover,"Comedy, Drama, Josei, Romance, Slice of Life","Yuuta Takemoto, a sophomore at an arts college...",24,214499,Spring 2005,4101,23 min. per ep.,TV
6,18,8.0386,8912,8.661025,Initial D Fourth Stage,"Action, Cars, Sports, Drama, Seinen",Takumi Fujiwara finally joins Ryousuke and Kei...,24,117929,Spring 2004,979,27 min. per ep.,TV
7,19,8.660315,23142,9.158628,Monster,"Drama, Horror, Mystery, Police, Psychological,...","Dr. Kenzou Tenma, an elite neurosurgeon recent...",74,614100,Spring 2004,29436,24 min. per ep.,TV
8,20,7.427903,61091,8.112911,Naruto,"Action, Adventure, Comedy, Super Power, Martia...","oments prior to Naruto Uzumaki's birth, a huge...",220,1830540,Fall 2002,65586,23 min. per ep.,TV
9,21,8.308085,38366,9.165946,One Piece,"Action, Adventure, Comedy, Super Power, Drama,...","Gol D. Roger was known as the ""Pirate King,"" t...",0,1352724,Fall 1999,126645,24 min.,TV


In [179]:
# only_front_end_anime = pd.DataFrame({"only_front_end_anime":total_df['Name'].values.tolist()})
# only_front_end_anime.to_csv("../Data/only_front_end_anime.csv", index=False)
# total_df.to_csv("../src/web_app/front_end_data.csv", index=False)


In [230]:
total_df.shape

(1718, 13)

In [181]:
# anime_dict = pd.read_pickle("web_app/home_page_anime_genre_2.pkl")
# genres_lst = ['Vampire', "Military" , "Game", "Mecha"]
# anime_lst = []

# for genre in genres_lst:
#     animes = anime_dict[genre]
#     for anime in animes:
#         anime_lst.append(anime)
# anime_lst  

In [182]:
# print(len(anime_lst))
# set(anime_lst) - set(total_df[total_df['Name'].isin(anime_lst)]['Name'].values.tolist())

In [183]:
temp_df = total_df[total_df['total_ratings']>30000]
temp_df.reset_index(inplace=True, drop=True)
temp_df.shape

(249, 18)

In [184]:
temp_df['Genres'] = temp_df['Genres'].apply(lambda x: x.split(', '))
exploded_df = temp_df.explode('Genres')

In [185]:
# genre_emoji = {"Comedy":"Comedy :emoji-laughing:","Mecha" : "Mecha :robot:", "Game" : "Game :controller:", "Military": "Military :gun:",
#  "Vampire" : "Vampire :vampire:", "Shounen" : "Shounen :boy:", "Psychological" : "Psychological :brain:", "Magic" : "Magic :magic_wand:",
#  "Slice of Life" : "Slice of Life :cake:", "Drama" : "Drama :film_projector:", "Supernatural" : "Supernatural :ghost:", 
#  "Mystery" : "Mystery :male-detective:", "School" : "School :school:", "Romance" : "Romance :sparkling_heart:",
#  "Historical" : "Historical :hourglass:", "Horror" : "Horror :scream:", "Sports" : "Sports :soccer:", "Sci-Fi" : "Sci-Fi :rocket:",
#  "Adventure" : "Adventure :sunrise_over_mountains:", "Fantasy" : "Fantasy :unicorn_face:", "Action" : "Action :collision:"}

In [186]:
# genre_list = ["Vampire","Military",'Game',"Mecha"]
# genre_wise_anime = {}
# for genre in genre_list:
#     temp_df = exploded_df[exploded_df['Genres'] == genre]
#     temp_df.sort_values(by = 'Members', ascending= False, inplace=True)
#     anime_list = list(temp_df['Name'][:15])
#     genre_wise_anime[genre] = anime_list

In [187]:
# dict_2 = {
#  'Mystery': ['Death Note', 'Shingeki no Kyojin', 'Tokyo Ghoul', 'Shingeki no Kyojin Season 2', 'Mirai Nikki'],
#  'Supernatural': ['One Punch Man', 'No Game No Life', 'Angel Beats!', 'Noragami', 'Kimetsu no Yaiba'],
#  'Drama': ['Fullmetal Alchemist  Brotherhood', 'Kimi no Na wa.', 'Code Geass  Hangyaku no Lelouch', 'Re Zero kara Hajimeru Isekai Seikatsu', 'Shigatsu wa Kimi no Uso'],
#  'Slice of Life': ['Toradora!', 'Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.', 'Clannad', 'Violet Evergarden', 'Yahari Ore no Seishun Love Comedy wa Machigatteiru.'],
#  'Magic': ['Nanatsu no Taizai', 'Fairy Tail', 'Kono Subarashii Sekai ni Shukufuku wo!', 'Overlord', 'Fate Zero'],
#  'Psychological': ['Boku dake ga Inai Machi', 'Kiseijuu  Sei no Kakuritsu', 'Death Parade', 'Psycho-Pass', 'Elfen Lied'],
#  'Shounen': ['Boku no Hero Academia', 'Naruto', 'Hunter x Hunter (2011)', 'Boku no Hero Academia 2nd Season', 'Naruto  Shippuuden']
# }

# best_animes = {
#     'Vampire': ['Bakemonogatari', 'Hellsing Ultimate', 'Monogatari Series: Second Season', 'Shiki', 'Blood Lad'],
#     'Military': ['Fullmetal Alchemist: Brotherhood', 'Code Geass: Hangyaku no Lelouch R2', 'Shingeki no Kyojin Season 3 Part 2', 'Youjo Senki', 'Akira'],
#     'Game': ['No Game No Life', 'Death Parade', 'Overlord', 'Log Horizon', 'Kakegurui'],
#     'Mecha': ['Tengen Toppa Gurren Lagann', 'Neon Genesis Evangelion', 'Darling in the FranXX', 'FLCL', 'Full Metal Panic!']
# }



In [188]:
# import os
# current_path = os.getcwd()
# anime_dict = pd.read_pickle("../src/web_app/home_page_anime_genre_2.pkl")
# anime_dict.update(best_animes)
# "web_app/home_page_anime_genre_2.pkl"
# with open("web_app/home_page_anime_genre_2.pkl", "wb") as f:
#     pickle.dump(anime_dict, f)

In [189]:
# model_anime_images = list(temp_df['Name'].unique())
# import os
# path = "../src/anime_images/"
# downloaded_anime_images = os.listdir(path)
# cleaned_downloaded_anime_images = [anime_image.split('.')[0].replace('_', ' ') for anime_image in downloaded_anime_images]
# print("currently downloaded {} anime images".format(len(cleaned_downloaded_anime_images)))

In [190]:
# final_anime_download_list = list(set(model_anime_images) - set(cleaned_downloaded_anime_images))
# final_anime_download_df = pd.DataFrame({"Name":final_anime_download_list})
# final_anime_download_df.to_csv("../Data/final_anime_download.csv", index=False)

In [191]:
# anime_dict = pd.read_pickle("../src/web_app/home_page_anime_genre_2.pkl")
# anime_list = []
# for key in anime_dict.keys():
#     animes = anime_dict[key]
#     for anime in animes:
#         if anime not in anime_list:
#             anime_list.append(anime)

In [192]:
# import os
# path = "../src/anime_images/"
# downloaded_anime_images = os.listdir(path)
# cleaned_downloaded_anime_images = [anime_image.split('.')[0].replace('_', ' ') for anime_image in downloaded_anime_images]

In [193]:
# # animes to download

# set(anime_list) - set(cleaned_downloaded_anime_images)

In [194]:
tfidf = TfidfVectorizer(stop_words='english')

temp_df['synopsis'] = temp_df['synopsis'].fillna('')
tfidf_matrix = tfidf.fit_transform(temp_df['synopsis'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [195]:
#Testing

indices = pd.Series(temp_df.index, index=temp_df['Name'])


def get_weighted_recommendations(user_animes, user_ratings, cosine_sim=cosine_sim, num_recommendations=10):
    # Start with a zero matrix
    accumulator = np.zeros(cosine_sim.shape[0])
    weight_sum = np.zeros(cosine_sim.shape[0])

    # Add weighted similarity matrices
    for anime, rating in zip(user_animes, user_ratings):
        idx = indices[anime]
        accumulator += cosine_sim[idx] * rating
        weight_sum += cosine_sim[idx]

    # Normalize the accumulator
    mean_scores = np.where(weight_sum != 0, accumulator / weight_sum, 0)

    # Get recommendations based on the accumulated matrix
    sim_scores = [(i, mean_scores[i]) for i in range(len(mean_scores))]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Considering the top 30 animes with the highest similarity score
    sim_scores = sim_scores[1:31]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Fetch the animes, excluding the ones that the user has already rated
    animes = temp_df.iloc[anime_indices].copy()
    animes = animes[~animes['Name'].isin(user_animes)]

    # Filter out animes with average rating less than 5
    animes = animes[animes['average_rating_1'] >= 5]
    
    # Sort the animes by 'total_ratings' and get the top 'num_recommendations' animes
    top_animes = animes.sort_values(by='average_rating_1', ascending=False)[:num_recommendations]

    return top_animes['Name']


def recommend_to_new_user():
    popular_animes = temp_df.sort_values(by='total_ratings', ascending=False)['Name'].values[:30]
    rated_animes = []
    rated_scores = []

    for i, anime in enumerate(popular_animes):
        print(f"Please rate the anime '{anime}' on a scale of 1-10 or type 'n' if you haven't watched:")
        rating = input()
        
        # Check if user hasn't watched the anime
        if rating.lower() == 'n':
            continue
        
        # Check if user provided a valid rating
        while not rating.isdigit() or int(rating) < 1 or int(rating) > 10:
            print("Invalid rating. Please rate again.")
            rating = input()
        
        rated_animes.append(anime)
        rated_scores.append(int(rating))
        
        if len(rated_scores) >= 5:  # If user has rated at least 4 animes, we stop asking for more ratings
            break
        elif i == len(popular_animes) - 1:
            break
            
    
    # Get recommendations based on all user ratings
    recommendations = list(get_weighted_recommendations(rated_animes, rated_scores))

    return recommendations


In [196]:
# recommendations = recommend_to_new_user()

# # total_df[total_df['Name'].isin(recommendations)]
# recommendations

In [197]:
# # If user's input is a synopsis
# user_input = " a virtual reality game, traps its players where they must complete. However,if player dies in videogame, they die in real life."
# user_vec = tfidf.transform([user_input])

# # Compute cosine similarities between user's input and all synopses
# cosine_similarities = linear_kernel(user_vec, tfidf_matrix).flatten()

# # Get the top 10 most similar anime
# related_anime_indices = cosine_similarities.argsort()[:-11:-1]
# related_anime = temp_df['Name'].iloc[related_anime_indices]
# print(related_anime)

In [198]:
temp_df.shape

(249, 18)

In [199]:
temp_df.head()

Unnamed: 0,anime_id,average_rating,total_ratings,average_rating_1,Name,Genres,synopsis,Episodes_x,Members_x,Premiered_x,Favorites_x,Duration_x,Episodes_y,Members_y,Premiered_y,Favorites_y,Duration_y,Type
0,1,8.54649,43999,9.038492,Cowboy Bebop,"[Action, Adventure, Comedy, Drama, Sci-Fi, Space]","In the year 2071, humanity has colonized sever...",26,1251960,Spring 1998,61971,24 min. per ep.,26,1251960,Spring 1998,61971,24 min. per ep.,TV
1,20,7.427903,61091,8.112911,Naruto,"[Action, Adventure, Comedy, Super Power, Marti...","oments prior to Naruto Uzumaki's birth, a huge...",220,1830540,Fall 2002,65586,23 min. per ep.,220,1830540,Fall 2002,65586,23 min. per ep.,TV
2,21,8.308085,38366,9.165946,One Piece,"[Action, Adventure, Comedy, Super Power, Drama...","Gol D. Roger was known as the ""Pirate King,"" t...",0,1352724,Fall 1999,126645,24 min.,Unknown,1352724,Fall 1999,126645,24 min.,TV
3,30,8.092233,50871,8.913394,Neon Genesis Evangelion,"[Action, Sci-Fi, Dementia, Psychological, Dram...","In the year 2015, the world stands on the brin...",26,1160651,Fall 1995,71308,24 min. per ep.,26,1160651,Fall 1995,71308,24 min. per ep.,TV
4,32,8.265757,34317,9.01696,Neon Genesis Evangelion The End of Evangelion,"[Sci-Fi, Dementia, Psychological, Drama, Mecha]","h the final Angel vanquished, Nerv has one las...",1,572080,Unknown,17811,1 hr. 27 min.,1,572080,Unknown,17811,1 hr. 27 min.,Movie


In [201]:
# import plotly.graph_objs as go
# from sklearn.decomposition import PCA
# from gensim.models import Word2Vec
# from nltk.tokenize import word_tokenize

# # tokenize synopsis
# temp_df['synopsis_token'] = temp_df['synopsis'].apply(word_tokenize)

# # train word2vec model
# model = Word2Vec(temp_df['synopsis_token'], min_count=1)

# # get vector representation of each synopsis
# temp_df['synopsis_vector'] = temp_df['synopsis_token'].apply(lambda x: np.mean([model.wv[word] for word in x if word in model.wv], axis=0))

# # handling the case where a synopsis may not contain any words found in the word2vec model
# temp_df = temp_df.dropna(subset=['synopsis_vector'])

# # PCA to reduce to 2 dimensions
# pca = PCA(n_components=2)
# pca_result = pca.fit_transform(temp_df['synopsis_vector'].tolist())

# # Interactive Plot
# trace = go.Scatter(
#     x = pca_result[:,0],
#     y = pca_result[:,1],
#     mode = 'markers'
# )

# layout = go.Layout(
#     title = 'PCA for Synopsis',
#     hovermode = 'closest'
# )

# fig = go.Figure(data = [trace], layout = layout)
# fig.show()


In [213]:
total_df.head()

Unnamed: 0,anime_id,average_rating,total_ratings,average_rating_1,Name,Genres,synopsis,Episodes_x,Members_x,Premiered_x,Favorites_x,Duration_x,Episodes_y,Members_y,Premiered_y,Favorites_y,Duration_y,Type
0,1,8.54649,43999,9.038492,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",26,1251960,Spring 1998,61971,24 min. per ep.,26,1251960,Spring 1998,61971,24 min. per ep.,TV
1,5,8.239058,20334,8.745898,Cowboy Bebop Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",1,273145,Unknown,1174,1 hr. 55 min.,1,273145,Unknown,1174,1 hr. 55 min.,Movie
2,6,8.052525,29205,8.742811,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",26,558913,Spring 1998,12944,24 min. per ep.,26,558913,Spring 1998,12944,24 min. per ep.,TV
3,7,7.141828,7777,7.866894,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,26,94683,Summer 2002,587,25 min. per ep.,26,94683,Summer 2002,587,25 min. per ep.,TV
4,15,7.804818,10503,8.228562,Eyeshield 21,"Action, Sports, Comedy, Shounen",Sena is like any other shy kid starting high s...,145,148259,Spring 2005,2066,23 min. per ep.,145,148259,Spring 2005,2066,23 min. per ep.,TV


In [214]:
total_df['rounded_rating'] = total_df['average_rating'].apply(lambda x: np.round(x,0))

In [216]:
total_df.head()

Unnamed: 0,anime_id,average_rating,total_ratings,average_rating_1,Name,Genres,synopsis,Episodes_x,Members_x,Premiered_x,Favorites_x,Duration_x,Episodes_y,Members_y,Premiered_y,Favorites_y,Duration_y,Type,rounded_rating
0,1,8.54649,43999,9.038492,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",26,1251960,Spring 1998,61971,24 min. per ep.,26,1251960,Spring 1998,61971,24 min. per ep.,TV,9.0
1,5,8.239058,20334,8.745898,Cowboy Bebop Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",1,273145,Unknown,1174,1 hr. 55 min.,1,273145,Unknown,1174,1 hr. 55 min.,Movie,8.0
2,6,8.052525,29205,8.742811,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",26,558913,Spring 1998,12944,24 min. per ep.,26,558913,Spring 1998,12944,24 min. per ep.,TV,8.0
3,7,7.141828,7777,7.866894,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,26,94683,Summer 2002,587,25 min. per ep.,26,94683,Summer 2002,587,25 min. per ep.,TV,7.0
4,15,7.804818,10503,8.228562,Eyeshield 21,"Action, Sports, Comedy, Shounen",Sena is like any other shy kid starting high s...,145,148259,Spring 2005,2066,23 min. per ep.,145,148259,Spring 2005,2066,23 min. per ep.,TV,8.0


In [218]:
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import plotly.express as px

# replace NaN values in 'synopsis' column with an empty string
total_df['synopsis'].fillna("", inplace=True)

# tokenize synopsis
total_df['synopsis_token'] = total_df['synopsis'].apply(word_tokenize)

# train word2vec model
model = Word2Vec(total_df['synopsis_token'], min_count=1)

# get vector representation of each synopsis
total_df['synopsis_vector'] = total_df['synopsis_token'].apply(lambda x: np.mean([model.wv[word] for word in x if word in model.wv], axis=0))

# handling the case where a synopsis may not contain any words found in the word2vec model
total_df = total_df.dropna(subset=['synopsis_vector'])

# Convert the 'Type' column to category type if it's not
if total_df['rounded_rating'].dtype != 'category':
    total_df['rounded_rating'] = total_df['rounded_rating'].astype('category')

# PCA to reduce to 3 dimensions
pca = PCA(n_components=3)
pca_result = pca.fit_transform(np.stack(total_df['synopsis_vector'].values))

fig = px.scatter_3d(x=pca_result[:,0], y=pca_result[:,1], z=pca_result[:,2], 
                    color=total_df['rounded_rating'], hover_name=total_df['Name'])
fig.show()


In [211]:
temp_df['rounded_rating'].value_counts()

rounded_rating
8.0     138
9.0      97
7.0       9
10.0      5
Name: count, dtype: int64

In [310]:

set(anime_list_thumbnail) - set(new_list)

{'Name'}