In [1]:
from google.colab import files
uploaded = files.upload()



Saving kaggle.json to kaggle.json


In [2]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [3]:
!kaggle datasets download -d arnavvvvv/anime-dataset


Downloading anime-dataset.zip to /content
 89% 192M/217M [00:01<00:00, 123MB/s]
100% 217M/217M [00:01<00:00, 153MB/s]


In [4]:
!unzip -q anime-dataset.zip


### 1. IMPORTING LIBRARIES AND DATA

In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [112]:
df = pd.read_csv("/content/animes.csv")

In [113]:
df.head(1)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...


In [114]:
df.shape

(19311, 12)

In [115]:
df.isnull().sum()

uid              0
title            0
synopsis       975
genre            0
aired            0
episodes       706
members          0
popularity       0
ranked        3212
score          579
img_url        180
link             0
dtype: int64

In [116]:
df.duplicated().sum()

2943

In [117]:
df.drop_duplicates(inplace=True)

In [118]:
df.dropna(inplace=True)

In [119]:
df.shape

(13731, 12)

In [120]:
df.isnull().sum()

uid           0
title         0
synopsis      0
genre         0
aired         0
episodes      0
members       0
popularity    0
ranked        0
score         0
img_url       0
link          0
dtype: int64

### 2. Dropping IRRELEVANT COLUMNS

##### NOT RELEVANT COLUMNS
- 1. aired
- 2. episodes
- 3. members
- 4. popularity
- 5. ranked
- 6. score

In [121]:
columns_to_drop = ['aired', 'episodes', 'members','popularity', 'ranked', 'score',]

In [122]:
new_df = df.drop(columns=columns_to_drop)

In [123]:
new_df.head(1)

Unnamed: 0,uid,title,synopsis,genre,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...",https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...


### 3. EXPLORING GENRE COLUMN

In [124]:
new_df.iloc[0]['genre']

"['Comedy', 'Sports', 'Drama', 'School', 'Shounen']"

In [125]:
import ast

def convert_str_to_list(genre_str):
    try:
        # Using the ast module to safely evaluate the string as a literal expression
        genre_list = ast.literal_eval(genre_str)
        if isinstance(genre_list, list):
            return genre_list
        else:
            return []
    except (SyntaxError, ValueError):
        return []

# Assuming 'genre' is the name of your column
new_df['genre'] = new_df['genre'].apply(convert_str_to_list)


In [126]:
type(new_df.iloc[0]['genre'])

list

In [127]:
new_df.head(1)

Unnamed: 0,uid,title,synopsis,genre,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"[Comedy, Sports, Drama, School, Shounen]",https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...


### 4. EXPLORING SYNOPSIS

In [128]:
new_df.iloc[0]['synopsis']

"Following their participation at the Inter-High, the Karasuno High School volleyball team attempts to refocus their efforts, aiming to conquer the Spring tournament instead.  \r\n \r\nWhen they receive an invitation from long-standing rival Nekoma High, Karasuno agrees to take part in a large training camp alongside many notable volleyball teams in Tokyo and even some national level players. By playing with some of the toughest teams in Japan, they hope not only to sharpen their skills, but also come up with new attacks that would strengthen them. Moreover, Hinata and Kageyama attempt to devise a more powerful weapon, one that could possibly break the sturdiest of blocks.  \r\n \r\nFacing what may be their last chance at victory before the senior players graduate, the members of Karasuno's volleyball team must learn to settle their differences and train harder than ever if they hope to overcome formidable opponents old and new—including their archrival Aoba Jousai and its world-class 

In [129]:
# Convert string to list
new_df['synopsis'] = new_df['synopsis'].apply(lambda x:x.split())


In [130]:
type(new_df.iloc[0]['synopsis'])

list

### 5. SOME DATA PREPROCESSING

In [131]:
new_df["tags"] = new_df["genre"]+new_df["synopsis"]

In [132]:
new_df.head(1)

Unnamed: 0,uid,title,synopsis,genre,img_url,link,tags
0,28891,Haikyuu!! Second Season,"[Following, their, participation, at, the, Int...","[Comedy, Sports, Drama, School, Shounen]",https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...,"[Comedy, Sports, Drama, School, Shounen, Follo..."


### 6. MAKE A NEW DATASET WITH ONLY IMPORTANT COLUMNS

In [133]:
animes = new_df[['uid', 'title','img_url', 'link', 'tags']]

In [134]:
animes.head(1)

Unnamed: 0,uid,title,img_url,link,tags
0,28891,Haikyuu!! Second Season,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...,"[Comedy, Sports, Drama, School, Shounen, Follo..."


### 7. DOING SOME NLP TASKS

In [135]:
animes['tags'] = animes['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  animes['tags'] = animes['tags'].apply(lambda x: " ".join(x))


In [136]:
animes.head(1)

Unnamed: 0,uid,title,img_url,link,tags
0,28891,Haikyuu!! Second Season,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...,Comedy Sports Drama School Shounen Following t...


In [137]:
animes['tags'] = animes['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  animes['tags'] = animes['tags'].apply(lambda x:x.lower())


In [138]:
animes.head(1)

Unnamed: 0,uid,title,img_url,link,tags
0,28891,Haikyuu!! Second Season,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...,comedy sports drama school shounen following t...


In [139]:
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [140]:
ps = PorterStemmer()
def stems(text):
    T = []
    for i in text.split():
        T.append(ps.stem(i))

    return " ".join(T)

In [141]:
animes['tags'] = animes['tags'].apply(stems)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  animes['tags'] = animes['tags'].apply(stems)


In [142]:
# Converting words to their vector form
cv = CountVectorizer(max_features=5000,stop_words='english')
vector = cv.fit_transform(animes['tags']).toarray()

In [143]:
vector.shape

(13731, 5000)

In [144]:
similarity = cosine_similarity(vector)
similarity.shape

(13731, 13731)

### 8. PREDICTION FUNCTION

In [145]:
def recommend(anime):
    index = animes[animes['title'] == anime].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(animes.iloc[i[0]].title)

In [146]:
recommend('One Punch Man')

One Punch Man: Road to Hero
One Punch Man 2nd Season
Boku no Hero Academia
Bakuman.
Yuusha ni Narenakatta Ore wa Shibushibu Shuushoku wo Ketsui Shimashita.


### 9. SAVING

In [147]:
with open('/content/artifacts/animes.pkl', 'wb') as file:
    pickle.dump(animes, file)

# Save similarity to a file named 'similarity.pkl'
with open('/content/artifacts/similarity.pkl', 'wb') as file:
    pickle.dump(similarity, file)

In [148]:
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [151]:
import shutil

# Assuming 'file_path' is the path of your file in Colab
file_path = '/content/artifacts/similarity.pkl'

# Specify the destination path on Google Drive
drive_path = '/content/drive/My Drive/similarity.pkl'

# Copy the file to Google Drive
shutil.copyfile(file_path, drive_path)


'/content/drive/My Drive/similarity.pkl'