# 1) library & import files

In [2]:
import pandas as pd
import numpy as np
import re
import spacy
import en_core_web_sm   # english core text
from spacy.lang.en.stop_words import STOP_WORDS ## Import stop words from spacy 

# Tfidf transformer 
from sklearn.feature_extraction.text import TfidfVectorizer


#model
from sklearn.cluster import DBSCAN


#Display all columns
pd.options.display.max_columns = None

In [50]:
data = pd.read_csv('../data/animes_clean.csv')
data.drop("Unnamed: 0", axis=1, inplace=True)
data.head()

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


# 2) preprocess

In [51]:
# columns
df = data.iloc[:, :4]
df.head()

Unnamed: 0,uid,title,synopsis,genre
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun..."
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun..."
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F..."
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ..."
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']"


In [52]:
# remove special character
df['clean_synopsis'] = df['synopsis'].apply(lambda doc : re.sub("[^A-Za-z]+", " ", str(doc)).lower())

In [53]:
df['clean_synopsis'].head()

0    following their participation at the inter hig...
1    music accompanies the path of the human metron...
2    the abyss a gaping chasm stretching down into ...
3     in order for something to be obtained somethi...
4    after helping revive the legendary vampire kis...
Name: clean_synopsis, dtype: object

In [54]:
nlp = en_core_web_sm.load()

## Tokenize the cleaned document
tokenized_doc = df['clean_synopsis'].fillna('').apply(lambda x: nlp(x))

# remove stop-words
tokenized_doc = tokenized_doc.apply(lambda x: [token.lemma_ for token in x if token.text not in STOP_WORDS])
tokenized_doc

0        [follow, participation, inter, high, karasuno,...
1        [music, accompany, path, human, metronome, pro...
2        [abyss, gape, chasm, stretch, depth, earth, fi...
3        [ , order, obtain, equal, value, lose, alchemy...
4        [helping, revive, legendary, vampire, kiss, sh...
                               ...                        
16211    [new, animation, offer, uniqlo, clothing, stor...
16212    [high, school, student, sora, kashiwagi, accus...
16213    [regain, squid, like, ability, ika, musume, pl...
16214    [year, niflheim, empire, kingdom, lucis, war, ...
16215    [yuuta, togashi, rikka, takanashi, start, date...
Name: clean_synopsis, Length: 16216, dtype: object

In [55]:
# add a clean token column 
df["clean_token"] = [" ".join(x) for x in tokenized_doc]
df.head()

Unnamed: 0,uid,title,synopsis,genre,clean_synopsis,clean_token
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...",following their participation at the inter hig...,follow participation inter high karasuno high ...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...",music accompanies the path of the human metron...,music accompany path human metronome prodigiou...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...",the abyss a gaping chasm stretching down into ...,abyss gape chasm stretch depth earth fill myst...
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...",in order for something to be obtained somethi...,order obtain equal value lose alchemy bind l...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']",after helping revive the legendary vampire kis...,helping revive legendary vampire kiss shot ace...


In [61]:
df.to_json('../data/synopsis_tokenized.json',index=False )

In [3]:
df = pd.read_json('../data/synopsis_tokenized.json')

In [4]:
df.head()

Unnamed: 0,uid,title,synopsis,genre,clean_synopsis,clean_token
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...",following their participation at the inter hig...,follow participation inter high karasuno high ...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...",music accompanies the path of the human metron...,music accompany path human metronome prodigiou...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...",the abyss a gaping chasm stretching down into ...,abyss gape chasm stretch depth earth fill myst...
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...",in order for something to be obtained somethi...,order obtain equal value lose alchemy bind l...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']",after helping revive the legendary vampire kis...,helping revive legendary vampire kiss shot ace...


In [5]:
# TF-IDF vector
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['clean_token'])

# X is a generator. We can transform that as an array
dense = X.toarray()
dense

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
# Get tokens for all documents
vectorizer.vocabulary_

{'follow': 9675,
 'participation': 23551,
 'inter': 14255,
 'high': 12616,
 'karasuno': 15748,
 'school': 27550,
 'volleyball': 34315,
 'team': 31423,
 'attempt': 2021,
 'refocus': 25675,
 'effort': 8046,
 'aim': 506,
 'conquer': 5495,
 'spring': 29839,
 'tournament': 32451,
 'instead': 14204,
 'receive': 25548,
 'invitation': 14405,
 'long': 18162,
 'stand': 29937,
 'rival': 26396,
 'nekoma': 21562,
 'agree': 447,
 'large': 17603,
 'training': 32528,
 'camp': 3987,
 'alongside': 852,
 'notable': 22120,
 'tokyo': 32151,
 'national': 21389,
 'level': 17830,
 'player': 24179,
 'play': 24175,
 'tough': 32418,
 'japan': 14779,
 'hope': 13050,
 'sharpen': 28221,
 'skill': 29165,
 'come': 5250,
 'new': 21676,
 'attack': 2012,
 'strengthen': 30196,
 'hinata': 12733,
 'kageyama': 15361,
 'devise': 6942,
 'powerful': 24515,
 'weapon': 34612,
 'possibly': 24469,
 'break': 3517,
 'sturdy': 30272,
 'block': 3136,
 'face': 9086,
 'chance': 4422,
 'victory': 34153,
 'senior': 27913,
 'graduate': 112

In [None]:
# Create a tf-idf matrix for each token and each document
tfid_df = pd.DataFrame(dense, 
             columns=[x for x in vectorizer.get_feature_names_out()], 
             index=["doc_{}".format(x) for x in range(0, df.shape[0])] )

tfid_df

# 3) clustering

In [None]:
# Instanciate DBSCAN
db = DBSCAN(eps=0.5, min_samples=15, metric="cosine")
db.fit(tfid_df)

NameError: name 'DBSCAN' is not defined

# LOT OF CRASH DURING DBSCAN. Switch to transformer model

In [12]:
def auto_tune_dbscan(X, eps_range, min_samples_range, metric):
    results = []

    for eps in eps_range:
        for min_samples in min_samples_range:
            dbscan = DBSCAN(eps=eps, min_samples=min_samples, 
metric=metric)
            labels = dbscan.fit_predict(X)

            n_outliers = np.sum(labels == -1)
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            outlier_ratio = round(n_outliers / len(labels), 3)

            results.append({
                "eps": eps,
                "min_samples": min_samples,
                "n_clusters": n_clusters,
                "n_outliers": n_outliers,
                "outlier_ratio": outlier_ratio
            })

    return pd.DataFrame(results)

In [None]:
eps_vals = np.arange(0.15, 1, 0.01)
min_samples_vals = range(5, 40)

results_df = auto_tune_dbscan(tfid_df, eps_range=eps_vals, min_samples_range=min_samples_vals, metric="cosine")