In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/anime.csv')
data.head(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262


In [3]:
genre = data['genre']
genre.head()

0                 Drama, Romance, School, Supernatural
1    Action, Adventure, Drama, Fantasy, Magic, Mili...
2    Action, Comedy, Historical, Parody, Samurai, S...
3                                     Sci-Fi, Thriller
4    Action, Comedy, Historical, Parody, Samurai, S...
Name: genre, dtype: object

In [12]:
genre.isnull().any()

True

In [5]:
from sklearn.impute import SimpleImputer

In [6]:
imputer = SimpleImputer(missing_values=np.nan,strategy='constant',fill_value='')

In [13]:
genre1 = imputer.fit_transform(np.array(genre.values).reshape(-1,1))
genre1

array([['Drama, Romance, School, Supernatural'],
       ['Action, Adventure, Drama, Fantasy, Magic, Military, Shounen'],
       ['Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen'],
       ...,
       ['Hentai'],
       ['Hentai'],
       ['Hentai']], dtype=object)

In [15]:
genre1 = genre1[:,0]
genre1.shape

(12294,)

In [14]:
genre1.isnull().any()

AttributeError: ignored

In [16]:
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf = True,
    strip_accents="unicode",
    analyzer='word',
    token_pattern=r'\w{2,}',
    ngram_range=(1,1),
    max_features=30000
)

In [17]:
word_vectorizer.fit(genre1)
word_features = word_vectorizer.transform(genre1)

In [18]:
print(word_features.shape)

(12294, 46)


In [19]:
print(word_vectorizer.vocabulary_)
print(len(word_vectorizer.vocabulary_))

{'drama': 8, 'romance': 30, 'school': 32, 'supernatural': 41, 'action': 0, 'adventure': 1, 'fantasy': 10, 'magic': 20, 'military': 23, 'shounen': 36, 'comedy': 5, 'historical': 15, 'parody': 26, 'samurai': 31, 'sci': 33, 'fi': 11, 'thriller': 42, 'sports': 39, 'super': 40, 'power': 28, 'space': 38, 'slice': 37, 'life': 19, 'mecha': 22, 'music': 24, 'mystery': 25, 'seinen': 34, 'martial': 21, 'arts': 3, 'vampire': 43, 'shoujo': 35, 'horror': 16, 'police': 27, 'psychological': 29, 'demons': 7, 'ecchi': 9, 'josei': 17, 'ai': 2, 'game': 12, 'dementia': 6, 'harem': 13, 'cars': 4, 'kids': 18, 'hentai': 14, 'yaoi': 44, 'yuri': 45}
46


In [20]:
print(word_vectorizer.idf_)
print(len(word_vectorizer.idf_))

[2.46327818 2.65520297 5.62115741 4.83345165 6.12648851 1.97318604
 4.93215102 4.7299726  2.80758142 3.95860967 2.67194515 2.78116109
 5.21294127 4.65489657 3.37641156 3.72362429 4.50344495 6.40961477
 3.0329585  3.30952248 3.75893691 4.83345165 3.56576303 4.36016394
 3.65885345 4.21037203 4.4032328  5.12868092 4.27276232 4.97886865
 3.12733743 5.41300165 3.30952248 2.78116109 4.11067267 3.93537083
 2.93426613 3.30952248 4.47152735 4.11799871 4.27276232 3.47189689
 5.93961114 5.78221897 6.7280685  6.65574784]
46


In [21]:
wd = dict(zip(word_vectorizer.get_feature_names(),word_vectorizer.idf_))
wd

{'action': 2.463278176000495,
 'adventure': 2.65520296999138,
 'ai': 5.621157409053552,
 'arts': 4.833451645868594,
 'cars': 6.126488513501902,
 'comedy': 1.9731860413199385,
 'dementia': 4.932151021159638,
 'demons': 4.729972598310473,
 'drama': 2.807581416696081,
 'ecchi': 3.9586096713055032,
 'fantasy': 2.6719451511344534,
 'fi': 2.781161093254709,
 'game': 5.212941267573497,
 'harem': 4.6548965718701165,
 'hentai': 3.3764115644343375,
 'historical': 3.7236242863803444,
 'horror': 4.503444949012023,
 'josei': 6.409614769417822,
 'kids': 3.0329584966717844,
 'life': 3.3095224805395884,
 'magic': 3.758936908779545,
 'martial': 4.833451645868594,
 'mecha': 3.5657630271565504,
 'military': 4.360163941421669,
 'music': 3.658853450222562,
 'mystery': 4.210372027925366,
 'parody': 4.403232798607491,
 'police': 5.128680923955757,
 'power': 4.272762320524647,
 'psychological': 4.978868645727097,
 'romance': 3.1273374331991253,
 'samurai': 5.413001648704833,
 'school': 3.3095224805395884,
 's

In [22]:
token_weight = pd.DataFrame(wd.items(), columns = ['word','weight'])
token_weight

Unnamed: 0,word,weight
0,action,2.463278
1,adventure,2.655203
2,ai,5.621157
3,arts,4.833452
4,cars,6.126489
5,comedy,1.973186
6,dementia,4.932151
7,demons,4.729973
8,drama,2.807581
9,ecchi,3.95861


In [23]:
vector = word_vectorizer.transform([genre1[0]])

In [24]:
new = word_vectorizer.transform(genre1)
new.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.29464923, 0.31760665, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25063144, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [25]:
max_val=new.max(axis=0).toarray().ravel()
max_val

array([1.        , 1.        , 0.81692326, 0.70710678, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.70710678, 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.91598014, 1.        , 0.70710678,
       1.        , 0.70710678, 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.93330801, 0.67217652, 1.        ,
       1.        , 1.        , 1.        , 0.70710678, 1.        ,
       1.        , 1.        , 0.70710678, 1.        , 1.        ,
       0.67217652, 1.        , 1.        , 1.        , 1.        ,
       0.89181061])

In [26]:
sort_by_tfidf=max_val.argsort()
sort_by_tfidf

array([28, 40,  3, 21, 11, 33, 19, 37,  2, 45, 17, 27, 29, 30, 31, 35, 34,
       36, 38, 39, 41, 42, 43, 32, 26,  0, 24,  1,  4,  5,  6,  7,  8,  9,
       10, 12, 13, 14, 15, 16, 18, 20, 44, 23, 25, 22])

In [27]:
feature_names = np.array(word_vectorizer.get_feature_names())

In [33]:
print('lowest tfidf:]\n{}'.format(feature_names[sort_by_tfidf[:5]]))
print('highest tfidf:]\n{}'.format(feature_names[sort_by_tfidf[-10:]]))

lowest tfidf:]
['power' 'super' 'arts' 'martial' 'fi']
highest tfidf:]
['harem' 'hentai' 'historical' 'horror' 'kids' 'magic' 'yaoi' 'military'
 'mystery' 'mecha']
