In [1]:
#import the libraries
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#read the data
data = pd.read_csv('anime.csv')

In [3]:
data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
data = data.reset_index()

In [5]:
data

Unnamed: 0,index,anime_id,name,genre,type,episodes,rating,members
0,0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...,...
12289,12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [6]:
data.shape

(12294, 8)

In [7]:
data.anime_id.max()

34527

In [8]:
data.rating.min()

1.67

In [9]:
data.describe()

Unnamed: 0,index,anime_id,rating,members
count,12294.0,12294.0,12064.0,12294.0
mean,6146.5,14058.221653,6.473902,18071.34
std,3549.116439,11455.294701,1.026746,54820.68
min,0.0,1.0,1.67,5.0
25%,3073.25,3484.25,5.88,225.0
50%,6146.5,10260.5,6.57,1550.0
75%,9219.75,24794.5,7.18,9437.0
max,12293.0,34527.0,10.0,1013917.0


In [10]:
#checking for null values
data.isnull().sum()

index         0
anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [11]:
#rating -- 230/12294=0.018 
#less than 20% so fill the missing values
data['rating'].fillna(data['rating'].mean(),inplace=True)

In [12]:
#filling categorical value
for value in ['genre','type']:
    data[value].fillna(data[value].mode()[0],inplace=True)

In [13]:
data.isnull().sum()

index       0
anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [14]:
#check for unique values
for i in data.columns:
    print(f"Unique values for column: {i}\n\n{data[i].unique()}\n")

Unique values for column: index

[    0     1     2 ... 12291 12292 12293]

Unique values for column: anime_id

[32281  5114 28977 ...  5621  6133 26081]

Unique values for column: name

['Kimi no Na wa.' 'Fullmetal Alchemist: Brotherhood' 'Gintama°' ...
 'Violence Gekiga David no Hoshi'
 'Violence Gekiga Shin David no Hoshi: Inma Densetsu'
 'Yasuji no Pornorama: Yacchimae!!']

Unique values for column: genre

['Drama, Romance, School, Supernatural'
 'Action, Adventure, Drama, Fantasy, Magic, Military, Shounen'
 'Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen' ...
 'Hentai, Sports' 'Drama, Romance, School, Yuri' 'Hentai, Slice of Life']

Unique values for column: type

['Movie' 'TV' 'OVA' 'Special' 'Music' 'ONA']

Unique values for column: episodes

['1' '64' '51' '24' '10' '148' '110' '13' '201' '25' '22' '75' '4' '26'
 '12' '27' '43' '74' '37' '2' '11' '99' 'Unknown' '39' '101' '47' '50'
 '62' '33' '112' '23' '3' '94' '6' '8' '14' '7' '40' '15' '203' '77' '291'
 '120' '

In [15]:
# Sort by 'rating' only
data = data.sort_values(by='rating', ascending=False)

In [16]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_data,test_data = train_test_split(data, test_size=0.2, random_state=42)

In [17]:
print(train_data.shape)
print(test_data.shape)

(9835, 8)
(2459, 8)


In [18]:
#converting text data to feature
vectorizer = TfidfVectorizer()

In [19]:
genre_vectors = vectorizer.fit_transform(train_data['genre'])

In [20]:
from sklearn.preprocessing import MinMaxScaler

# Normalize rating using MinMaxScaler to ensure non-negativity
scaler = MinMaxScaler()
columns = ['rating']
rating_normalized = scaler.fit_transform(train_data[columns])


In [21]:
from scipy.sparse import hstack, csr_matrix

# Convert normalized ratings to a sparse matrix
rating_sparse = csr_matrix(rating_normalized)

# Combine genre vectors and normalized ratings
combined_features = hstack([genre_vectors, rating_sparse])

# Compute cosine similarity
similarity = cosine_similarity(combined_features)

In [22]:
print(similarity)

[[1.         0.42597506 0.25902386 ... 0.5873434  0.42047978 0.2909319 ]
 [0.42597506 1.         0.25025697 ... 0.52883929 0.37393276 0.28108506]
 [0.25902386 0.25025697 1.         ... 0.24280239 0.27903753 0.23144803]
 ...
 [0.5873434  0.52883929 0.24280239 ... 1.         0.35795693 0.27271217]
 [0.42047978 0.37393276 0.27903753 ... 0.35795693 1.         0.31341096]
 [0.2909319  0.28108506 0.23144803 ... 0.27271217 0.31341096 1.        ]]


In [23]:
print(similarity.shape)

(9835, 9835)


In [24]:
def recommend_animes(anime_name, train_data, similarity, top_n=5):
    import difflib
    
    # Get a list of all anime titles in the training data
    list_of_all_titles = train_data['name'].tolist()
    
    # Find the closest match to the input anime name
    find_close_match = difflib.get_close_matches(anime_name, list_of_all_titles)
    
    if not find_close_match:
        print("No close match found.")
        return []
    
    close_match = find_close_match[0]
    index_of_the_anime = train_data[train_data['name'] == close_match]['index'].values
    
    if len(index_of_the_anime) == 0:
        print("Anime not found in the dataset.")
        return []
    
    index_of_the_anime = index_of_the_anime[0]
    
    # Ensure index is within the bounds of the similarity matrix
    if index_of_the_anime < len(similarity):
        similarity_score = list(enumerate(similarity[index_of_the_anime]))
        sorted_similar_animes = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        
        print('Animes suggested for you:\n')
        recommendations = []
        i = 1
        for anime in sorted_similar_animes:
            index = anime[0]
            # Ensure the index is valid and exclude the input anime from recommendations
            if index < len(train_data) and index != index_of_the_anime:
                title_from_index = train_data[train_data['index'] == index]['name'].values
                if title_from_index.size > 0 and i <= top_n:
                    print(i, '.', title_from_index[0])
                    recommendations.append(title_from_index[0])
                    i += 1
        
        return recommendations
    else:
        print("Index out of bounds.")
        return []

# Example usage:
recommendations = recommend_animes('Naruto', train_data, similarity, top_n=5)


Animes suggested for you:

1 . Area no Kishi
2 . Kakumeiki Valvrave
3 . Eigo de Asobo: Tanken Goblin Tou
4 . SKET Dance
5 . Future Card Buddyfight Recap


In [27]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Example usage:
anime_name = 'One Piece'
recommendations = recommend_animes(anime_name, train_data, similarity, top_n=5)

# Assuming the first recommendation as the predicted value
predicted = recommendations[0] if recommendations else None

# Ground truth: Assuming 'Naruto' itself should be among the top recommendations
ground_truth = anime_name

# Evaluation
y_true = [ground_truth]  # Ground truth value
y_pred = [predicted] if predicted else []  # Predicted value

# Calculate precision, recall, F1 score, and accuracy
precision = precision_score(y_true, y_pred, average='macro', zero_division=1)
recall = recall_score(y_true, y_pred, average='macro', zero_division=1)
f1 = f1_score(y_true, y_pred, average='macro', zero_division=1)
accuracy = accuracy_score(y_true, y_pred)

print(f'Recommendations: {recommendations}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Accuracy: {accuracy:.2f}')


Animes suggested for you:

1 . Shouwa Monogatari
2 . DNA Sights 999.9
3 . Onegai My Melody Sukkiri♪
4 . Queen&#039;s Blade: Utsukushiki Toushitachi
5 . Ginga Eiyuu Densetsu
Recommendations: ['Shouwa Monogatari', 'DNA Sights 999.9', 'Onegai My Melody Sukkiri♪', 'Queen&#039;s Blade: Utsukushiki Toushitachi', 'Ginga Eiyuu Densetsu']
Precision: 0.50
Recall: 0.50
F1 Score: 0.00
Accuracy: 0.00
