# CS-4824 Intro to Machine Learning
# Rohan Jaggannagari
# Anime Machine Learning
Note: sources.txt shows which sources were used for this project

## Imports

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.cluster import KMeans
from math import dist

## Read in the data
I read in the data that was cleaned from the other script.

In [2]:
warnings.simplefilter("ignore")

anime_df = pd.read_csv("clean_data/clean_anime_data.csv")
pd.set_option('display.max_columns', 150) # Wanted to see all of the columns

# Getting rid of these columns as of now, but may add them back if I can process them in a proper way
drop_columns = ['English name', 'Synopsis', 'Duration', 'Unnamed: 0', 'Producers', 'Studios']
anime_df = anime_df.drop(columns = drop_columns)
anime_df.set_index('Name', inplace = True)

# Get rid of row with null values
anime_df = anime_df.dropna()

anime_df.head(5)

Unnamed: 0_level_0,Score,Genres,Type,Episodes,Licensors,Source,Rating
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi",TV,26.0,"Funimation, Bandai Entertainment",Original,R - 17+ (violence & profanity)
Cowboy Bebop: Tengoku no Tobira,8.38,"Action, Sci-Fi",Movie,1.0,Sony Pictures Entertainment,Original,R - 17+ (violence & profanity)
Trigun,8.22,"Action, Adventure, Sci-Fi",TV,26.0,"Funimation, Geneon Entertainment USA",Manga,PG-13 - Teens 13 or older
Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",TV,26.0,"Funimation, Bandai Entertainment",Original,PG-13 - Teens 13 or older
Bouken Ou Beet,6.94,"Adventure, Fantasy, Supernatural",TV,52.0,Illumitoon Entertainment,Manga,PG - Children


## Data Processing

### Make episodes all int and reducing their weights
I need to get rid of any of the rows that do not have episodes since that goes into my machine learning algorithm. An alternative could have been to give those rows a standardized number, but I decided against that. Also, I need to reduce the weights of the episodes since it is calculated using the euclidean distance. 

In [3]:
anime_df['Episodes'] = anime_df['Episodes'].astype(float)
anime_df['Episodes'] = anime_df['Episodes'] * 0.05

anime_df.head(5)

Unnamed: 0_level_0,Score,Genres,Type,Episodes,Licensors,Source,Rating
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi",TV,1.3,"Funimation, Bandai Entertainment",Original,R - 17+ (violence & profanity)
Cowboy Bebop: Tengoku no Tobira,8.38,"Action, Sci-Fi",Movie,0.05,Sony Pictures Entertainment,Original,R - 17+ (violence & profanity)
Trigun,8.22,"Action, Adventure, Sci-Fi",TV,1.3,"Funimation, Geneon Entertainment USA",Manga,PG-13 - Teens 13 or older
Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",TV,1.3,"Funimation, Bandai Entertainment",Original,PG-13 - Teens 13 or older
Bouken Ou Beet,6.94,"Adventure, Fantasy, Supernatural",TV,2.6,Illumitoon Entertainment,Manga,PG - Children


### Increase the weight for the score
I want to increase the weight for the score so that it is weighted more.

In [4]:
anime_df['Score'] = anime_df['Score'] * 2.5
anime_df.head(5)

Unnamed: 0_level_0,Score,Genres,Type,Episodes,Licensors,Source,Rating
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cowboy Bebop,21.875,"Action, Award Winning, Sci-Fi",TV,1.3,"Funimation, Bandai Entertainment",Original,R - 17+ (violence & profanity)
Cowboy Bebop: Tengoku no Tobira,20.95,"Action, Sci-Fi",Movie,0.05,Sony Pictures Entertainment,Original,R - 17+ (violence & profanity)
Trigun,20.55,"Action, Adventure, Sci-Fi",TV,1.3,"Funimation, Geneon Entertainment USA",Manga,PG-13 - Teens 13 or older
Witch Hunter Robin,18.125,"Action, Drama, Mystery, Supernatural",TV,1.3,"Funimation, Bandai Entertainment",Original,PG-13 - Teens 13 or older
Bouken Ou Beet,17.35,"Adventure, Fantasy, Supernatural",TV,2.6,Illumitoon Entertainment,Manga,PG - Children


### Encoding ratings
Ratings is a scale so we would want to start all ages as 0 and R+ as higher so the recomendation system accounts for that. I could have used a label encoder, but it would have randomly assigned those values and I needed there to be a followed pattern for it.

In [5]:
# All the categories that are possible
rating_categories = ['G - All Ages', 'PG - Children', 'PG-13 - Teens 13 or older', 'R - 17+ (violence & profanity)', 
                     'R+ - Mild Nudity', 'UNKNOWN']

rating_encoded = list() # Empty list that is going to be added too

# Iterate through each row and find append to a list
for i, current_row in anime_df.iterrows():
    if (current_row['Rating'] == rating_categories[0]):
        rating_encoded.append(0)
    elif (current_row['Rating'] == rating_categories[1]):
        rating_encoded.append(1)
    elif (current_row['Rating'] == rating_categories[2]):
        rating_encoded.append(2)
    elif (current_row['Rating'] == rating_categories[3]):
        rating_encoded.append(3)   
    elif (current_row['Rating'] == rating_categories[4]):
        rating_encoded.append(4)
    elif (current_row['Rating'] == rating_categories[5]):
        rating_encoded.append(5)

# Add the list as a column and drop the old ratings columns as it is not needed
anime_df['Rating Encoded'] = rating_encoded
anime_df = anime_df.drop(columns = 'Rating')

anime_df.head(5)

Unnamed: 0_level_0,Score,Genres,Type,Episodes,Licensors,Source,Rating Encoded
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cowboy Bebop,21.875,"Action, Award Winning, Sci-Fi",TV,1.3,"Funimation, Bandai Entertainment",Original,3
Cowboy Bebop: Tengoku no Tobira,20.95,"Action, Sci-Fi",Movie,0.05,Sony Pictures Entertainment,Original,3
Trigun,20.55,"Action, Adventure, Sci-Fi",TV,1.3,"Funimation, Geneon Entertainment USA",Manga,2
Witch Hunter Robin,18.125,"Action, Drama, Mystery, Supernatural",TV,1.3,"Funimation, Bandai Entertainment",Original,2
Bouken Ou Beet,17.35,"Adventure, Fantasy, Supernatural",TV,2.6,Illumitoon Entertainment,Manga,1


### Total genres
I need to get all of the genres to see how many unique ones there are. Depending on how many genres there are that is how many dimensions we will be adding to our dataframe.

In [6]:
total_genres = list()

for i, current_row in anime_df.iterrows():
    # Need to put the genres in a list
    split_genres = current_row['Genres'].split(',')
    
    # Iterate through the genres in the the row
    for g in range (0, len(split_genres)):
        # Get rid of any leading white space
        split_genres[g] = split_genres[g].strip()
        
        # Add the unique ones
        if (split_genres[g] not in total_genres):
            total_genres.append(split_genres[g])

print("All genres: " + str(total_genres))
print("Number of genres: " + str(len(total_genres)))

All genres: ['Action', 'Award Winning', 'Sci-Fi', 'Adventure', 'Drama', 'Mystery', 'Supernatural', 'Fantasy', 'Comedy', 'Romance', 'Suspense', 'Sports', 'Ecchi', 'Gourmet', 'Avant Garde', 'Horror', 'Slice of Life', 'Girls Love', 'Boys Love', 'UNKNOWN', 'Erotica']
Number of genres: 21


### Total licensors
I need to get all of the licensors to see how many unique ones there are. Depending on how many licensors there are that is how many dimensions we will be adding to our dataframe.

In [7]:
total_licensors = list()

for i, current_row in anime_df.iterrows():
    # Need to put the licensors in a list
    split_licensors = current_row['Licensors'].split(',')
    
    # Iterate through the licensors in the the row
    for g in range (0, len(split_licensors)):
        # Get rid of any leading white space
        split_licensors[g] = split_licensors[g].strip()
        
        # Add the unique ones
        if (split_licensors[g] not in total_licensors):
            total_licensors.append(split_licensors[g])

print("All licensors: " + str(total_licensors))
print("Number of licensors: " + str(len(total_licensors)))

All licensors: ['Funimation', 'Bandai Entertainment', 'Sony Pictures Entertainment', 'Geneon Entertainment USA', 'Illumitoon Entertainment', 'VIZ Media', 'Discotek Media', 'Nozomi Entertainment', 'ADV Films', 'GKIDS', 'Manga Entertainment', 'Media Blasters', 'NYAV Post', 'Aniplex of America', 'AnimEigo', 'Sentai Filmworks', 'UNKNOWN', 'Kadokawa Pictures USA', '4Kids Entertainment', 'Maiden Japan', 'Central Park Media', 'Flatiron Film Company', 'Tokyopop', 'Disney Platform Distribution', 'Harmony Gold', 'Synch-Point', 'Nelvana', 'NIS America', 'Inc.', 'Bandai Visual USA', 'AN Entertainment', 'Urban Vision', 'DreamWorks', 'Warner Bros. Pictures', 'DiC Entertainment', 'Enoki Films', 'Saban Entertainment', 'Crimson Star Media', 'Super Techno Arts', 'Kitty Media', 'Anchor Bay Films', 'Crunchyroll', 'Bandai', 'Eleven Arts', 'Miramax Films', 'Streamline Pictures', 'Shout! Factory', 'The Pokemon Company International', 'Voyager Entertainment', 'Konami Cross Media NY', 'Anime Midstream', 'Hasbr

### Total type
I need to get all of the sources to see how many unique ones there are. Depending on how many sources there are that is how many dimensions we will be adding to our dataframe.

In [8]:
total_types = list(anime_df['Type'].unique())

print("All types: " + str(total_types))
print("Number of types: " + str(len(total_types)))

All types: ['TV', 'Movie', 'OVA', 'Special', 'ONA', 'Music', 'UNKNOWN']
Number of types: 7


### Total sources
I need to get all of the types to see how many unique ones there are. Depending on how many types there are that is how many dimensions we will be adding to our dataframe.

In [9]:
total_sources = list(anime_df['Source'].unique())

print("All types: " + str(total_sources))
print("Number of types: " + str(len(total_sources)))

All types: ['Original', 'Manga', 'Light novel', 'Visual novel', '4-koma manga', 'Novel', 'Other', 'Game', 'Picture book', 'Unknown', 'Book', 'Music', 'Card game', 'Radio', 'Mixed media', 'Web manga', 'Web novel']
Number of types: 17


### One hot encoding

#### Genre
I needed to create a new column for each genre so that I can cluster it properly later. I would have used the sklearn library, but it would have created a unique one for combination. This way of doing it manually makes it easier to analyze.

In [10]:
# Add a column for each genre and set it to 0
for genre in total_genres:
    anime_df[genre] = 0
    
for i, current_row in anime_df.iterrows():
    split_genres = current_row['Genres'].split(',')
    
    for f in range (0, len(split_genres)):
        split_genres[f] = split_genres[f].strip()
        
        # If found, set that genre column to 1
        if (split_genres[f] in split_genres):
            anime_df.at[i, split_genres[f]] += 1

# Get rid of the original column
anime_df = anime_df.drop(columns = 'Genres')

anime_df.head(5)

Unnamed: 0_level_0,Score,Type,Episodes,Licensors,Source,Rating Encoded,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Comedy,Romance,Suspense,Sports,Ecchi,Gourmet,Avant Garde,Horror,Slice of Life,Girls Love,Boys Love,UNKNOWN,Erotica
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
Cowboy Bebop,21.875,TV,1.3,"Funimation, Bandai Entertainment",Original,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cowboy Bebop: Tengoku no Tobira,20.95,Movie,0.05,Sony Pictures Entertainment,Original,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Trigun,20.55,TV,1.3,"Funimation, Geneon Entertainment USA",Manga,2,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Witch Hunter Robin,18.125,TV,1.3,"Funimation, Bandai Entertainment",Original,2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Bouken Ou Beet,17.35,TV,2.6,Illumitoon Entertainment,Manga,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Licensors
I needed to one hot encode all of the licensors so that I can cluster it more accurately.

In [11]:
# Add a column for each genre and set it to 0
for licensors in total_licensors:
    if (licensors != 'UNKNOWN'):
        anime_df[licensors] = 0
    
for i, current_row in anime_df.iterrows():
    split_licensors = current_row['Licensors'].split(',')
    
    for f in range (0, len(split_licensors)):
        split_licensors[f] = split_licensors[f].strip()
        
        # If found, set that genre column to 1
        if (split_licensors[f] in split_licensors):
            anime_df.at[i, split_licensors[f]] += 1

# Get rid of the original column
anime_df = anime_df.drop(columns = 'Licensors')

anime_df.head(5)

Unnamed: 0_level_0,Score,Type,Episodes,Source,Rating Encoded,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Comedy,Romance,Suspense,Sports,Ecchi,Gourmet,Avant Garde,Horror,Slice of Life,Girls Love,Boys Love,UNKNOWN,Erotica,Funimation,Bandai Entertainment,Sony Pictures Entertainment,Geneon Entertainment USA,Illumitoon Entertainment,VIZ Media,Discotek Media,Nozomi Entertainment,ADV Films,GKIDS,Manga Entertainment,Media Blasters,NYAV Post,Aniplex of America,AnimEigo,Sentai Filmworks,Kadokawa Pictures USA,4Kids Entertainment,Maiden Japan,Central Park Media,Flatiron Film Company,Tokyopop,Disney Platform Distribution,Harmony Gold,Synch-Point,Nelvana,NIS America,Inc.,Bandai Visual USA,AN Entertainment,Urban Vision,DreamWorks,Warner Bros. Pictures,DiC Entertainment,Enoki Films,Saban Entertainment,Crimson Star Media,Super Techno Arts,Kitty Media,Anchor Bay Films,Crunchyroll,Bandai,Eleven Arts,Miramax Films,Streamline Pictures,Shout! Factory,The Pokemon Company International,Voyager Entertainment,Konami Cross Media NY,Anime Midstream,Hasbro,Saban Brands,Capcom,Hirameki International,Cinelicious Pics,Arts Magic,Frontier Works,Pied Piper,Mill Creek Entertainment,Marvel Entertainment,Nintendo of America,Dentsu Entertainment USA,Cookie Jar Entertainment,ADK Emotions NY,Ascendent Animation,Ketchup Entertainment,Kadokawa,Ponycan USA,Kuma Holdings,bilibili,NBCUniversal Entertainment Japan,Bandai Namco Games,Critical Mass Video,iQIYI,Muse Communication,Travel Compass
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1
Cowboy Bebop,21.875,TV,1.3,Original,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cowboy Bebop: Tengoku no Tobira,20.95,Movie,0.05,Original,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Trigun,20.55,TV,1.3,Manga,2,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Witch Hunter Robin,18.125,TV,1.3,Original,2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Bouken Ou Beet,17.35,TV,2.6,Manga,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Type
I needed to one hot encode all of the types so that I can cluster it more accurately.

In [12]:
# Add a column for each type and set it to 0
for c_type in total_types:
    if (c_type != 'UNKNOWN'):
        anime_df[c_type] = 0
        
for i, current_row in anime_df.iterrows():
    stripped_type = current_row['Type']
    
    for f in range (0, len(total_types)):
        if (stripped_type == total_types[f]):
            anime_df.at[i, total_types[f]] += 1
            
# Get rid of the original column
anime_df = anime_df.drop(columns = 'Type')
        
anime_df.head(5)

Unnamed: 0_level_0,Score,Episodes,Source,Rating Encoded,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Comedy,Romance,Suspense,Sports,Ecchi,Gourmet,Avant Garde,Horror,Slice of Life,Girls Love,Boys Love,UNKNOWN,Erotica,Funimation,Bandai Entertainment,Sony Pictures Entertainment,Geneon Entertainment USA,Illumitoon Entertainment,VIZ Media,Discotek Media,Nozomi Entertainment,ADV Films,GKIDS,Manga Entertainment,Media Blasters,NYAV Post,Aniplex of America,AnimEigo,Sentai Filmworks,Kadokawa Pictures USA,4Kids Entertainment,Maiden Japan,Central Park Media,Flatiron Film Company,Tokyopop,Disney Platform Distribution,Harmony Gold,Synch-Point,Nelvana,NIS America,Inc.,Bandai Visual USA,AN Entertainment,Urban Vision,DreamWorks,Warner Bros. Pictures,DiC Entertainment,Enoki Films,Saban Entertainment,Crimson Star Media,Super Techno Arts,Kitty Media,Anchor Bay Films,Crunchyroll,Bandai,Eleven Arts,Miramax Films,Streamline Pictures,Shout! Factory,The Pokemon Company International,Voyager Entertainment,Konami Cross Media NY,Anime Midstream,Hasbro,Saban Brands,Capcom,Hirameki International,Cinelicious Pics,Arts Magic,Frontier Works,Pied Piper,Mill Creek Entertainment,Marvel Entertainment,Nintendo of America,Dentsu Entertainment USA,Cookie Jar Entertainment,ADK Emotions NY,Ascendent Animation,Ketchup Entertainment,Kadokawa,Ponycan USA,Kuma Holdings,bilibili,NBCUniversal Entertainment Japan,Bandai Namco Games,Critical Mass Video,iQIYI,Muse Communication,Travel Compass,TV,Movie,OVA,Special,ONA,Music
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1
Cowboy Bebop,21.875,1.3,Original,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
Cowboy Bebop: Tengoku no Tobira,20.95,0.05,Original,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
Trigun,20.55,1.3,Manga,2,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
Witch Hunter Robin,18.125,1.3,Original,2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
Bouken Ou Beet,17.35,2.6,Manga,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


#### Source
I needed to one hot encode all of the sources so that I can cluster it more accurately.

In [13]:
# Add a column for each type and set it to 0
for c_sources in total_sources:
    if (c_sources != 'UNKNOWN'):
        anime_df[c_sources] = 0
        
for i, current_row in anime_df.iterrows():
    stripped_sources = current_row['Source']
    
    for f in range (0, len(total_sources)):
        if (stripped_sources == total_sources[f]):
            anime_df.at[i, total_sources[f]] += 1
            
# Get rid of the original column
anime_df = anime_df.drop(columns = 'Source')
        
anime_df.head(5)

Unnamed: 0_level_0,Score,Episodes,Rating Encoded,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Comedy,Romance,Suspense,Sports,Ecchi,Gourmet,Avant Garde,Horror,Slice of Life,Girls Love,Boys Love,UNKNOWN,Erotica,Funimation,Bandai Entertainment,Sony Pictures Entertainment,Geneon Entertainment USA,Illumitoon Entertainment,VIZ Media,Discotek Media,Nozomi Entertainment,ADV Films,GKIDS,Manga Entertainment,Media Blasters,NYAV Post,Aniplex of America,AnimEigo,Sentai Filmworks,Kadokawa Pictures USA,4Kids Entertainment,Maiden Japan,Central Park Media,Flatiron Film Company,Tokyopop,Disney Platform Distribution,Harmony Gold,Synch-Point,Nelvana,NIS America,Inc.,Bandai Visual USA,AN Entertainment,Urban Vision,DreamWorks,Warner Bros. Pictures,DiC Entertainment,Enoki Films,Saban Entertainment,Crimson Star Media,Super Techno Arts,Kitty Media,Anchor Bay Films,Crunchyroll,Bandai,Eleven Arts,Miramax Films,Streamline Pictures,Shout! Factory,The Pokemon Company International,Voyager Entertainment,Konami Cross Media NY,Anime Midstream,Hasbro,Saban Brands,Capcom,Hirameki International,Cinelicious Pics,Arts Magic,Frontier Works,Pied Piper,Mill Creek Entertainment,Marvel Entertainment,Nintendo of America,Dentsu Entertainment USA,Cookie Jar Entertainment,ADK Emotions NY,Ascendent Animation,Ketchup Entertainment,Kadokawa,Ponycan USA,Kuma Holdings,bilibili,NBCUniversal Entertainment Japan,Bandai Namco Games,Critical Mass Video,iQIYI,Muse Communication,Travel Compass,TV,Movie,OVA,Special,ONA,Music,Original,Manga,Light novel,Visual novel,4-koma manga,Novel,Other,Game,Picture book,Unknown,Book,Card game,Radio,Mixed media,Web manga,Web novel
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1
Cowboy Bebop,21.875,1.3,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cowboy Bebop: Tengoku no Tobira,20.95,0.05,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Trigun,20.55,1.3,2,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Witch Hunter Robin,18.125,1.3,2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Bouken Ou Beet,17.35,2.6,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## K Means Clustering

### Put in chosen animes

In [14]:
chosen_animes = ["Naruto", "Jujutsu Kaisen", "Bleach", "Deathnote"]

chosen_index_values = list()
current_index = 0
for i, current_row in anime_df.iterrows():
    for g in range (0, len(chosen_animes)):
        if (str(chosen_animes[g]) == str(i)):
            chosen_index_values.append(current_index)
    current_index = current_index + 1
chosen_index_values

predict_input = np.array(anime_df.iloc[chosen_index_values].mean(axis = 0))

### Apply k-means

In [15]:
print("Anime Recomendation Results with k-Means")
print("Anime Inputs: " + str(chosen_animes) + '\n')

k_clusters = [10, 25, 50, 75, 100, 200]

for g in range (0, len(k_clusters)):
    km = KMeans(n_clusters = k_clusters[g])
    km.fit(anime_df)

    fitted_clusters = km.labels_
    predicted_cluster = km.predict(predict_input.reshape(1, -1))

    found_index_values = list()
    current_index = 0
    for i, current_row in anime_df.iterrows():
        if (int(fitted_clusters[current_index]) == int(predicted_cluster[0])):
            found_index_values.append(current_index)
        current_index = current_index + 1

    results = list(anime_df.iloc[found_index_values].index)
    
    output_df = pd.DataFrame(results, columns = ['Recomended Animes'])
    output_df['Eucldiean Distance'] = 0
    
    for h, current_r_row in output_df.iterrows():
        current_anime = current_r_row['Recomended Animes']
        
        anime_df_count = 0
        for l, current_a_row in anime_df.iterrows():
            if (str(l) == str(current_anime)):
                anime_data = np.array(anime_df.iloc[anime_df_count])
                euc_dist = dist(anime_data, predict_input)
                output_df['Eucldiean Distance'][h] = euc_dist
                break
                
            anime_df_count = anime_df_count + 1

    print("k Value:" + str(k_clusters[g]))
    
    output_df = output_df.sort_values(by = 'Eucldiean Distance')
    output_df = output_df.reset_index(drop = True)
    
    display(output_df.head(20))

Anime Recomendation Results with k-Means
Anime Inputs: ['Naruto', 'Jujutsu Kaisen', 'Bleach', 'Deathnote']

k Value:10


Unnamed: 0,Recomended Animes,Eucldiean Distance
0,Naruto,1.123116
1,Katekyo Hitman Reborn!,1.919563
2,Yu☆Gi☆Oh! Duel Monsters,2.737307
3,Urusei Yatsura,3.12912
4,Gintama,3.202918
5,Pokemon Diamond & Pearl,3.421795
6,Pokemon Advanced Generation,3.810694
7,Tetsuwan Atom,4.306044
8,Dragon Ball Z,4.748721
9,Pokemon,4.830689


k Value:25


Unnamed: 0,Recomended Animes,Eucldiean Distance
0,Naruto,1.123116
1,Katekyo Hitman Reborn!,1.919563
2,InuYasha,2.299351
3,Fairy Tail,2.586772
4,Tennis no Ouji-sama,2.631447
5,Black Clover,2.730601
6,Yu☆Gi☆Oh! Duel Monsters,2.737307
7,Urusei Yatsura,3.12912
8,Ranma ½,3.151774
9,Gintama,3.202918


k Value:50


Unnamed: 0,Recomended Animes,Eucldiean Distance
0,Naruto,1.123116
1,Katekyo Hitman Reborn!,1.919563
2,InuYasha,2.299351
3,Fairy Tail,2.586772
4,Tennis no Ouji-sama,2.631447
5,Black Clover,2.730601
6,Yu☆Gi☆Oh! Duel Monsters,2.737307
7,Urusei Yatsura,3.12912
8,Ranma ½,3.151774
9,Gintama,3.202918


k Value:75


Unnamed: 0,Recomended Animes,Eucldiean Distance
0,Naruto,1.123116
1,Katekyo Hitman Reborn!,1.919563
2,InuYasha,2.299351
3,Fairy Tail,2.586772
4,Tennis no Ouji-sama,2.631447
5,Black Clover,2.730601
6,Yu☆Gi☆Oh! Duel Monsters,2.737307
7,Urusei Yatsura,3.12912
8,Ranma ½,3.151774
9,Gintama,3.202918


k Value:100


Unnamed: 0,Recomended Animes,Eucldiean Distance
0,Naruto,1.123116
1,Katekyo Hitman Reborn!,1.919563
2,InuYasha,2.299351
3,Fairy Tail,2.586772
4,Tennis no Ouji-sama,2.631447
5,Black Clover,2.730601
6,Yu☆Gi☆Oh! Duel Monsters,2.737307
7,Urusei Yatsura,3.12912
8,Ranma ½,3.151774
9,Gintama,3.202918


k Value:200


Unnamed: 0,Recomended Animes,Eucldiean Distance
0,Naruto,1.123116
1,Katekyo Hitman Reborn!,1.919563
2,Yu☆Gi☆Oh! Duel Monsters,2.737307
3,Urusei Yatsura,3.12912
4,Gintama,3.202918
5,Dragon Ball Z,4.748721
6,Ni Tian Zhizun,4.982776
