# CS-4824 Intro to Machine Learning
# Rohan Jaggannagari
# Anime Machine Learning
Note: sources.txt shows which sources were used for this project

## Imports

In [1]:
import pandas as pd

***

## Read in the data
I read in the data that was cleaned by the other script.

In [2]:
# Display setting: allow up to 100 columns so every column is visible in the output
pd.set_option('display.max_columns', 100)

# Columns left out for now; they may be added back once they can be processed in a proper way
drop_columns = ['English name', 'Synopsis', 'Duration', 'Unnamed: 0']

# Load the cleaned data, remove the unused columns, then discard every row that contains a null value
anime_df = (
    pd.read_csv("clean_data/clean_anime_data.csv")
    .drop(columns=drop_columns)
    .dropna()
)

anime_df.head(5)

Unnamed: 0,Name,Score,Genres,Type,Episodes,Producers,Licensors,Studios,Source,Rating
0,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi",TV,26.0,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,R - 17+ (violence & profanity)
1,Cowboy Bebop: Tengoku no Tobira,8.38,"Action, Sci-Fi",Movie,1.0,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,R - 17+ (violence & profanity)
2,Trigun,8.22,"Action, Adventure, Sci-Fi",TV,26.0,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,PG-13 - Teens 13 or older
3,Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",TV,26.0,"Bandai Visual, Dentsu, Victor Entertainment, T...","Funimation, Bandai Entertainment",Sunrise,Original,PG-13 - Teens 13 or older
4,Bouken Ou Beet,6.94,"Adventure, Fantasy, Supernatural",TV,52.0,"TV Tokyo, Dentsu",Illumitoon Entertainment,Toei Animation,Manga,PG - Children


***

## Data Processing

### Make episodes numeric and reduce their weight
Rows that do not have an episode count cannot be used, since the episode count goes into my machine learning algorithm. An alternative could have been to give those rows a standardized number, but I decided against that. I also need to reduce the weight of the episode count: similarity is calculated using the Euclidean distance, so a large raw episode count would otherwise dominate the other features.

In [3]:
# Factor applied to the episode count to shrink its contribution
episode_weight = 0.05

# Cast the episode counts to float, then apply the weight
episodes_as_float = anime_df['Episodes'].astype(float)
anime_df['Episodes'] = episodes_as_float * episode_weight

anime_df.head(5)

Unnamed: 0,Name,Score,Genres,Type,Episodes,Producers,Licensors,Studios,Source,Rating
0,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi",TV,1.3,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,R - 17+ (violence & profanity)
1,Cowboy Bebop: Tengoku no Tobira,8.38,"Action, Sci-Fi",Movie,0.05,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,R - 17+ (violence & profanity)
2,Trigun,8.22,"Action, Adventure, Sci-Fi",TV,1.3,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,PG-13 - Teens 13 or older
3,Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",TV,1.3,"Bandai Visual, Dentsu, Victor Entertainment, T...","Funimation, Bandai Entertainment",Sunrise,Original,PG-13 - Teens 13 or older
4,Bouken Ou Beet,6.94,"Adventure, Fantasy, Supernatural",TV,2.6,"TV Tokyo, Dentsu",Illumitoon Entertainment,Toei Animation,Manga,PG - Children


### Increase the weight for the score
I multiply the score by a constant factor so that it counts for more during the clustering.

In [4]:
# Factor applied to the score to enlarge its contribution
score_weight = 2.5

anime_df['Score'] = anime_df['Score'].mul(score_weight)

anime_df.head(5)

Unnamed: 0,Name,Score,Genres,Type,Episodes,Producers,Licensors,Studios,Source,Rating
0,Cowboy Bebop,21.875,"Action, Award Winning, Sci-Fi",TV,1.3,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,R - 17+ (violence & profanity)
1,Cowboy Bebop: Tengoku no Tobira,20.95,"Action, Sci-Fi",Movie,0.05,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,R - 17+ (violence & profanity)
2,Trigun,20.55,"Action, Adventure, Sci-Fi",TV,1.3,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,PG-13 - Teens 13 or older
3,Witch Hunter Robin,18.125,"Action, Drama, Mystery, Supernatural",TV,1.3,"Bandai Visual, Dentsu, Victor Entertainment, T...","Funimation, Bandai Entertainment",Sunrise,Original,PG-13 - Teens 13 or older
4,Bouken Ou Beet,17.35,"Adventure, Fantasy, Supernatural",TV,2.6,"TV Tokyo, Dentsu",Illumitoon Entertainment,Toei Animation,Manga,PG - Children


### Encoding ratings
Ratings form an ordered scale, so I want "All Ages" to be encoded as 0 and R+ as a higher number so that the recommendation system accounts for that order. I could have used a label encoder, but it would have assigned the values without regard to this order, and I needed the codes to follow the scale.

In [5]:
# All the categories that are possible, in the order used for the encoding:
# a label's position in this list is its numeric code ('UNKNOWN' is last, code 5)
rating_categories = ['G - All Ages', 'PG - Children', 'PG-13 - Teens 13 or older', 'R - 17+ (violence & profanity)',
                     'R+ - Mild Nudity', 'UNKNOWN']

# Lookup table from rating label to its position in rating_categories
rating_to_code = {label: code for code, label in enumerate(rating_categories)}

# Translate the whole column at once; labels missing from the lookup become NaN
encoded_ratings = anime_df['Rating'].map(rating_to_code)

# Fail with a clear message on unexpected labels instead of silently skipping
# those rows (which would leave the encoded values shorter than the dataframe)
unmatched_rows = encoded_ratings.isna()
if unmatched_rows.any():
    unexpected_labels = sorted(anime_df.loc[unmatched_rows, 'Rating'].astype(str).unique())
    raise ValueError("Unrecognized rating values: " + str(unexpected_labels))

# Plain list of int codes, one per row, in row order
rating_encoded = encoded_ratings.astype(int).tolist()

# Add the list as a column and drop the old ratings column as it is not needed
anime_df['Rating Encoded'] = rating_encoded
anime_df = anime_df.drop(columns = 'Rating')

anime_df.head(5)

Unnamed: 0,Name,Score,Genres,Type,Episodes,Producers,Licensors,Studios,Source,Rating Encoded
0,Cowboy Bebop,21.875,"Action, Award Winning, Sci-Fi",TV,1.3,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,3
1,Cowboy Bebop: Tengoku no Tobira,20.95,"Action, Sci-Fi",Movie,0.05,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,3
2,Trigun,20.55,"Action, Adventure, Sci-Fi",TV,1.3,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,2
3,Witch Hunter Robin,18.125,"Action, Drama, Mystery, Supernatural",TV,1.3,"Bandai Visual, Dentsu, Victor Entertainment, T...","Funimation, Bandai Entertainment",Sunrise,Original,2
4,Bouken Ou Beet,17.35,"Adventure, Fantasy, Supernatural",TV,2.6,"TV Tokyo, Dentsu",Illumitoon Entertainment,Toei Animation,Manga,1


### Total genres
I need to get all of the genres to see how many unique ones there are. The number of unique genres is the number of dimensions (columns) we will be adding to our dataframe.

In [6]:
# Gather every genre name in order of first appearance; dict keys keep
# insertion order and ignore repeats
seen_genres = {}

for genre_string in anime_df['Genres']:
    # Each row stores its genres as one comma-separated string
    for raw_name in genre_string.split(','):
        # Remove the white space left around the name by the split
        seen_genres.setdefault(raw_name.strip(), None)

total_genres = list(seen_genres)

print(f"All genres: {total_genres}")
print(f"Number of genres: {len(total_genres)}")

All genres: ['Action', 'Award Winning', 'Sci-Fi', 'Adventure', 'Drama', 'Mystery', 'Supernatural', 'Fantasy', 'Comedy', 'Romance', 'Suspense', 'Sports', 'Ecchi', 'Gourmet', 'Avant Garde', 'Horror', 'Slice of Life', 'Girls Love', 'Boys Love', 'UNKNOWN', 'Erotica']
Number of genres: 21


### Total type
I need to get all of the types to see how many unique ones there are. The number of unique types is the number of dimensions (columns) we will be adding to our dataframe.

In [7]:
# Distinct values found in the Type column, converted to a plain Python list
type_values = anime_df['Type'].unique()
total_types = type_values.tolist()

print(f"All types: {total_types}")
print(f"Number of types: {len(total_types)}")

All types: ['TV', 'Movie', 'OVA', 'Special', 'ONA', 'Music', 'UNKNOWN']
Number of types: 7


### One hot encoding

#### Genre
I needed to create a new column for each genre so that I can cluster it properly later. I would have used the sklearn library, but it would have created a separate column for each unique combination of genres. Doing it manually gives one column per individual genre, which makes it easier to analyze.

In [8]:
# Parse each row's comma-separated genre string once into a set of clean
# (white-space stripped) genre names
genres_per_row = [
    {raw_name.strip() for raw_name in genre_string.split(',')}
    for genre_string in anime_df['Genres']
]

# Add one column per known genre: 1 if the row lists that genre, otherwise 0.
# Only names in total_genres get a column, so an unexpected value in the data
# cannot create a stray column.
for genre in total_genres:
    anime_df[genre] = [int(genre in row_genres) for row_genres in genres_per_row]

# Get rid of the original column
anime_df = anime_df.drop(columns = 'Genres')

anime_df.head(5)

Unnamed: 0,Name,Score,Type,Episodes,Producers,Licensors,Studios,Source,Rating Encoded,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Comedy,Romance,Suspense,Sports,Ecchi,Gourmet,Avant Garde,Horror,Slice of Life,Girls Love,Boys Love,UNKNOWN,Erotica
0,Cowboy Bebop,21.875,TV,1.3,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Cowboy Bebop: Tengoku no Tobira,20.95,Movie,0.05,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Trigun,20.55,TV,1.3,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,2,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Witch Hunter Robin,18.125,TV,1.3,"Bandai Visual, Dentsu, Victor Entertainment, T...","Funimation, Bandai Entertainment",Sunrise,Original,2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Bouken Ou Beet,17.35,TV,2.6,"TV Tokyo, Dentsu",Illumitoon Entertainment,Toei Animation,Manga,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Type
I needed to one-hot encode all of the types so that I can cluster the anime more accurately.

In [9]:
# Add one column per type: 1 if the row has that type, otherwise 0.
# 'UNKNOWN' is skipped on purpose: a column with that name already exists from
# the genre encoding, so creating or setting it here would mix "unknown type"
# into the "unknown genre" flag. Rows whose type is 'UNKNOWN' are left with 0
# in every type column.
for c_type in total_types:
    if c_type == 'UNKNOWN':
        continue
    anime_df[c_type] = (anime_df['Type'] == c_type).astype('int64')

# Get rid of the original column
anime_df = anime_df.drop(columns = 'Type')

anime_df.head(5)

Unnamed: 0,Name,Score,Episodes,Producers,Licensors,Studios,Source,Rating Encoded,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Comedy,Romance,Suspense,Sports,Ecchi,Gourmet,Avant Garde,Horror,Slice of Life,Girls Love,Boys Love,UNKNOWN,Erotica,TV,Movie,OVA,Special,ONA,Music
0,Cowboy Bebop,21.875,1.3,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,Cowboy Bebop: Tengoku no Tobira,20.95,0.05,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,Trigun,20.55,1.3,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,2,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,Witch Hunter Robin,18.125,1.3,"Bandai Visual, Dentsu, Victor Entertainment, T...","Funimation, Bandai Entertainment",Sunrise,Original,2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,Bouken Ou Beet,17.35,2.6,"TV Tokyo, Dentsu",Illumitoon Entertainment,Toei Animation,Manga,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
