In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt, seaborn as sns

from kmodes.kprototypes import KPrototypes
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the top-1000 ranking table from the Kaggle input directory and preview it
anime = pd.read_csv(
    filepath_or_buffer='/kaggle/input/top-1000-ranked-mangas-by-myanimelist/top_1000.csv'
)
anime.head(n=5)

In [None]:
# Remove the unwanted columns:
#   - 'Unnamed: 0' is dropped from `anime` itself (presumably a saved index column)
#   - the title / synopsis text columns are left out of the modelling frame `df` only
anime = anime.drop(columns='Unnamed: 0')
text_columns = ['Title_Japanese', 'Title', 'Title_Synonym', 'Synopsis']
df = anime.drop(columns=text_columns)

In [None]:
# Preview the modelling frame after the text columns were removed
df.head(n=5)

In [None]:
# 'Not available' publish period is set to the right date based on internet search.
# NOTE(review): every row whose period is 'Not available' receives these same dates —
# confirm the dataset contains only the one title this was looked up for.
# The mask is built on `df` itself (the frame being modified), and the replacement
# text no longer carries a stray leading tab character.
missing_period = df['Publish_period'] == 'Not available'
df.loc[missing_period, 'Publish_period'] = 'April 7, 1974 to August 24, 1975'

In [None]:
# Split every publish period on ' to ': the first piece becomes the start date and
# the last piece the end date (both are the same piece when there is no ' to ').
period_parts = df['Publish_period'].map(lambda period: period.split(' to '))
df['Publish_on'] = period_parts.map(lambda parts: parts[0])
df['Publish_till'] = period_parts.map(lambda parts: parts[-1])

# The raw period string is no longer needed
df = df.drop(columns='Publish_period')
df.head()

In [None]:
# Rows whose start and end dates are identical get their end date marked 'unknown'
same_start_and_end = df['Publish_on'] == df['Publish_till']
df.loc[same_start_and_end, 'Publish_till'] = 'unknown'

# Sanity check: no row should have matching start and end values any more
df[df['Publish_on'] == df['Publish_till']]

In [None]:
# Keep only the year of each date: the last space-separated token, cast to int

df['Publish_on'] = df['Publish_on'].map(lambda date: int(date.split(' ')[-1]))

# End dates flagged 'present' or 'unknown' carry no year and are left untouched
has_end_year = ~df['Publish_till'].isin(['present', 'unknown'])
df.loc[has_end_year, 'Publish_till'] = (
    df.loc[has_end_year, 'Publish_till'].map(lambda date: int(date.split(' ')[-1]))
)

In [None]:
# New feature: end year minus start year, computed only for rows with a real end
# year; rows ending in 'present' or 'unknown' are left as missing values.
has_end_year = ~df['Publish_till'].isin(['present', 'unknown'])
df['Time_period'] = df.loc[has_end_year, 'Publish_till'].sub(df.loc[has_end_year, 'Publish_on'])
df['Time_period'] = pd.to_numeric(df['Time_period'])

# The end-of-publication column is not needed beyond this point
df = df.drop(columns='Publish_till')

In [None]:
# Bin 'Time_period' into five quantile-based buckets
quantile_edges = [0, 0.2, 0.4, 0.6, 0.8, 1]
duration_labels = ['very short', 'short', 'moderate', 'long', 'very long']
duration_bins = pd.qcut(df['Time_period'], q=quantile_edges, labels=duration_labels)

# Rows that received no bucket (missing duration) are labelled 'unknown'
df['Time_period'] = duration_bins.astype('object').fillna('unknown')

df.head()

In [None]:
# Mirror the newly engineered columns onto the full `anime` frame
for engineered_column in ('Publish_on', 'Time_period'):
    anime[engineered_column] = df[engineered_column]

In [None]:
# Collect every genre that occurs in the frame.
# The distinct 'Genre' strings are joined and re-split on ', ' to get one token per
# genre mention; [1:-1] removes the first and last character of each token
# (presumably the surrounding quotes — confirm against the raw column).
genre_tokens = ', '.join(df['Genre'].value_counts().index).split(', ')

# dict.fromkeys drops repeats while keeping first-seen order
genres = list(dict.fromkeys(token[1:-1] for token in genre_tokens))

genres

In [None]:
# Dummy columns are made for each genre and the initial 'Genre' column is dropped.
#
# The indicator frame is built in one pass instead of writing one cell at a time with
# `df.loc[row, genre] = True`; that repeatedly enlarges the frame and then needs a
# `fillna(False)` on object columns. The result has plain boolean columns.

# One list of genre names per row; [1:-1] strips the first and last character of each
# ', '-separated token (presumably the surrounding quotes — confirm against raw data).
genre_lists = df['Genre'].map(lambda cell: [name[1:-1] for name in cell.split(', ')])

# Columns are created in the order genres are first met while scanning the rows,
# which is the column order the cell-by-cell loop produced.
genre_columns = list(dict.fromkeys(name for names in genre_lists for name in names))

genre_flags = pd.DataFrame(
    [[name in names for name in genre_columns] for names in genre_lists],
    index=df.index,
    columns=genre_columns,
    dtype=bool,
)

df = pd.concat([df.drop(columns='Genre'), genre_flags], axis=1)

In [None]:
# Attach the per-genre indicator columns to the full `anime` frame, side by side
genre_indicators = df.loc[:, genres]
anime = pd.concat(objs=[anime, genre_indicators], axis='columns')

In [None]:
# Preview the modelling frame with the genre indicator columns in place
df.head(n=5)

In [None]:
# Split the features into numerical and categorical groups

# Numerical: int64/float64 columns, leaving out any genre indicator column
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
num_vars = [column for column in numeric_columns if column not in genres]

# Categorical: every remaining column
cat_vars = [column for column in df.columns if column not in num_vars]

In [None]:
# Scale the numerical features: fit the StandardScaler, then overwrite the columns
# with their transformed values
scaler = StandardScaler()
scaler.fit(df[num_vars])
df[num_vars] = scaler.transform(df[num_vars])

In [None]:
# Positional indices (within df.columns) of all categorical variables;
# passed to KPrototypes through its `categorical` argument
categorical = [
    position
    for position, column in enumerate(df.columns)
    if column in cat_vars
]

In [None]:
# Fit K-Prototypes for every candidate cluster count from 2 to 10 and record the
# resulting cost of each fit
no_of_clusters = list(range(2, 11))
feature_matrix = df.to_numpy()

costs = []
for n_clusters in no_of_clusters:
    kp = KPrototypes(
        n_clusters=n_clusters,
        n_init=5,
        random_state=50,
        n_jobs=-1,
        verbose=1,
    )
    kp.fit_predict(feature_matrix, categorical=categorical)
    costs.append(kp.cost_)

In [None]:
# Elbow plot: clustering cost for each candidate number of clusters.
# Axis labels, a title and point markers are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(no_of_clusters, costs, marker='o')
ax.set_xticks(no_of_clusters)  # one tick per evaluated cluster count
ax.set_xlabel('Number of clusters')
ax.set_ylabel('K-Prototypes cost')
ax.set_title('Clustering cost vs. number of clusters')
plt.show()

In [None]:
# Final model: K-Prototypes with 6 clusters, using the same settings as the cost sweep
kp = KPrototypes(
    n_clusters=6,
    n_init=5,
    random_state=50,
    n_jobs=-1,
    verbose=1,
)
kp.fit_predict(df.to_numpy(), categorical=categorical)

In [None]:
# Store the fitted cluster labels on both the modelling frame and the full frame
for frame in (df, anime):
    frame['cluster_id'] = kp.labels_
anime.head()

In [None]:
# One boxplot per int64/float64 column of `anime`, grouped by cluster id
nums = anime.select_dtypes(include=['int64', 'float64']).columns

for numeric_column in nums:
    sns.boxplot(data=anime, x='cluster_id', y=numeric_column)
    plt.show()

### According to the visualizations, cluster 5 seems to be the best set of anime, followed by cluster 0. In terms of popularity, cluster 2 anime are more popular since they are the most recent ones.
### Therefore, my personal recommendation is cluster 5 anime.

In [None]:
# Bar chart of the 'Time_period' distribution for each cluster of interest.
# A single loop replaces the copy-pasted per-cluster code, so the title always
# matches the cluster actually plotted.
for cluster in (5, 0):
    in_cluster = anime['cluster_id'] == cluster
    anime.loc[in_cluster, 'Time_period'].value_counts().plot.bar(figsize=(20, 8))
    plt.xticks(fontsize=15, rotation=0)
    plt.yticks(fontsize=15)
    plt.title('Time period for Cluster {}'.format(cluster), fontsize=30)
    plt.show()

In [None]:
# Percentage share of every genre among all genre tags within a cluster, plotted for
# each cluster of interest.
#
# The genre indicator columns are selected by name through the `genres` list instead
# of the positional slice iloc[:, 18:-1], so the selection stays correct if columns
# are added to or removed from `anime`. One loop replaces the duplicated per-cluster
# code and the title is derived from `cluster_id`.
# `cluster_id` and `genre_plot` remain module-level names, ending on cluster 0.
for cluster_id in (5, 0):
    genre_counts = anime.loc[anime.cluster_id == cluster_id, genres].sum()
    genre_plot = round(100 * genre_counts / genre_counts.sum(), 2)

    plt.figure(figsize=(20, 8))
    plt.bar(genre_plot.index, genre_plot.values)
    plt.xticks(fontsize=15, rotation=90)
    plt.yticks(fontsize=15)
    plt.title('Percentage of genres in Cluster {}'.format(cluster_id), fontsize=30)
    plt.show()

### Cluster 0 contains more 'drama', 'romance' and 'comedy' than cluster 5, whereas cluster 5 contains more of the 'super power' and 'adventure' genres.
### Thus we can say cluster 5 is mostly targeted at kids, whereas cluster 0 is targeted at adults.

In [None]:
# Titles of all anime assigned to cluster 5
anime.loc[anime['cluster_id'] == 5, 'Title']

In [None]:
# Titles of all anime assigned to cluster 0
anime.loc[anime['cluster_id'] == 0, 'Title']