In [None]:
from collections import Counter
from statistics import mean

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import cm
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from wordcloud import WordCloud

In [None]:
# Load the two CSV tables used throughout the notebook.
# `anime` is later joined to `user` on the anime id (MAL_ID / anime_id).
anime = pd.read_csv("../input/anime-recommendation-database-2020/anime.csv")
user = pd.read_csv("../input/anime-recommendation-database-2020/animelist.csv")

In [None]:
# Preview the first rows of the anime table.
anime.head()

In [None]:
# Preview the first rows of the user list table.
user.head()

In [None]:
# (rows, columns) of the user list as loaded, before it is truncated below.
print(user.shape)

In [None]:
# Work on the leading slice of the list only. `.loc` slices by label and
# includes the end label, so the row labelled 60000 is kept as well.
user = user.loc[:60000]
user.shape

In [None]:
# Number of distinct users in the truncated list.
# (Summing the unique ids would only add up the id values themselves,
# which says nothing about how many users there are.)
user["user_id"].nunique()

In [None]:
# Per-user mean of the given rating and of the watched-episode count.
rating_mu_user = user.groupby("user_id")[["rating", "watched_episodes"]].mean()
# Ten users with the highest mean rating.
rating_mu_user.sort_values("rating", ascending=False).head(10)

In [None]:
# Ten users with the lowest mean rating (tail of the descending sort).
rating_mu_user.sort_values("rating", ascending=False).tail(10)

In [None]:
# Three panels: rating vs. watched episodes, then the distribution of each.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 5))
ax = axes.ravel()

# Correlation between the two per-user means, shown in the scatter title.
corr = rating_mu_user.corr().loc["rating", "watched_episodes"]
sns.scatterplot(data=rating_mu_user, x="rating", y="watched_episodes", ax=ax[0])
ax[0].set_title(f"corr: {corr:.3f}")

sns.histplot(rating_mu_user["rating"], ax=ax[1])
sns.histplot(rating_mu_user["watched_episodes"], ax=ax[2])

In [None]:
# Prefix the per-user means so they do not clash with the per-row columns.
rating_mu_user.columns = ["mu_rating", "mu_watched_episodes"]

# Attach each user's means to every one of their rows ...
above_avg_user = user.merge(rating_mu_user, how="left", on="user_id")
# ... and keep only the rows rated above that user's own mean rating.
above_avg_user = above_avg_user.loc[lambda df: df["rating"] > df["mu_rating"]]
above_avg_user.head()

In [None]:
# How often each watching status occurs among the above-average rows.
status_counts = above_avg_user["watching_status"].value_counts()
status = pd.DataFrame({"watching_status_cnt": status_counts.to_numpy(),
                       "watching_status": status_counts.index})

# Look up the description of each status code.
status_list = pd.read_csv("../input/anime-recommendation-database-2020/watching_status.csv")
status = status.merge(status_list, how="left", left_on="watching_status", right_on="status")

# The fourth column of the merged frame is used as the label text; it is
# addressed by position, exactly as before.
status_labels = status.iloc[:, 3]

plt.figure(figsize=(8, 8))
plt.pie(x=status["watching_status_cnt"], labels=status_labels, counterclock=False,
        autopct="%1.1f%%", startangle=90, shadow=True)
plt.legend(status_labels)
plt.show()

In [None]:
# Keep rows with watching_status == 1 and replace the per-row episode count
# with the user's mean episode count (renamed back to "watched_episodes").
above_avg_user = (
    above_avg_user.loc[above_avg_user["watching_status"] == 1]
    .drop(columns=["mu_rating", "watched_episodes", "watching_status"])
    .rename(columns={"mu_watched_episodes": "watched_episodes"})
)
above_avg_user.head()

In [None]:
# Preview the last rows of the filtered user list.
above_avg_user.tail()

In [None]:
# Columns that are not used in the rest of the analysis.
unused_columns = ["English name", "Japanese name", "Aired", "Premiered", "Type",
                  "Producers", "Licensors", "Studios", "Source", "Duration", "Rating"]
anime = anime.drop(columns=unused_columns)

# Vote-count columns "Score-10" ... "Score-1": the placeholder "Unknown"
# becomes 0.0, every other value is converted to float.
for score_column in [f"Score-{n}" for n in range(10, 0, -1)]:
    anime[score_column] = anime[score_column].apply(
        lambda x: 0.0 if x == "Unknown" else float(x)
    )
anime.head()

In [None]:
# NPS = positive rate (scores 9-10) - negative rate (scores 1-6),
# each taken as a share of all votes for the title.
score_columns = [f"Score-{n}" for n in range(10, 0, -1)]

# Summed in the order Score-10 ... Score-1.
all_score = sum(anime[column] for column in score_columns)
positive = sum(anime[column] for column in ("Score-10", "Score-9")) / all_score
negative = sum(anime[f"Score-{n}"] for n in range(1, 7)) / all_score

# Note: a title whose vote counts are all 0.0 divides 0 by 0 and gets NaN here.
anime["NPS"] = positive - negative

# The raw vote counts are no longer needed once NPS is stored.
anime = anime.drop(columns=score_columns)
anime.head()

In [None]:
# Give both tables unambiguous column names before joining them.
anime = anime.rename(columns={"MAL_ID": "anime_id", "Score": "anime_rating"})
above_avg_user = above_avg_user.rename(columns={"rating": "user_rating"})

# Right join: every user row is kept, with the matching anime columns attached.
# The join key is dropped afterwards and the rows are ordered by user.
anime_user_list = (
    anime.merge(above_avg_user, how="right", on="anime_id")
    .drop(columns="anime_id")
    .sort_values("user_id")
)

In [None]:
# One feature row per user: the mean of every numeric column over that user's
# rows. `numeric_only=True` is required because the frame still carries text
# columns (e.g. "Name", "Genres"); without it pandas >= 2.0 raises TypeError
# instead of silently skipping them.
user_cnt = anime_user_list.groupby("user_id").mean(numeric_only=True)

# Rescale every feature to [0, 1] so that no column dominates PCA / k-means
# purely because of its units.
s = MinMaxScaler(feature_range=(0.0, 1.0))
user_cnt_scaler = pd.DataFrame(s.fit_transform(user_cnt),
                               columns=user_cnt.columns,
                               index=user_cnt.index)

user_cnt_scaler.head()

In [None]:
# Fit a full PCA to see how much variance the leading components explain.
pca = PCA(random_state=0).fit(user_cnt_scaler)
ratio = np.cumsum(pca.explained_variance_ratio_)

# Cumulative explained variance against the 0-based component index; the red
# line marks the first index at which the cumulative ratio reaches 0.8.
plt.plot(np.arange(ratio.size), ratio)
plt.axvline(np.argmax(ratio >= 0.8), color="r")
plt.grid()
plt.show()

In [None]:
# Scatter of the users on the first two principal components.
# The axes are labelled as principal components, so plot the PCA projection
# itself rather than the first two raw (scaled) feature columns.
# A separate 2-component PCA is fitted here so the cell does not depend on,
# or overwrite, the `pca` object of the surrounding cells.
principal_xy = PCA(n_components=2, random_state=0).fit_transform(user_cnt_scaler)

plt.scatter(principal_xy[:, 0], principal_xy[:, 1])
plt.xlabel("principal_1")
plt.ylabel("principal_2")
plt.show()

In [None]:
# Project the users onto two principal components; clustering runs on this.
pca = PCA(n_components=2, random_state=0)
pca_user = pca.fit_transform(user_cnt_scaler)

# Sweep k = 2..9 and record the silhouette score and the inertia of each fit.
scores = []
# Indexed by k. Slots 0 and 1 are never fitted, so they are initialised to NaN
# (matplotlib skips NaN) instead of being left as uninitialised memory that
# would be drawn as arbitrary values.
inertia_list = np.full(10, np.nan)

for i in range(2, 10):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10,
                    max_iter=300, tol=1e-04, random_state=0)
    kmeans.fit(pca_user)
    inertia_list[i] = kmeans.inertia_
    scores.append(silhouette_score(pca_user, kmeans.labels_))

plt.figure(figsize=(14, 5))

# Left: mean silhouette score per k (higher is better).
plt.subplot(1, 2, 1)
plt.plot(range(2, 10), scores)
plt.xlabel("n_clusters")
plt.ylabel("silhouette score")
plt.grid()

# Right: inertia per k (elbow plot).
plt.subplot(1, 2, 2)
plt.plot(range(0, 10), inertia_list)
plt.xlabel("n_clusters")
plt.ylabel("inertia")
plt.grid()

In [None]:
# Final clustering with five clusters on the 2-D PCA projection.
km = KMeans(n_clusters=5, init='k-means++', n_init=10,
            max_iter=300, tol=1e-04, random_state=0)
y_km = km.fit_predict(pca_user)
cluster_labels = np.unique(y_km)    # distinct cluster ids present in y_km
n_clusters = len(cluster_labels)    # number of distinct clusters

# Silhouette coefficient of every sample (Euclidean distance).
silhouette_vals = silhouette_samples(pca_user, y_km, metric='euclidean')

yticks = []
y_ax_lower = 0
for i, c in enumerate(cluster_labels):
    # Sorted coefficients of the samples assigned to cluster c.
    c_silhouette_vals = np.sort(silhouette_vals[y_km == c])
    # This cluster occupies the next len(c_silhouette_vals) rows of the y axis.
    y_ax_upper = y_ax_lower + len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    # One horizontal bar per sample, stacked cluster by cluster.
    plt.barh(range(y_ax_lower, y_ax_upper),
             c_silhouette_vals,
             height=1.0,
             edgecolor='none',
             color=color)
    # The cluster label goes in the middle of the cluster's band.
    yticks.append((y_ax_lower + y_ax_upper) / 2)
    y_ax_lower = y_ax_upper

# Dashed red line at the mean silhouette coefficient over all samples.
silhouette_avg = silhouette_vals.mean()
plt.axvline(silhouette_avg, color="red", linestyle="--")
plt.yticks(yticks, cluster_labels + 1)    # tick labels are 1-based cluster numbers
plt.ylabel('Cluster')
plt.xlabel('silhouette coefficient')
plt.show()

In [None]:
# Store each user's cluster id next to the scaled features.
user_cnt_scaler["cluster"] = y_km

fig, axes = plt.subplots(2, 3, figsize=(12, 8))
ax = axes.ravel()

# One panel per cluster: the cluster's members drawn solid, with all users
# overlaid faintly for context.
# `color=` is used instead of `c=` because a bare RGB tuple passed as `c` is
# ambiguous (matplotlib may read it as per-point values when the number of
# points equals the tuple length, and warns about it).
for panel, cluster_id in zip(ax, np.unique(y_km)):
    members = y_km == cluster_id
    panel.scatter(pca_user[members, 0], pca_user[members, 1], color=(0, 0, 0))
    panel.scatter(pca_user[:, 0], pca_user[:, 1], alpha=0.2, color=(0, 0, 0))
    panel.set_title(f"cluster{cluster_id}")

# Hide the grid cells that have no cluster to show.
for panel in ax[len(np.unique(y_km)):]:
    panel.set_axis_off()


In [None]:
# user_id -> cluster lookup table.
# Built with rename_axis/reset_index, which return new objects, instead of
# assigning a column and an index name on a slice of `user_cnt_scaler`:
# that pattern triggers SettingWithCopyWarning and can rename the index of
# `user_cnt_scaler` itself.
cluster_user = (
    user_cnt_scaler[["cluster"]]
    .rename_axis("user_id")
    .reset_index()[["cluster", "user_id"]]
)

# Attach the cluster id to every (user, anime) row.
df_cluster = pd.merge(cluster_user, anime_user_list, how="right", on="user_id")

# One frame per cluster, without the now-constant cluster column.
c0, c1, c2, c3, c4 = (
    df_cluster[df_cluster["cluster"] == cluster_id].drop(columns="cluster")
    for cluster_id in range(5)
)

In [None]:
def show_word(c0, num_cluster: str="0"):
    """Draw genre and title word clouds for one cluster and return its mean stats.

    Parameters
    ----------
    c0 : pandas.DataFrame
        Rows of a single cluster. The columns "Genres", "Name",
        "watched_episodes", "user_rating" and "NPS" are read.
    num_cluster : str
        Cluster label shown in the figure title.

    Returns
    -------
    dict
        Means of "watched_episodes", "NPS" and "user_rating" over the rows.
        Missing values are ignored; a column without any value yields NaN.

    Raises
    ------
    ValueError
        If there is no genre or no title to count (for example an empty frame).
    """
    def _mean_ignoring_nan(column):
        # statistics.mean propagates NaN and rejects empty input, so missing
        # values are removed first and the empty case is handled explicitly.
        values = c0[column].dropna().to_list()
        return mean(values) if values else float("nan")

    # "Genres" holds comma-separated genre names: count each genre once per row.
    # Rows with a missing genre/name are skipped instead of crashing on .split().
    genre2count = Counter()
    for genres in c0["Genres"].dropna():
        genre2count.update(ge.strip() for ge in str(genres).split(","))
    name2count = Counter(str(name).strip() for name in c0["Name"].dropna())

    if not genre2count or not name2count:
        raise ValueError("show_word needs at least one genre and one name to plot")

    fig, axes = plt.subplots(1, 2, figsize=(12, 7))
    panels = zip(axes.ravel(), ("Genre", "Name"), (genre2count, name2count))
    for axis, title, counts in panels:
        axis.imshow(WordCloud().generate_from_frequencies(counts))
        axis.set_title(title, c="g")
        axis.set_xticks([])
        axis.set_yticks([])

    # Figure-level title: plt.title would replace the title of the last panel.
    fig.suptitle(f"Cluster: {num_cluster}")
    plt.tight_layout()

    result = {
        "watched_episodes": _mean_ignoring_nan("watched_episodes"),
        "NPS": _mean_ignoring_nan("NPS"),
        "user_rating": _mean_ignoring_nan("user_rating")
    }
    return result
        

In [None]:
# Word clouds and mean statistics for cluster 0.
show_word(c0, "0")

In [None]:
# Word clouds and mean statistics for cluster 1.
show_word(c1, "1")

In [None]:
# Word clouds and mean statistics for cluster 2.
show_word(c2, "2")

In [None]:
# Word clouds and mean statistics for cluster 3.
show_word(c3, "3")

In [None]:
# Word clouds and mean statistics for cluster 4.
show_word(c4, "4")