In [1]:
import itertools
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy
import seaborn as sns
import surprise as sp
from scipy.sparse.linalg import svds
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

In [None]:
# Read the three cleaned CSV exports (users, anime, per-user anime lists)
# into DataFrames, in that order.
UsersDF, AnimesDF, ScoresDF = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../raw_data/users_cleaned.csv',
        '../raw_data/anime_cleaned.csv',
        '../raw_data/animelists_cleaned.csv',
    )
)

In [None]:
# Preprocessing: drop anime without a genre and turn the comma-separated
# genre string into a list of genre names.
# - `.assign` builds a new frame instead of writing into the result of
#   `dropna`, which avoids SettingWithCopyWarning.
# - The isinstance guard keeps the cell idempotent: on a re-run the values are
#   already lists and are left untouched (`.str.split` would turn them into NaN).
AnimesDF = (
    AnimesDF
    .dropna(subset=['genre'])
    .assign(
        genre=lambda frame: frame['genre'].map(
            lambda value: value.split(', ') if isinstance(value, str) else value
        )
    )
)

In [None]:
# One-hot encode the genre lists: one 0/1 column per genre name.
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(AnimesDF['genre'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_, index=AnimesDF.index)
# Drop genre columns left over from a previous run of this cell before
# concatenating, so re-running it does not create duplicate column names.
AnimesDF = pd.concat(
    [AnimesDF.drop(columns=genre_df.columns, errors='ignore'), genre_df],
    axis=1,
)

# Select and scale features
selected_columns = ['score', 'episodes', 'members', 'duration_min'] + list(mlb.classes_)
# Rows with a missing value in any selected column are dropped here, so
# `anime_df_selected` can have fewer rows than `AnimesDF` (its index is kept).
anime_df_selected = AnimesDF[selected_columns].dropna()
# set_output(transform="pandas") makes the scaler return a DataFrame that
# keeps the column names and the index.
scaler = StandardScaler().set_output(transform = "pandas")
anime_df_scaled = scaler.fit_transform(anime_df_selected)

In [None]:
# Apply PCA with every component kept, to inspect how much variance is
# explained as components are added.
pca = PCA().set_output(transform = "pandas")
features_pca = pca.fit_transform(anime_df_scaled)

explained_variance = pca.explained_variance_ratio_
# The i-th cumulative value is the variance explained by the first i
# components, so the x-axis counts components starting at 1 (not 0).
component_counts = range(1, len(explained_variance) + 1)

plt.figure(figsize=(6, 4))
plt.plot(component_counts, np.cumsum(explained_variance), alpha=0.5, label='Cumulative Variance Explained')
plt.ylabel('Cumulative explained variance ratio')
plt.xlabel('Number of principal components')
plt.legend(loc='best')
plt.grid()
plt.tight_layout()

# Use PCA to reduce dimensionality to 10 components
pca = PCA(n_components=10)
features_pca = pca.fit_transform(anime_df_scaled)

In [None]:
# SCALING AND PRESERVING COLUMNS' NAMES
scaler = StandardScaler().set_output(transform = "pandas")
anime_df_scaled = scaler.fit_transform(anime_df_selected)
display(anime_df_scaled)

# PCA'ING AND PRESERVING COLUMNS' NAMES
# Number of principal components to keep.
# NOTE(review): the variable name is misspelled ("threhsold"); it is kept as-is
# in case other cells reference it.
threhsold_pca = 30
# PCA is unsupervised (no y); whiten=True is passed so the components are whitened.
pca = PCA(n_components=threhsold_pca, whiten=True).set_output(transform="pandas")
pca.fit(anime_df_scaled)
# Transform once; the original cell repeated this identical call after the display.
features_pca = pca.transform(anime_df_scaled)
display(features_pca)

In [None]:
# Score KMeans clusterings of the PCA features for several values of k.
# Records are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
metric_columns = ['k', 'Silhouette Score', 'Calinski Harabasz Score', 'Davies Bouldin Score']
metric_records = []

# Loop over the range of k values
for k in range(2, 11):  # we start from 2 because silhouette_score is not defined for a single cluster
    kmeans = KMeans(n_clusters=k, random_state=0, n_init = 'auto')
    clusters = kmeans.fit_predict(features_pca)

    silhouette = silhouette_score(features_pca, clusters)
    calinski_harabasz = calinski_harabasz_score(features_pca, clusters)
    davies_bouldin = davies_bouldin_score(features_pca, clusters)

    metric_records.append({
        'k': k,
        'Silhouette Score': silhouette,
        'Calinski Harabasz Score': calinski_harabasz,
        'Davies Bouldin Score': davies_bouldin,
    })

# One row per k, columns in a fixed order
results = pd.DataFrame(metric_records, columns=metric_columns)

# Display the results
print(results)


In [None]:
# Add clusters to the DataFrame.
# `clusters` holds one label per row of `anime_df_selected` (rows with missing
# feature values were dropped before scaling), so the labels are aligned on
# that index rather than assigned positionally, which raises a length-mismatch
# error whenever AnimesDF has more rows. Anime that were not clustered get <NA>;
# the nullable 'Int64' dtype keeps the labels as integers.
# Note: `clusters` comes from the last KMeans fitted in the metrics loop
# (the largest k tried there).
AnimesDF['cluster'] = pd.Series(clusters, index=anime_df_selected.index, dtype='Int64')
# Check the number of animes in each cluster
print(AnimesDF['cluster'].value_counts())

In [None]:
# Per-cluster average of the numeric features, printed as a table
numeric_feature_names = ['episodes', 'score', 'members', 'duration_min']
per_cluster_means = AnimesDF.groupby('cluster')[numeric_feature_names].mean()
print(per_cluster_means)

In [None]:
# Genre co-occurrence heatmap: how often two genres appear on the same anime.
# (itertools, numpy, pandas, seaborn and matplotlib are imported in the first cell.)

# NOTE(review): `GenreDF` was not defined anywhere in the visible notebook, so
# this cell failed on a fresh kernel. It is assumed to be the per-anime lists
# of genre names produced by the preprocessing cell -- confirm.
GenreDF = AnimesDF['genre']

# Create a list of all genres; sorted so the heatmap axes are in the same
# order on every run (set iteration order is not stable across kernels)
all_genres = sorted(set(itertools.chain.from_iterable(GenreDF)))

# Initialize a co-occurrence matrix of zeros
co_occurrence_matrix = pd.DataFrame(np.zeros((len(all_genres), len(all_genres))), index=all_genres, columns=all_genres)

# Iterate through each list of genres and increment co-occurrence matrix for each pair
for genres in GenreDF:
    for genre1, genre2 in itertools.combinations(genres, 2):
        # Update both cells so the matrix stays symmetric
        co_occurrence_matrix.loc[genre1, genre2] += 1
        co_occurrence_matrix.loc[genre2, genre1] += 1

# Plot the co-occurrence matrix as a heatmap
plt.figure(figsize=(10, 10))
sns.heatmap(co_occurrence_matrix, cmap='YlGnBu')
plt.title('Genre Co-occurrence Heatmap')
plt.show()


In [None]:
# Apply Elbow method: fit KMeans for k = 1..19 on the PCA features and plot
# the inertia of each fit against k.
distortions = []
K = range(1,20)
for k in K:
    # Fixed random_state and explicit n_init make the curve reproducible and
    # match the KMeans settings used in the metrics loop earlier in the notebook.
    kmeanModel = KMeans(n_clusters=k, random_state=0, n_init='auto')
    kmeanModel.fit(features_pca)
    distortions.append(kmeanModel.inertia_)

plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Show AnimesDF; as the bare last expression of the cell it is rendered as the cell output
AnimesDF