# User-based recommendation on user clusters (based on likes)

## Import

In [1]:
import pandas as pd
import ast

In [2]:
data_profiles = pd.read_csv("https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/profiles_clean.csv")
data_reviews = pd.read_csv("https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/reviews_clean.csv")
data_animes = pd.read_csv("https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/animes_clean.csv", index_col="uid")

display(data_profiles.head(2))
display(data_reviews.head(2))
display(data_animes.head(2))

Unnamed: 0,profile,gender,birthday,favorites_anime,link,age
0,DesolatePsyche,Male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche,26.0
1,baekbeans,Female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans,20.0


Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",https://myanimelist.net/reviews.php?id=255938
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=259117


Unnamed: 0_level_0,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...


## Preprocessing

In [3]:
data_profiles["favorites_anime"] = data_profiles["favorites_anime"].apply(ast.literal_eval)

df_als_favorite = data_profiles[["profile", "favorites_anime"]].copy().explode("favorites_anime")
df_als_favorite = df_als_favorite.dropna(subset=["favorites_anime"])
df_als_favorite["favorites_anime"] = df_als_favorite["favorites_anime"].astype("int64")
df_als_favorite["is_favorite"] = 1

display(df_als_favorite.head(2))

Unnamed: 0,profile,favorites_anime,is_favorite
0,DesolatePsyche,33352,1
0,DesolatePsyche,25013,1


In [4]:
df_als_reviews_score = data_reviews[["profile", "anime_uid", "score"]].copy()

display(df_als_reviews_score.head(2))

Unnamed: 0,profile,anime_uid,score
0,DesolatePsyche,34096,8
1,baekbeans,34599,10


In [5]:
def get_score_by_uid(uid) :

    if uid not in data_animes.index :
        return ""

    return data_animes.at[uid, "score"]

In [6]:
df_als_favorite_score = data_profiles[["profile", "favorites_anime"]].copy().explode("favorites_anime")
df_als_favorite_score = df_als_favorite_score.dropna(subset=["favorites_anime"])
df_als_favorite_score["favorites_anime"] = df_als_favorite_score["favorites_anime"].astype("int64")

display(df_als_favorite_score.head(2))

Unnamed: 0,profile,favorites_anime
0,DesolatePsyche,33352
0,DesolatePsyche,25013


In [7]:
df_to_merge = df_als_favorite_score["favorites_anime"].reset_index().drop_duplicates(subset=["favorites_anime"])
df_to_merge["score"] = df_to_merge["favorites_anime"].apply(get_score_by_uid)
df_to_merge = df_to_merge.dropna(subset=["score"])
df_to_merge = df_to_merge.drop(columns="index")

display(df_to_merge.head(2))

Unnamed: 0,favorites_anime,score
0,33352,8.62
1,25013,8.13


In [8]:
df_als_favorite_score = df_als_favorite_score.merge(df_to_merge, on="favorites_anime")
df_als_favorite_score = df_als_favorite_score[df_als_favorite_score["score"]!=""]

display(df_als_favorite_score.head(2))

Unnamed: 0,profile,favorites_anime,score
0,DesolatePsyche,33352,8.62
1,DesolatePsyche,25013,8.13


In [9]:
df_als_favorite.to_csv("../data/als_is_favorite.csv", index=False)
df_als_reviews_score.to_csv("../data/als_reviews_score.csv", index=False)
df_als_favorite_score.to_csv("../data/als_favorite_score.csv", index=False)

## Spark

For now, this part have to be executed in a ad-hoc Jupyter environment with PySpark, following those steps (disclaimer : you need to install and configure PySpark first) :


```shell
pyspark --name anime-recommendation-engine --driver-java-options -Djava.security.manager=allow
```

```python
sc = SparkSession.builder.getOrCreate()
```

[Medium article simple ALS](https://medium.com/@patelneha1495/recommendation-system-in-python-using-als-algorithm-and-apache-spark-27aca08eaab3)

[Medium article advanced ALS](https://medium.com/@brunoborges_38708/recommender-system-using-als-in-pyspark-10329e1d1ee1)


In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS

In [8]:
sc = SparkSession.builder.getOrCreate()

In [10]:
df_als_favorite = pd.read_csv('../data/als_is_favorite.csv')
df_als_score = pd.read_csv('../data/als_score.csv')
df_als_favorite_score = pd.read_csv('../data/als_favorite_score.csv')

display(df_als_favorite.head(2)) # (216695, 3)
display(df_als_score.head(2)) # (130519, 3)
display(df_als_favorite_score.head(2)) # (216188, 3)

Unnamed: 0,profile,favorites_anime,is_favorite
0,DesolatePsyche,33352,1
1,DesolatePsyche,25013,1


Unnamed: 0,profile,anime_uid,score
0,DesolatePsyche,34096,8
1,baekbeans,34599,10


Unnamed: 0,profile,anime_uid,score
0,DesolatePsyche,34096,8
1,baekbeans,34599,10


In [12]:
df_mean_favorite = df_als_favorite.groupby("profile")["is_favorite"].count().reset_index()
df_mean_favorite["is_favorite"].mean()

5.7842404505779035

In [14]:
df_mean_score = df_als_score.groupby("profile")["score"].count().reset_index()
df_mean_score["score"].mean()

2.7256760989871567

In [16]:
df_mean_favorite_score = df_als_favorite_score.groupby("profile")["score"].count().reset_index()
df_mean_favorite_score["score"].mean()

5.772864428956715

### Is favorite

In [51]:
df_sc_is_favorite = sc.createDataFrame(df_als_favorite)
df_sc_is_favorite.show()

+--------------+---------------+-----------+
|       profile|favorites_anime|is_favorite|
+--------------+---------------+-----------+
|DesolatePsyche|          33352|          1|
|DesolatePsyche|          25013|          1|
|DesolatePsyche|           5530|          1|
|DesolatePsyche|          33674|          1|
|DesolatePsyche|           1482|          1|
|DesolatePsyche|            269|          1|
|DesolatePsyche|          18245|          1|
|DesolatePsyche|           2904|          1|
|DesolatePsyche|          27899|          1|
|DesolatePsyche|          17074|          1|
|DesolatePsyche|          12291|          1|
|DesolatePsyche|            226|          1|
|DesolatePsyche|          28851|          1|
|DesolatePsyche|           8525|          1|
|DesolatePsyche|           6594|          1|
|DesolatePsyche|           4981|          1|
|DesolatePsyche|           1698|          1|
|DesolatePsyche|            457|          1|
|DesolatePsyche|            235|          1|
|DesolateP

In [53]:
indexer = StringIndexer(inputCol="profile", outputCol="profile_index")

transformed = indexer.fit(df_sc_is_favorite).transform(df_sc_is_favorite)
transformed.show()

+--------------+---------------+-----------+-------------+
|       profile|favorites_anime|is_favorite|profile_index|
+--------------+---------------+-----------+-------------+
|DesolatePsyche|          33352|          1|         20.0|
|DesolatePsyche|          25013|          1|         20.0|
|DesolatePsyche|           5530|          1|         20.0|
|DesolatePsyche|          33674|          1|         20.0|
|DesolatePsyche|           1482|          1|         20.0|
|DesolatePsyche|            269|          1|         20.0|
|DesolatePsyche|          18245|          1|         20.0|
|DesolatePsyche|           2904|          1|         20.0|
|DesolatePsyche|          27899|          1|         20.0|
|DesolatePsyche|          17074|          1|         20.0|
|DesolatePsyche|          12291|          1|         20.0|
|DesolatePsyche|            226|          1|         20.0|
|DesolatePsyche|          28851|          1|         20.0|
|DesolatePsyche|           8525|          1|         20.

25/05/28 11:51:17 WARN DAGScheduler: Broadcasting large task binary with size 1224.1 KiB


In [55]:
(training,test)=transformed.randomSplit([0.8, 0.2])

In [57]:
# There's a mean of 5.78 liked animes by profile, so we decide to use
# rank = 5
# maxIter = 10
# regParam = 0.1
als = ALS(rank=5,
          userCol="profile_index", itemCol="favorites_anime",
          ratingCol="is_favorite", coldStartStrategy="drop", nonnegative=True)

model = als.fit(training)

25/05/28 11:51:23 WARN DAGScheduler: Broadcasting large task binary with size 1253.0 KiB
25/05/28 11:51:23 WARN DAGScheduler: Broadcasting large task binary with size 1255.3 KiB
25/05/28 11:51:23 WARN DAGScheduler: Broadcasting large task binary with size 1256.8 KiB
25/05/28 11:51:23 WARN DAGScheduler: Broadcasting large task binary with size 1258.1 KiB
25/05/28 11:51:23 WARN DAGScheduler: Broadcasting large task binary with size 1257.1 KiB
25/05/28 11:51:24 WARN DAGScheduler: Broadcasting large task binary with size 1258.4 KiB
25/05/28 11:51:24 WARN DAGScheduler: Broadcasting large task binary with size 1259.2 KiB
25/05/28 11:51:24 WARN DAGScheduler: Broadcasting large task binary with size 1262.3 KiB
25/05/28 11:51:24 WARN DAGScheduler: Broadcasting large task binary with size 1263.7 KiB
25/05/28 11:51:24 WARN DAGScheduler: Broadcasting large task binary with size 1265.1 KiB
25/05/28 11:51:24 WARN DAGScheduler: Broadcasting large task binary with size 1266.4 KiB
25/05/28 11:51:24 WAR

In [59]:
evaluator = RegressionEvaluator(metricName="rmse",labelCol="is_favorite",predictionCol="prediction")

predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

print("RMSE="+str(rmse))

predictions.show()

25/05/28 11:51:30 WARN DAGScheduler: Broadcasting large task binary with size 1235.3 KiB
25/05/28 11:51:30 WARN DAGScheduler: Broadcasting large task binary with size 1297.3 KiB
25/05/28 11:51:30 WARN DAGScheduler: Broadcasting large task binary with size 1295.9 KiB
25/05/28 11:51:31 WARN DAGScheduler: Broadcasting large task binary with size 1343.5 KiB


RMSE=0.09981547241451366


25/05/28 11:51:31 WARN DAGScheduler: Broadcasting large task binary with size 1235.4 KiB
25/05/28 11:51:31 WARN DAGScheduler: Broadcasting large task binary with size 1297.3 KiB
25/05/28 11:51:31 WARN DAGScheduler: Broadcasting large task binary with size 1295.9 KiB


+----------------+---------------+-----------+-------------+----------+
|         profile|favorites_anime|is_favorite|profile_index|prediction|
+----------------+---------------+-----------+-------------+----------+
|     spartan_073|          30654|          1|      14801.0|  0.900804|
|          Ilieas|          17389|          1|       2255.0| 0.9007878|
|      Crusader_8|           1088|          1|      10779.0| 0.9007751|
|         Mac_kun|          30654|          1|      11196.0| 0.9007956|
|     ChoocoboYao|          30654|          1|       1137.0| 0.9007934|
|          Wuster|           1088|          1|      21708.0|0.90077496|
|        pandapow|           1088|          1|      24847.0| 0.9007811|
|Bishounen_Hunter|           1088|          1|      15786.0|0.90078074|
|         MrScout|          30654|          1|      33160.0| 0.9007913|
|  Akuma-Shitsuji|           5300|          1|       9074.0| 0.9007893|
|          sterl1|           1088|          1|       7097.0|0.90

In [79]:
recs=model.recommendForAllUsers(5).toPandas()
nrecs=recs.recommendations.apply(pd.Series) \
            .merge(recs, right_index = True, left_index = True) \
            .drop(["recommendations"], axis = 1) \
            .melt(id_vars = ['profile_index'], value_name = "recommendation") \
            .drop("variable", axis = 1) \
            .dropna() 
nrecs=nrecs.sort_values('profile_index')
nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series), nrecs['profile_index']], axis = 1)
nrecs.columns = [        
        'favorites_anime',
        'is_favorite',
        'UserID_index'      
     ]
md=transformed.select(transformed['profile'],transformed['profile_index'],transformed['favorites_anime'])
md=md.toPandas()
dict1 =dict(zip(md['profile_index'],md['profile']))
nrecs['profile']=nrecs['UserID_index'].map(dict1)
nrecs=nrecs.sort_values('profile')
nrecs.reset_index(drop=True, inplace=True)
new=nrecs[['profile','favorites_anime','is_favorite']]
new['recommendations'] = list(zip(new.favorites_anime, new.is_favorite))
res=new[['profile','recommendations']]  
df_favorites_based_reco=res['recommendations'].groupby([res.profile]).apply(list).reset_index()
print(df_favorites_based_reco.head())

25/05/28 11:55:16 WARN DAGScheduler: Broadcasting large task binary with size 1344.6 KiB
25/05/28 11:55:18 WARN DAGScheduler: Broadcasting large task binary with size 1338.5 KiB
25/05/28 11:55:25 WARN DAGScheduler: Broadcasting large task binary with size 1223.5 KiB


            profile                                    recommendations
0  -----noname-----  [(23703.0, 0.9009409546852112), (40256.0, 0.90...
1   ---SnowFlake---  [(11859.0, 0.9010751247406006), (23703.0, 0.90...
2          --Mizu--  [(11859.0, 0.9010722041130066), (23703.0, 0.90...
3     --Sunclaudius  [(7103.0, 0.9009591341018677), (23703.0, 0.900...
4   --animeislife--  [(40256.0, 0.900961697101593), (7103.0, 0.9009...


In [None]:
# TODO : export to Pandas DataFrame + export to.csv

In [81]:
df_favorites_based_reco.to_csv("../data/als_is_favorite_based_reco.csv", index=False)

### Score

In [83]:
df_sc_score = sc.createDataFrame(df_als_score)
df_sc_score.show()

+---------------+---------+-----+
|        profile|anime_uid|score|
+---------------+---------+-----+
| DesolatePsyche|    34096|    8|
|      baekbeans|    34599|   10|
|           skrn|    28891|    7|
|   edgewalker00|     2904|    9|
|aManOfCulture99|     4181|   10|
|          eneri|     2904|   10|
| Waffle_Empress|    16664|    6|
|   NIGGER_BONER|     2904|    8|
|         jchang|     2904|   10|
|    shadowsplat|     4181|    4|
|   angelsreview|     4672|    8|
| CalebTheMenace|     4181|    9|
|        Kiethol|     4181|   10|
|          Eanki|    34599|    8|
|      NekoKyupi|    34599|    9|
|          12sed|     5114|   10|
|  OVERPOWERED99|    30276|    8|
|  MrAnimeCrunch|    30276|   10|
|    JoJo_Stalin|    30276|    7|
|        Kaishuu|     4181|   10|
+---------------+---------+-----+
only showing top 20 rows



In [85]:
indexer = StringIndexer(inputCol="profile", outputCol="profile_index")

transformed = indexer.fit(df_sc_score).transform(df_sc_score)
transformed.show()

+---------------+---------+-----+-------------+
|        profile|anime_uid|score|profile_index|
+---------------+---------+-----+-------------+
| DesolatePsyche|    34096|    8|         32.0|
|      baekbeans|    34599|   10|       1104.0|
|           skrn|    28891|    7|       1825.0|
|   edgewalker00|     2904|    9|       3796.0|
|aManOfCulture99|     4181|   10|       9589.0|
|          eneri|     2904|   10|       9872.0|
| Waffle_Empress|    16664|    6|        554.0|
|   NIGGER_BONER|     2904|    8|      13796.0|
|         jchang|     2904|   10|        653.0|
|    shadowsplat|     4181|    4|      45563.0|
|   angelsreview|     4672|    8|          8.0|
| CalebTheMenace|     4181|    9|       5402.0|
|        Kiethol|     4181|   10|       4353.0|
|          Eanki|    34599|    8|        918.0|
|      NekoKyupi|    34599|    9|      13855.0|
|          12sed|     5114|   10|       3999.0|
|  OVERPOWERED99|    30276|    8|         84.0|
|  MrAnimeCrunch|    30276|   10|      2

25/05/28 11:57:46 WARN DAGScheduler: Broadcasting large task binary with size 1979.0 KiB


In [87]:
(training,test)=transformed.randomSplit([0.8, 0.2])

In [89]:
# There's a mean of 2.72 score reviews by profile, so we decide to use
# rank = 3
# maxIter = 10
# regParam = 0.1
als = ALS(rank=2,
          userCol="profile_index", itemCol="anime_uid",
          ratingCol="score", coldStartStrategy="drop", nonnegative=True)

model = als.fit(training)

25/05/28 11:58:12 WARN DAGScheduler: Broadcasting large task binary with size 2007.8 KiB
25/05/28 11:58:12 WARN DAGScheduler: Broadcasting large task binary with size 2010.2 KiB
25/05/28 11:58:13 WARN DAGScheduler: Broadcasting large task binary with size 2011.7 KiB
25/05/28 11:58:13 WARN DAGScheduler: Broadcasting large task binary with size 2013.0 KiB
25/05/28 11:58:13 WARN DAGScheduler: Broadcasting large task binary with size 2011.9 KiB
25/05/28 11:58:14 WARN DAGScheduler: Broadcasting large task binary with size 2013.2 KiB
25/05/28 11:58:14 WARN DAGScheduler: Broadcasting large task binary with size 2014.0 KiB
25/05/28 11:58:14 WARN DAGScheduler: Broadcasting large task binary with size 2017.1 KiB
25/05/28 11:58:14 WARN DAGScheduler: Broadcasting large task binary with size 2018.5 KiB
25/05/28 11:58:14 WARN DAGScheduler: Broadcasting large task binary with size 2019.9 KiB
25/05/28 11:58:14 WARN DAGScheduler: Broadcasting large task binary with size 2021.3 KiB
25/05/28 11:58:14 WAR

In [91]:
evaluator = RegressionEvaluator(metricName="rmse",labelCol="score",predictionCol="prediction")

predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

print("RMSE="+str(rmse))

predictions.show()

25/05/28 11:58:19 WARN DAGScheduler: Broadcasting large task binary with size 1990.2 KiB
25/05/28 11:58:19 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
25/05/28 11:58:19 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
25/05/28 11:58:19 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
25/05/28 11:58:20 WARN DAGScheduler: Broadcasting large task binary with size 1990.3 KiB
25/05/28 11:58:20 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB


RMSE=2.5592660458371874


25/05/28 11:58:20 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB


+----------------+---------+-----+-------------+----------+
|         profile|anime_uid|score|profile_index|prediction|
+----------------+---------+-----+-------------+----------+
| ZeroTheUltimate|     1088|   10|      15835.0|  8.064406|
|Angel_Apocalypse|    25517|    8|        710.0|  9.111655|
|          Satire|    25517|    7|        338.0| 6.9394593|
|     SupremeLord|    25517|    8|         33.0| 7.2374926|
|           kajia|     1088|    8|         14.0|  7.077851|
|     barcaman101|     1088|   10|        118.0|  8.236349|
|earl_of_sandvich|     6336|   10|         65.0|  9.274968|
|        Norayuki|    30654|    8|       2342.0| 7.0106564|
| LackOfARealLife|    30654|   10|        333.0|    8.6119|
|           Kuta1|    30654|   10|        355.0| 7.4910936|
|          Kyzari|    30654|    9|      13226.0|  8.193876|
|        Kenyanke|    17389|    8|       4352.0| 3.3377738|
|    SovietPsycho|    30654|    8|        339.0| 8.8195095|
|        Ijsberry|    30654|   10|      

In [93]:
recs=model.recommendForAllUsers(5).toPandas()
nrecs=recs.recommendations.apply(pd.Series) \
            .merge(recs, right_index = True, left_index = True) \
            .drop(["recommendations"], axis = 1) \
            .melt(id_vars = ['profile_index'], value_name = "recommendation") \
            .drop("variable", axis = 1) \
            .dropna() 
nrecs=nrecs.sort_values('profile_index')
nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series), nrecs['profile_index']], axis = 1)
nrecs.columns = [        
        'anime_uid',
        'score',
        'UserID_index'      
     ]
md=transformed.select(transformed['profile'],transformed['profile_index'],transformed['anime_uid'])
md=md.toPandas()
dict1 =dict(zip(md['profile_index'],md['profile']))
nrecs['profile']=nrecs['UserID_index'].map(dict1)
nrecs=nrecs.sort_values('profile')
nrecs.reset_index(drop=True, inplace=True)
new=nrecs[['profile','anime_uid','score']]
new['recommendations'] = list(zip(new.anime_uid, new.score))
res=new[['profile','recommendations']]  
df_score_reviews_based_reco=res['recommendations'].groupby([res.profile]).apply(list).reset_index()
print(df_score_reviews_based_reco.head())

25/05/28 11:58:35 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
25/05/28 11:58:39 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
25/05/28 11:58:46 WARN DAGScheduler: Broadcasting large task binary with size 1978.5 KiB


            profile                                    recommendations
0  -----noname-----  [(5172.0, 10.346378326416016), (19219.0, 10.58...
1   ---SnowFlake---  [(5172.0, 16.226808547973633), (19219.0, 16.50...
2       ---was-----  [(30030.0, 24.334794998168945), (18137.0, 24.2...
3      --EYEPATCH--  [(19219.0, 25.72243309020996), (39415.0, 25.40...
4          --Mizu--  [(2748.0, 26.11934471130371), (4286.0, 27.5448...


In [95]:
df_score_reviews_based_reco.to_csv("../data/als_score_reviews_based_reco.csv", index=False)

### Favorite score

In [18]:
df_sc_favorite_score = sc.createDataFrame(df_als_favorite_score)
df_sc_favorite_score.show()

25/05/28 16:26:33 WARN TaskSetManager: Stage 0 contains a task of very large size (1012 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+--------------+---------------+-------+-------+-----+
|       profile|favorites_anime|score_x|score_y|score|
+--------------+---------------+-------+-------+-----+
|DesolatePsyche|          33352|   8.62|   8.62| 8.62|
|DesolatePsyche|          25013|   8.13|   8.13| 8.13|
|DesolatePsyche|           5530|    7.8|    7.8|  7.8|
|DesolatePsyche|          33674|   8.51|   8.51| 8.51|
|DesolatePsyche|           1482|   8.12|   8.12| 8.12|
|DesolatePsyche|            269|   7.87|   7.87| 7.87|
|DesolatePsyche|          18245|    7.8|    7.8|  7.8|
|DesolatePsyche|           2904|   8.93|   8.93| 8.93|
|DesolatePsyche|          27899|   7.28|   7.28| 7.28|
|DesolatePsyche|          17074|   8.76|   8.76| 8.76|
|DesolatePsyche|          12291|   7.58|   7.58| 7.58|
|DesolatePsyche|            226|   7.69|   7.69| 7.69|
|DesolatePsyche|          28851|   9.01|   9.01| 9.01|
|DesolatePsyche|           8525|   7.82|   7.82| 7.82|
|DesolatePsyche|           6594|   8.39|   8.39| 8.39|
|DesolateP

In [20]:
indexer = StringIndexer(inputCol="profile", outputCol="profile_index")

transformed = indexer.fit(df_sc_favorite_score).transform(df_sc_favorite_score)
transformed.show()

25/05/28 16:26:37 WARN TaskSetManager: Stage 1 contains a task of very large size (1012 KiB). The maximum recommended task size is 1000 KiB.


+--------------+---------------+-------+-------+-----+-------------+
|       profile|favorites_anime|score_x|score_y|score|profile_index|
+--------------+---------------+-------+-------+-----+-------------+
|DesolatePsyche|          33352|   8.62|   8.62| 8.62|         20.0|
|DesolatePsyche|          25013|   8.13|   8.13| 8.13|         20.0|
|DesolatePsyche|           5530|    7.8|    7.8|  7.8|         20.0|
|DesolatePsyche|          33674|   8.51|   8.51| 8.51|         20.0|
|DesolatePsyche|           1482|   8.12|   8.12| 8.12|         20.0|
|DesolatePsyche|            269|   7.87|   7.87| 7.87|         20.0|
|DesolatePsyche|          18245|    7.8|    7.8|  7.8|         20.0|
|DesolatePsyche|           2904|   8.93|   8.93| 8.93|         20.0|
|DesolatePsyche|          27899|   7.28|   7.28| 7.28|         20.0|
|DesolatePsyche|          17074|   8.76|   8.76| 8.76|         20.0|
|DesolatePsyche|          12291|   7.58|   7.58| 7.58|         20.0|
|DesolatePsyche|            226|  

25/05/28 16:26:38 WARN DAGScheduler: Broadcasting large task binary with size 1224.6 KiB
25/05/28 16:26:38 WARN TaskSetManager: Stage 4 contains a task of very large size (1012 KiB). The maximum recommended task size is 1000 KiB.


In [22]:
(training,test)=transformed.randomSplit([0.8, 0.2])

In [24]:
# There's a mean of 5.77 liked animes by profile, so we decide to use
# rank = 5
# maxIter = 10
# regParam = 0.1
als = ALS(rank=5,
          userCol="profile_index", itemCol="favorites_anime",
          ratingCol="score", coldStartStrategy="drop", nonnegative=True)

model = als.fit(training)

25/05/28 16:26:50 WARN DAGScheduler: Broadcasting large task binary with size 1252.3 KiB
25/05/28 16:26:50 WARN TaskSetManager: Stage 5 contains a task of very large size (1012 KiB). The maximum recommended task size is 1000 KiB.
25/05/28 16:26:50 WARN DAGScheduler: Broadcasting large task binary with size 1254.7 KiB
25/05/28 16:26:50 WARN TaskSetManager: Stage 6 contains a task of very large size (1012 KiB). The maximum recommended task size is 1000 KiB.
25/05/28 16:26:50 WARN DAGScheduler: Broadcasting large task binary with size 1256.2 KiB
25/05/28 16:26:51 WARN DAGScheduler: Broadcasting large task binary with size 1257.5 KiB
25/05/28 16:26:51 WARN DAGScheduler: Broadcasting large task binary with size 1256.4 KiB
25/05/28 16:26:51 WARN DAGScheduler: Broadcasting large task binary with size 1257.7 KiB
25/05/28 16:26:52 WARN DAGScheduler: Broadcasting large task binary with size 1258.5 KiB
25/05/28 16:26:52 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.b

In [26]:
evaluator = RegressionEvaluator(metricName="rmse",labelCol="score",predictionCol="prediction")

predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

print("RMSE="+str(rmse))

predictions.show()

25/05/28 16:27:24 WARN DAGScheduler: Broadcasting large task binary with size 1235.8 KiB
25/05/28 16:27:24 WARN TaskSetManager: Stage 59 contains a task of very large size (1012 KiB). The maximum recommended task size is 1000 KiB.
25/05/28 16:27:24 WARN DAGScheduler: Broadcasting large task binary with size 1296.6 KiB
25/05/28 16:27:24 WARN DAGScheduler: Broadcasting large task binary with size 1295.2 KiB
25/05/28 16:27:25 WARN DAGScheduler: Broadcasting large task binary with size 1341.9 KiB


RMSE=0.5261766102761597


25/05/28 16:27:26 WARN DAGScheduler: Broadcasting large task binary with size 1236.4 KiB
25/05/28 16:27:26 WARN TaskSetManager: Stage 114 contains a task of very large size (1012 KiB). The maximum recommended task size is 1000 KiB.
25/05/28 16:27:26 WARN DAGScheduler: Broadcasting large task binary with size 1296.6 KiB
25/05/28 16:27:26 WARN DAGScheduler: Broadcasting large task binary with size 1295.2 KiB


+--------------+---------------+-------+-------+-----+-------------+----------+
|       profile|favorites_anime|score_x|score_y|score|profile_index|prediction|
+--------------+---------------+-------+-------+-----+-------------+----------+
|     Andrew729|           2605|   7.95|   7.95| 7.95|      29744.0|  7.056234|
|Apomaro-Mellow|          10161|   7.67|   7.67| 7.67|      15447.0| 7.4113045|
|        Breeon|          23273|   8.83|   8.83| 8.83|      15846.0|  8.547776|
|        Breeon|          30654|    8.6|    8.6|  8.6|      15846.0|  8.333976|
|     Flexicute|            210|   7.83|   7.83| 7.83|      12799.0| 7.4189954|
|  brass2themax|            210|   7.83|   7.83| 7.83|      22373.0|  7.448699|
|  brass2themax|           9253|   9.11|   9.11| 9.11|      22373.0|  8.785487|
|   partysmores|            889|    8.1|    8.1|  8.1|       6654.0|   7.79725|
|   partysmores|           9941|    8.0|    8.0|  8.0|       6654.0|  7.737104|
|          paum|           2001|    8.7|

In [28]:
recs=model.recommendForAllUsers(5).toPandas()
nrecs=recs.recommendations.apply(pd.Series) \
            .merge(recs, right_index = True, left_index = True) \
            .drop(["recommendations"], axis = 1) \
            .melt(id_vars = ['profile_index'], value_name = "recommendation") \
            .drop("variable", axis = 1) \
            .dropna() 
nrecs=nrecs.sort_values('profile_index')
nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series), nrecs['profile_index']], axis = 1)
nrecs.columns = [        
        'favorites_anime',
        'score',
        'UserID_index'      
     ]
md=transformed.select(transformed['profile'],transformed['profile_index'],transformed['favorites_anime'])
md=md.toPandas()
dict1 =dict(zip(md['profile_index'],md['profile']))
nrecs['profile']=nrecs['UserID_index'].map(dict1)
nrecs=nrecs.sort_values('profile')
nrecs.reset_index(drop=True, inplace=True)
new=nrecs[['profile','favorites_anime','score']]
new['recommendations'] = list(zip(new.favorites_anime, new.score))
res=new[['profile','recommendations']]  
df_score_favorite_based_reco=res['recommendations'].groupby([res.profile]).apply(list).reset_index()
print(df_score_favorite_based_reco.head())

25/05/28 16:28:11 WARN DAGScheduler: Broadcasting large task binary with size 1343.9 KiB
25/05/28 16:28:14 WARN DAGScheduler: Broadcasting large task binary with size 1337.9 KiB
25/05/28 16:28:21 WARN DAGScheduler: Broadcasting large task binary with size 1223.4 KiB
25/05/28 16:28:21 WARN TaskSetManager: Stage 218 contains a task of very large size (1012 KiB). The maximum recommended task size is 1000 KiB.


            profile                                    recommendations
0  -----noname-----  [(11467.0, 18.228002548217773), (32188.0, 18.6...
1   ---SnowFlake---  [(11467.0, 18.263790130615234), (1360.0, 17.25...
2          --Mizu--  [(1360.0, 17.87210464477539), (21349.0, 18.597...
3     --Sunclaudius  [(21349.0, 18.819751739501953), (39392.0, 18.7...
4   --animeislife--  [(11467.0, 18.025352478027344), (21349.0, 18.7...


In [30]:
df_score_favorite_based_reco.to_csv("../data/als_score_favorite_based_reco.csv", index=False)

## Analysis

In [181]:
df_score_favorite_based_reco = pd.read_csv("../data/als_is_favorite_based_reco.csv")
df_score_favorite_based_reco.head()

Unnamed: 0,profile,recommendations
0,-----noname-----,"[(23703.0, 0.9009409546852112), (40256.0, 0.90..."
1,---SnowFlake---,"[(11859.0, 0.9010751247406006), (23703.0, 0.90..."
2,--Mizu--,"[(11859.0, 0.9010722041130066), (23703.0, 0.90..."
3,--Sunclaudius,"[(7103.0, 0.9009591341018677), (23703.0, 0.900..."
4,--animeislife--,"[(40256.0, 0.900961697101593), (7103.0, 0.9009..."


In [182]:
df_score_favorite_based_reco["recommendations"][0]

'[(23703.0, 0.9009409546852112), (40256.0, 0.9009571075439453), (36688.0, 0.9009379744529724), (7103.0, 0.9009591937065125), (11859.0, 0.9010590314865112)]'

In [193]:
data_profiles = pd.read_csv("https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/profiles_clean.csv")
data_reviews = pd.read_csv("https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/reviews_clean.csv")
data_animes = pd.read_csv("https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/animes_clean.csv", index_col="uid")

display(data_profiles.head(2))
display(data_reviews.head(2))
display(data_animes.head(2))

Unnamed: 0,profile,gender,birthday,favorites_anime,link,age
0,DesolatePsyche,Male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche,26.0
1,baekbeans,Female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans,20.0


Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",https://myanimelist.net/reviews.php?id=255938
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=259117


Unnamed: 0_level_0,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...


In [None]:
data_profiles[data_profiles["profile"]=="-----noname-----"]["favorites_anime"].values

array(["['6774', '245', '2001', '11061', '16592', '1575', '21']"],
      dtype=object)

In [177]:
display(data_animes[(data_animes.index==6774) | (data_animes.index==245) | 
                    (data_animes.index==2001) | (data_animes.index==11061) | 
                    (data_animes.index==16592) | (data_animes.index==1575) |
                    (data_animes.index==21)])

Unnamed: 0_level_0,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6774,Kuuchuu Buranko,Many patients with different problems visit th...,"['Comedy', 'Psychological', 'Drama', 'Seinen']","Oct 15, 2009 to Dec 24, 2009",11.0,64443,1582,565.0,8.01,https://cdn.myanimelist.net/images/anime/3/183...,https://myanimelist.net/anime/6774/Kuuchuu_Bur...
21,One Piece,"Gol D. Roger was known as the ""Pirate King,"" t...","['Action', 'Adventure', 'Comedy', 'Super Power...","Oct 20, 1999 to ?",,948342,35,86.0,8.53,https://cdn.myanimelist.net/images/anime/6/732...,https://myanimelist.net/anime/21/One_Piece
2001,Tengen Toppa Gurren Lagann,Simon and Kamina were born and raised in a dee...,"['Action', 'Adventure', 'Comedy', 'Mecha', 'Sc...","Apr 1, 2007 to Sep 30, 2007",27.0,982090,29,44.0,8.7,https://cdn.myanimelist.net/images/anime/4/512...,https://myanimelist.net/anime/2001/Tengen_Topp...
245,Great Teacher Onizuka,Onizuka is a reformed biker gang leader who ha...,"['Comedy', 'Drama', 'School', 'Shounen', 'Slic...","Jun 30, 1999 to Sep 17, 2000",43.0,456381,163,40.0,8.71,https://cdn.myanimelist.net/images/anime/13/11...,https://myanimelist.net/anime/245/Great_Teache...
1575,Code Geass: Hangyaku no Lelouch,"In the year 2010, the Holy Empire of Britannia...","['Action', 'Military', 'Sci-Fi', 'Super Power'...","Oct 6, 2006 to Jul 29, 2007",25.0,1231546,11,31.0,8.76,https://cdn.myanimelist.net/images/anime/5/503...,https://myanimelist.net/anime/1575/Code_Geass_...
11061,Hunter x Hunter (2011),Hunter x Hunter is set in a world where Hunte...,"['Action', 'Adventure', 'Fantasy', 'Shounen', ...","Oct 2, 2011 to Sep 24, 2014",148.0,1052761,20,3.0,9.11,https://cdn.myanimelist.net/images/anime/11/33...,https://myanimelist.net/anime/11061/Hunter_x_H...
16592,Danganronpa: Kibou no Gakuen to Zetsubou no Ko...,Hope's Peak Academy is an elite high school th...,"['Mystery', 'Psychological', 'School']","Jul 5, 2013 to Sep 27, 2013",13.0,525383,121,2052.0,7.41,https://cdn.myanimelist.net/images/anime/4/514...,https://myanimelist.net/anime/16592/Danganronp...


In [None]:
33322.0	0.956734	(33322.0, 0.956734299659729)
1	-----noname-----	32895.0	0.956734	(32895.0, 0.956734299659729)
2	-----noname-----	7730.0	0.956734	(7730.0, 0.956734299659729)
3	-----noname-----	33165.0	0.956734	(33165.0, 0.956734299659729)
4	-----noname-----	30702.0	

In [None]:
display(data_animes[(data_animes.index==33322) | (data_animes.index==32895) | 
                    (data_animes.index==7730) | (data_animes.index==33165) | 
                    (data_animes.index==30702)])

Unnamed: 0_level_0,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
36688,Shinmai Maou no Testament Departures,,"['Action', 'Ecchi', 'Fantasy', 'Romance', 'Har...","Mar 28, 2018",1.0,40326,2186,3569.0,7.06,https://cdn.myanimelist.net/images/anime/1395/...,https://myanimelist.net/anime/36688/Shinmai_Ma...
7103,Miracle☆Train: Oedo-sen e Youkoso,Another version of Miracle Train: Chuo-sen e Y...,"['Fantasy', 'Shoujo']","Oct 5, 2009 to Dec 28, 2009",13.0,16501,3508,5578.0,6.6,https://cdn.myanimelist.net/images/anime/9/755...,https://myanimelist.net/anime/7103/Miracle%E2%...
40256,Cannon Busters,"Follow the adventures and exploits of S.A.M, a...","['Action', 'Sci-Fi', 'Adventure', 'Fantasy', '...","Aug 15, 2019",12.0,12621,3980,5366.0,6.64,https://cdn.myanimelist.net/images/anime/1452/...,https://myanimelist.net/anime/40256/Cannon_Bus...
23703,Gatchaman Crowds: Embrace,Director's cut of episode 12 included with the...,"['Adventure', 'Sci-Fi']","Jan 22, 2014",1.0,14617,3707,1939.0,7.44,https://cdn.myanimelist.net/images/anime/6/615...,https://myanimelist.net/anime/23703/Gatchaman_...
11859,Queen's Blade: Rebellion,"Power corrupts, and it when it appears that th...","['Action', 'Adventure', 'Ecchi', 'Fantasy']","Apr 3, 2012 to Jun 19, 2012",12.0,31423,2503,6513.0,6.41,https://cdn.myanimelist.net/images/anime/6/755...,https://myanimelist.net/anime/11859/Queens_Bla...
