# User-based recommendation on user clusters (based on likes)

## Import

In [2]:
import pandas as pd
import ast

In [107]:
data_profiles = pd.read_csv("https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/profiles_clean.csv")

display(data_profiles.head(2))

Unnamed: 0,profile,gender,birthday,favorites_anime,link,age
0,DesolatePsyche,Male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche,26.0
1,baekbeans,Female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans,20.0


## Preprocessing

In [47]:
data_profiles["favorites_anime"] = data_profiles["favorites_anime"].apply(ast.literal_eval)

df_als_favorite = data_profiles[["profile", "favorites_anime"]].copy().explode("favorites_anime")
df_als_favorite = df_als_favorite.dropna(subset=["favorites_anime"])
df_als_favorite["favorites_anime"] = df_als_favorite["favorites_anime"].astype("int64")
df_als_favorite["is_favorite"] = 1

display(df_als_favorite.head(2))

Unnamed: 0,profile,favorites_anime,is_favorite
0,DesolatePsyche,33352,1
0,DesolatePsyche,25013,1


In [None]:
df_profile_temp = pd.DataFrame(data=df_als_favorite["profile"].unique(), columns=["profile"])
df_top_200_animes = df_als_favorite["favorites_anime"].value_counts().reset_index(name="count").iloc[:200,:]
df_favorites_anime_temp = pd.DataFrame(data=df_top_200_animes["favorites_anime"], columns=["favorites_anime"])
df_temp = df_profile_temp.merge(df_favorites_anime_temp, how="cross")

df_als_favorite = df_als_favorite.merge(df_temp, on=["profile","favorites_anime"], how="outer")
df_als_favorite["is_favorite"] = df_als_favorite["is_favorite"].fillna(0).astype("Int64")

display(df_als_favorite.head(2))

Unnamed: 0,profile,favorites_anime,is_favorite
0,-----noname-----,1,0
1,-----noname-----,6,0


In [None]:
df_als_favorite.to_csv("../data/als_is_favorite.csv", index=False)

## Spark

For now, this part have to be executed in a ad-hoc Jupyter environment with PySpark, following those steps (disclaimer : you need to install and configure PySpark first) :


```shell
pyspark --name anime-recommendation-engine --driver-java-options -Djava.security.manager=allow
```

```python
sc = SparkSession.builder.getOrCreate()
```

[Medium article simple ALS](https://medium.com/@patelneha1495/recommendation-system-in-python-using-als-algorithm-and-apache-spark-27aca08eaab3)

[Medium article advanced ALS](https://medium.com/@brunoborges_38708/recommender-system-using-als-in-pyspark-10329e1d1ee1)


In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rank
from pyspark.sql.window import Window

from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [12]:
sc = SparkSession.builder.getOrCreate()

In [14]:
df_als_is_favorite = spark.read.csv("../data/als_is_favorite.csv", header=True)
df_als_is_favorite = df_als_is_favorite.withColumn("favorites_anime", df_als_is_favorite.favorites_anime.cast("int"))
df_als_is_favorite = df_als_is_favorite.withColumn("is_favorite", df_als_is_favorite.is_favorite.cast("int"))

df_als_is_favorite.show()

In [None]:
def pyspark_df_with_train_test_split(df) :
    
    indexer = StringIndexer(inputCol="profile", outputCol="profile_index")

    df_spark = indexer.fit(df).transform(df)

    (training,test) = df_spark.randomSplit([0.8, 0.2])

    return df_spark, training, test

In [None]:
def als_tuning_and_predict(train, test, user_col, item_col, rating_col) :

    als = ALS(rank=10, regParam=.05, userCol=user_col+"_index", itemCol=item_col, ratingCol=rating_col,
            coldStartStrategy="drop", nonnegative=True)

    evaluator = RegressionEvaluator(metricName="rmse", labelCol=rating_col, predictionCol="prediction")

    model = als.fit(train)
    
    predictions = model.transform(test)
    rmse = evaluator.evaluate(predictions)
    
    print("RMSE="+str(rmse))

    return model

In [34]:
def create_predictions_pandas_dataframe(model, df_spark, user_col, item_col, rating_col) :
    
    recs = model.recommendForAllUsers(10).toPandas()

    nrecs = recs.recommendations.apply(pd.Series)\
                .merge(recs, right_index = True, left_index = True)\
                .drop(["recommendations"], axis = 1)\
                .melt(id_vars = [user_col+"_index"], value_name = "recommendation")\
                .drop("variable", axis = 1)\
                .dropna()
    nrecs = nrecs.sort_values(user_col+"_index")
    nrecs = pd.concat([nrecs["recommendation"].apply(pd.Series), nrecs[user_col+"_index"]], axis = 1)
    nrecs.columns = [item_col, rating_col, user_col]

    md = df_spark.select(df_spark[user_col],df_spark[user_col+"_index"],df_spark[item_col])
    md = md.toPandas()

    dict1 = dict(zip(md[user_col+"_index"],md[user_col]))

    nrecs[user_col] = nrecs[user_col].map(dict1)
    nrecs = nrecs.sort_values(user_col)
    nrecs.reset_index(drop=True, inplace=True)
    
    df_recommendations = nrecs[[user_col,item_col]]
    df_recommendations[item_col] = df_recommendations[item_col].astype("int64")

    df_recommendations = df_recommendations[item_col].groupby([df_recommendations[user_col]]).apply(list).reset_index()

    return df_recommendations

### Is favorite

In [36]:
df_sp_is_favorite, train, test = pyspark_df_with_train_test_split(df_als_is_favorite, 0.2, "profile",
                                                                  "favorites_anime", "is_favorite", False)

                                                                                

In [38]:
model = als_tuning_and_predict(train, test, "profile", "favorites_anime", "is_favorite")

25/06/03 15:07:11 WARN DAGScheduler: Broadcasting large task binary with size 1250.1 KiB
25/06/03 15:07:12 WARN DAGScheduler: Broadcasting large task binary with size 1252.5 KiB
25/06/03 15:07:13 WARN DAGScheduler: Broadcasting large task binary with size 1254.0 KiB
25/06/03 15:07:14 WARN DAGScheduler: Broadcasting large task binary with size 1255.3 KiB
25/06/03 15:07:15 WARN DAGScheduler: Broadcasting large task binary with size 1254.2 KiB
25/06/03 15:07:15 WARN DAGScheduler: Broadcasting large task binary with size 1255.5 KiB
25/06/03 15:07:15 WARN DAGScheduler: Broadcasting large task binary with size 1256.3 KiB
25/06/03 15:07:16 WARN DAGScheduler: Broadcasting large task binary with size 1259.4 KiB
25/06/03 15:07:16 WARN DAGScheduler: Broadcasting large task binary with size 1260.8 KiB
25/06/03 15:07:16 WARN DAGScheduler: Broadcasting large task binary with size 1262.2 KiB
25/06/03 15:07:16 WARN DAGScheduler: Broadcasting large task binary with size 1263.6 KiB
25/06/03 15:07:16 WAR

RMSE=0.14823504891617292


25/06/03 15:07:22 WARN DAGScheduler: Broadcasting large task binary with size 1341.9 KiB


In [40]:
df_is_favorite_based_reco = create_predictions_pandas_dataframe(model, df_sp_is_favorite, "profile", "favorites_anime", "is_favorite")

with pd.option_context('display.max_colwidth', None):
  display(df_is_favorite_based_reco.head())

25/06/03 15:07:31 WARN DAGScheduler: Broadcasting large task binary with size 1341.5 KiB
25/06/03 15:07:33 WARN DAGScheduler: Broadcasting large task binary with size 1335.5 KiB
25/06/03 15:07:44 WARN DAGScheduler: Broadcasting large task binary with size 1226.1 KiB
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recommendations[item_col] = df_recommendations[item_col].astype("int64")


Unnamed: 0,profile,favorites_anime
0,-----noname-----,"[8026, 1777, 4150, 3015, 10534, 32365, 12651, 3077, 5877, 2595]"
1,---SnowFlake---,"[8026, 32365, 2595, 15437, 10534, 3015, 12651, 5877, 37760, 3077]"
2,--Mizu--,"[3015, 37760, 5877, 12651, 8026, 2589, 2595, 3077, 32365, 10534]"
3,--Sunclaudius,"[12651, 3077, 3015, 32365, 10534, 2595, 5877, 8026, 1777, 4150]"
4,--animeislife--,"[10534, 8026, 1777, 15437, 3077, 5877, 2595, 3015, 12651, 32365]"


In [34]:
df_is_favorite_based_reco = pd.read_csv("../data/als_is_favorite_based_reco_before_clean.csv", index_col="profile")
df_is_favorite_based_reco = df_is_favorite_based_reco.rename(columns={"favorites_anime": "recommendations"})
df_is_favorite_based_reco["recommendations"] = df_is_favorite_based_reco["recommendations"].apply(ast.literal_eval)

In [35]:
data_profiles = pd.read_csv("https://anime-recommendation-engine.s3.eu-west-3.amazonaws.com/data/profiles_clean.csv", index_col="profile")
data_profiles["favorites_anime"] = data_profiles["favorites_anime"].apply(ast.literal_eval)

In [36]:
def delete_favorites_from_recommendations(df_recommendations, df_favorites) :

    for profile in df_recommendations.index :
        favorites = df_favorites.at[profile,"favorites_anime"]
        recommendations = df_recommendations.at[profile,"recommendations"]

        recommendations = [value for value in recommendations if str(value) not in favorites]
        df_recommendations.at[profile, 'recommendations'] = recommendations

    return df_recommendations

In [37]:
df_is_favorite_based_reco = delete_favorites_from_recommendations(df_is_favorite_based_reco, data_profiles)

In [38]:
df_is_favorite_based_reco.to_csv("../data/als_is_favorite_based_reco.csv")