# User-based recommendation on user clusters (based on likes)

## Import

In [108]:
import pandas as pd
import ast

In [109]:
# Load the cleaned profile and review tables from the shared data directory.
data_profiles, data_reviews = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/profiles_clean.csv', '../data/reviews_clean.csv')
)

# Preview the first two rows of each table.
display(data_profiles.head(2), data_reviews.head(2))

Unnamed: 0,profile,gender,birthday,favorites_anime,link,age
0,DesolatePsyche,Male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche,26.0
1,baekbeans,Female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans,20.0


Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",https://myanimelist.net/reviews.php?id=255938
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=259117


## Preprocessing

In [110]:
# Parse the stringified favourites lists into real Python lists.
# Only string cells are parsed: this keeps the cell idempotent (re-running it when the
# column already holds lists no longer raises) and lets missing values pass through.
data_profiles['favorites_anime'] = data_profiles['favorites_anime'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# One row per (profile, favourite anime) pair.
df_als_favorite = data_profiles[["profile", "favorites_anime"]].copy().explode('favorites_anime')
# explode() yields a missing value for empty lists; drop those rows before casting.
df_als_favorite = df_als_favorite.dropna(subset=["favorites_anime"])
df_als_favorite["favorites_anime"] = df_als_favorite["favorites_anime"].astype('int64')
# Constant flag used as the rating column by the ALS model further down.
df_als_favorite["is_favorite"] = 1

display(df_als_favorite.head(2))

Unnamed: 0,profile,favorites_anime,is_favorite
0,DesolatePsyche,33352,1
0,DesolatePsyche,25013,1


In [111]:
# Keep only the (user, anime, score) triplet from the reviews table.
score_columns = ["profile", "anime_uid", "score"]
df_als_score = data_reviews[score_columns].copy()

display(df_als_score.head(2))

Unnamed: 0,profile,anime_uid,score
0,DesolatePsyche,34096,8
1,baekbeans,34599,10


In [112]:
# Persist both ALS input tables as CSV, without the pandas index column.
df_als_favorite.to_csv(path_or_buf="../data/als_is_favorite.csv", index=False)
df_als_score.to_csv(path_or_buf="../data/als_score.csv", index=False)

## Spark

For now, this part has to be executed in an ad-hoc Jupyter environment with PySpark, following these steps (disclaimer: you need to install and configure PySpark first):


```shell
pyspark --name anime-recommendation-engine --driver-java-options -Djava.security.manager=allow
```

```python
sc = SparkSession.builder.getOrCreate()
```

[Medium article](https://medium.com/@patelneha1495/recommendation-system-in-python-using-als-algorithm-and-apache-spark-27aca08eaab3)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS

In [None]:
# Get the active Spark session, or start one if none exists yet.
# Note: despite its name, `sc` is a SparkSession.
sc = (
    SparkSession.builder
    .getOrCreate()
)

### Is favorite

In [None]:
# Build the Spark DataFrame from the exploded (profile, favorites_anime, is_favorite)
# table. The raw `data_profiles` frame has no `is_favorite` column and stores
# `favorites_anime` as a list, so it cannot feed the ALS model configured below.
df_sc_is_favorite = sc.createDataFrame(df_als_favorite)
df_sc_is_favorite.show()

In [None]:
# ALS needs numeric user ids, so map each profile name to a numeric index.
indexer = StringIndexer(inputCol="profile", outputCol="profile_index")
indexer_model = indexer.fit(df_sc_is_favorite)
transformed = indexer_model.transform(df_sc_is_favorite)
transformed.show()

In [None]:
# 80/20 train/test split; the fixed seed makes the split reproducible across runs.
(training, test) = transformed.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Explicit-feedback ALS on (profile_index, favorites_anime, is_favorite).
# coldStartStrategy="drop" removes predictions for users/items unseen during training,
# so the evaluation below does not produce NaN metrics.
# The fixed seed makes the factor initialisation, and therefore the fit, reproducible.
# NOTE(review): every `is_favorite` label is 1, so this is really implicit feedback;
# consider `implicitPrefs=True` and a ranking metric instead of RMSE - to be confirmed.
als = ALS(maxIter=5, regParam=0.09, rank=10,
          userCol="profile_index", itemCol="favorites_anime",
          ratingCol="is_favorite", coldStartStrategy="drop", nonnegative=True,
          seed=42)

model = als.fit(training)

In [None]:
# Score the held-out rows and measure the error against the `is_favorite` label.
predictions = model.transform(test)

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="is_favorite",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)

print(f"RMSE={rmse}")

predictions.show()

In [None]:
# We can do the same thing, but based on each user's review scores