In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
pd.set_option('display.max_columns', None)
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pyspark.sql.functions import explode
from pyspark.sql import SparkSession
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
spark = SparkSession.builder.getOrCreate()

In [2]:
def weighted_rating(df,rating_count_col, avg_rating_col):
    m = df[rating_count_col].quantile(0.80)
    C = df[avg_rating_col].mean()
    v = df[rating_count_col]
    R = df[avg_rating_col]
    # Compute the weighted score
    return (v/(v+m) * R) + (m/(m+v) * C)


def full_anime_df(rating_df, anime_df, anime_meta):
    #Get the total number of ratings per anime
    count_ratings = rating_df.groupby('anime_id').count().rename(columns={'rating': 'num_ratings'})['num_ratings']
    
    #Combine the meta data with the anime data, and rating data
    anime_full = anime_df.merge(right=anime_meta, how='left', on='anime_id')
    anime_full = anime_full.merge(right=count_ratings, how='left', on='anime_id')
    anime_full = anime_full.drop(columns=['title','title_japanese','title_synonyms', 'type_x',
                                      'episodes_y', 'airing', 'score','scored_by', 'members_y', 'background',
                                     'licensor', 'premiered', 'broadcast', 'related', 'genre_x', 'aired_string'])
    anime_full = anime_full.rename(columns={'rating_x': 'avg_rating','rating_y': 'rating_type', 'genre_y':'genre', 
                                        'members_x': 'members', 'episodes_x':'episodes', 'type_y':'type', 0: 'weighted_rating'})
    anime_full = pd.concat([anime_full, weighted_rating(anime_full, 'members','avg_rating')], axis=1)
    anime_full = anime_full.rename(columns={0: 'weighted_rating'})
    
    #Shortening the rating type categories
    rating_type_dict = {'PG-13 - Teens 13 or older': 'PG-13', 'R - 17+ (violence & profanity)': 'R',
                   'PG - Children': 'PG', 'G - All Ages': 'G', 'R+ - Mild Nudity': 'R+', 
                   'Rx - Hentai':'RX', 'None': 'Unknown'}
    anime_full['rating_type'] = anime_full['rating_type'].map(rating_type_dict).fillna('Unknown')
    
    #Filling NaNs
    anime_full['genre'] = anime_full['genre'].fillna('Unknown')
    anime_full['studio'] = anime_full['studio'].fillna('Unknown')
    anime_full['producer'] = anime_full['producer'].fillna('Unknown')
    
    #Formatting the anime titles
    anime_full['name'] = anime_full['name'].str.title()
    anime_full['title_english'] = anime_full['title_english'].str.title()
    return anime_full

In [3]:
role = 'AmazonSageMaker-ExecutionRole-20200524T114773'

anime_df = pd.read_csv('s3://animerec/Anime_Recommender/data/anime.csv')
rating_df = pd.read_csv('s3://animerec/Anime_Recommender/data/rating.csv')
#Remove the -1's, which are no values for the ratings
rating_df = rating_df[rating_df['rating']!=-1]
anime_meta = pd.read_csv('s3://animerec/Anime_Recommender/data/AnimeList_Meta.csv')
users_meta = pd.read_csv('s3://animerec/Anime_Recommender/data/UserList_Meta.csv')

In [4]:
anime_full = full_anime_df(rating_df, anime_df, anime_meta)
anime_map = anime_full[['anime_id','name','title_english', 'type']]

In [5]:
filt = rating_df.groupby('user_id').count()['rating']
user_ids = filt[filt>50].reset_index()['user_id'].values
over_df = rating_df[rating_df['user_id'].isin(user_ids)]
remaining_df = rating_df[~rating_df['user_id'].isin(user_ids)]
over_df.groupby('user_id').count()['rating'].sort_values()
y=over_df['user_id']
X=over_df.drop(columns=['user_id'])
anime_train, anime_test, user_train, user_test = train_test_split(X, y, test_size = 0.20, random_state = 0, stratify=y)
train_over_split = pd.concat([anime_train, user_train],axis=1)
train = pd.concat([train_over_split, remaining_df], axis=0)
test = pd.concat([anime_test, user_test],axis=1)

In [6]:
all_spark = spark.createDataFrame(rating_df)
train_spark = spark.createDataFrame(train)
test_spark = spark.createDataFrame(test)

In [7]:
train_data, val_data = train_spark.randomSplit([0.8, 0.2], seed=0)

In [8]:
#RUN HERE
als_model = ALS(
    itemCol='anime_id',
    userCol='user_id',
    ratingCol='rating',
    nonnegative=True,    
    maxIter=20,
    regParam=0.1,
    rank=10) 
als_model.setColdStartStrategy("drop")

recommender = als_model.fit(train_data)

In [9]:
preds_train = recommender.transform(train_data)
preds_val = recommender.transform(val_data)

In [10]:
preds_test = recommender.transform(test_spark)
predstest_df = preds_test.toPandas()
rmse_test = np.sqrt(mean_squared_error(predstest_df['rating'],predstest_df['prediction']))
print(rmse_test)

1.1336736298424988


In [11]:
anime_features = recommender.itemFactors
user_features = recommender.userFactors
#10
anime_features.take(1)

[Row(id=20, features=[0.38978341221809387, 1.0419751405715942, 0.0, 0.22514164447784424, 0.8924466967582703, 0.008838926441967487, 1.221415400505066, 0.8977494239807129, 1.10505211353302, 0.5589691400527954])]

In [12]:
# Tuning the model with crossvalidation
paramGrid = ParamGridBuilder() \
    .addGrid(als_model.rank, [5, 10, 15]) \
    .addGrid(als_model.maxIter, [10, 15, 20]) \
    .addGrid(als_model.regParam, [0.1, 0.3, 0.5]) \
    .build()

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                predictionCol='prediction')

crossval = CrossValidator(estimator=als_model,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

cvModel = crossval.fit(train_data)

In [13]:
best_model = cvModel.bestModel

In [14]:
# #run me next
preds_cv = best_model.transform(test_spark)
predstest_cvdf = preds_cv.toPandas()
rmse_cv_test = np.sqrt(mean_squared_error(predstest_cvdf['rating'],predstest_cvdf['prediction']))
print(rmse_cv_test)

1.1268130961792433


In [15]:
anime_features_best = best_model.itemFactors
user_features_best = best_model.userFactors
anime_features_best.take(1)
#10

[Row(id=20, features=[0.7130258679389954, 0.940589427947998, 0.9839379787445068, 0.5225687623023987, 0.38873934745788574, 0.0, 1.0954557657241821, 0.8886647820472717, 0.0, 0.02305525913834572, 0.0, 0.9232625365257263, 0.06826217472553253, 0.1739804893732071, 0.706886887550354])]

In [16]:
# preds_cv_train = best_model.transform(train_data)
# preds_cv_val = best_model.transform(val_data)
# predstrain_cvdf = preds_cv_train.toPandas()
# predsval_cvdf = preds_cv_val.toPandas()
# rmse_cv_train = np.sqrt(mean_squared_error(predstrain_cvdf['rating'],predstrain_cvdf['prediction']))
# rmse_cv_val = np.sqrt(mean_squared_error(predsval_cvdf['rating'],predsval_cvdf['prediction']))
# print(rmse_cv_val)

In [21]:
anime_features_best.show()

+---+--------------------+
| id|            features|
+---+--------------------+
| 20|[0.71302587, 0.94...|
| 30|[0.18995094, 0.68...|
| 50|[0.5604036, 1.007...|
| 60|[0.621886, 0.6491...|
| 80|[0.34098896, 0.86...|
| 90|[0.0, 0.68479764,...|
|100|[0.9556084, 0.575...|
|110|[0.51624566, 0.60...|
|120|[1.0026987, 0.680...|
|130|[0.14541881, 0.48...|
|150|[0.5230091, 0.866...|
|160|[0.48701853, 0.52...|
|170|[0.20775515, 1.12...|
|180|[0.6890212, 0.794...|
|190|[0.57296723, 0.74...|
|200|[0.63511837, 0.74...|
|210|[0.36879155, 1.09...|
|220|[0.2209246, 0.0, ...|
|230|[0.6849301, 0.639...|
|240|[0.46028912, 0.62...|
+---+--------------------+
only showing top 20 rows



In [25]:
latent_features = anime_features_best.toPandas()
latent_features

Unnamed: 0,id,features
0,20,"[0.7130258679389954, 0.940589427947998, 0.9839..."
1,30,"[0.18995094299316406, 0.6859272718429565, 0.93..."
2,50,"[0.56040358543396, 1.0076228380203247, 0.14710..."
3,60,"[0.6218860149383545, 0.6491826772689819, 0.244..."
4,80,"[0.34098896384239197, 0.8695628643035889, 0.70..."
...,...,...
9529,33859,"[0.12080681324005127, 0.16797196865081787, 0.0..."
9530,33909,"[0.3020170331001282, 0.4199299216270447, 0.172..."
9531,33979,"[0.14285366237163544, 0.5676022171974182, 0.74..."
9532,34119,"[0.0, 0.08149053901433945, 0.02465521916747093..."


In [27]:
len(latent_features['features'][0])

15

In [29]:
for i in range(15):
    latent_features[f'feature_{i}'] = latent_features['features'].transform(lambda x: x[i])
latent_features

Unnamed: 0,id,features,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14
0,20,"[0.7130258679389954, 0.940589427947998, 0.9839...",0.713026,0.940589,0.983938,0.522569,0.388739,0.000000,1.095456,0.888665,0.000000,0.023055,0.000000,0.923263,0.068262,0.173980,0.706887
1,30,"[0.18995094299316406, 0.6859272718429565, 0.93...",0.189951,0.685927,0.932175,0.093687,0.000000,0.007285,0.356859,0.797809,0.482862,0.048776,0.694572,1.020931,1.001139,1.621585,0.435651
2,50,"[0.56040358543396, 1.0076228380203247, 0.14710...",0.560404,1.007623,0.147102,0.550896,0.218787,0.051769,0.525946,0.000000,0.885374,0.271161,0.584540,1.207577,0.013918,0.379938,0.564550
3,60,"[0.6218860149383545, 0.6491826772689819, 0.244...",0.621886,0.649183,0.244119,0.194147,0.008703,0.366011,1.072108,0.609705,0.659700,0.520808,0.673579,1.157740,0.482303,0.252433,0.197563
4,80,"[0.34098896384239197, 0.8695628643035889, 0.70...",0.340989,0.869563,0.708788,0.164230,1.006376,0.092147,0.538659,0.332149,0.548580,0.713081,0.446756,0.808641,0.778105,0.801491,0.411796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9529,33859,"[0.12080681324005127, 0.16797196865081787, 0.0...",0.120807,0.167972,0.068924,0.075120,0.074838,0.075786,0.124699,0.128570,0.158707,0.077286,0.000000,0.291810,0.063774,0.111954,0.110421
9530,33909,"[0.3020170331001282, 0.4199299216270447, 0.172...",0.302017,0.419930,0.172310,0.187800,0.187095,0.189465,0.311746,0.321425,0.396767,0.193214,0.000000,0.729524,0.159435,0.279885,0.276051
9531,33979,"[0.14285366237163544, 0.5676022171974182, 0.74...",0.142854,0.567602,0.746345,0.377989,0.749866,0.297297,0.214251,0.093118,1.218332,0.002559,0.286949,0.901122,0.354190,0.181628,0.761172
9532,34119,"[0.0, 0.08149053901433945, 0.02465521916747093...",0.000000,0.081491,0.024655,0.411797,0.505208,0.584855,0.191971,0.000000,0.000000,0.642414,0.321395,0.000000,2.331355,0.541732,0.000000


In [30]:
latent_features = latent_features.drop(columns=['features'])
latent_features

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14
0,20,0.713026,0.940589,0.983938,0.522569,0.388739,0.000000,1.095456,0.888665,0.000000,0.023055,0.000000,0.923263,0.068262,0.173980,0.706887
1,30,0.189951,0.685927,0.932175,0.093687,0.000000,0.007285,0.356859,0.797809,0.482862,0.048776,0.694572,1.020931,1.001139,1.621585,0.435651
2,50,0.560404,1.007623,0.147102,0.550896,0.218787,0.051769,0.525946,0.000000,0.885374,0.271161,0.584540,1.207577,0.013918,0.379938,0.564550
3,60,0.621886,0.649183,0.244119,0.194147,0.008703,0.366011,1.072108,0.609705,0.659700,0.520808,0.673579,1.157740,0.482303,0.252433,0.197563
4,80,0.340989,0.869563,0.708788,0.164230,1.006376,0.092147,0.538659,0.332149,0.548580,0.713081,0.446756,0.808641,0.778105,0.801491,0.411796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9529,33859,0.120807,0.167972,0.068924,0.075120,0.074838,0.075786,0.124699,0.128570,0.158707,0.077286,0.000000,0.291810,0.063774,0.111954,0.110421
9530,33909,0.302017,0.419930,0.172310,0.187800,0.187095,0.189465,0.311746,0.321425,0.396767,0.193214,0.000000,0.729524,0.159435,0.279885,0.276051
9531,33979,0.142854,0.567602,0.746345,0.377989,0.749866,0.297297,0.214251,0.093118,1.218332,0.002559,0.286949,0.901122,0.354190,0.181628,0.761172
9532,34119,0.000000,0.081491,0.024655,0.411797,0.505208,0.584855,0.191971,0.000000,0.000000,0.642414,0.321395,0.000000,2.331355,0.541732,0.000000


In [36]:
latent_features = latent_features.rename(columns={'id':'anime_id'})

In [46]:
merged = pd.merge(left=latent_features, right=anime_full, how='left', on='anime_id')
merged
new = merged.drop(columns=['episodes', 'members','image_url', 'source', 'status', 'aired', 'duration', 'rank', 'popularity', 'favorites', 'opening_theme', 'ending_theme','num_ratings'])

In [73]:
feature = 3
col = f'feature_{feature}'
new[[col, 'name', 'avg_rating', 'title_english', 'type',
       'rating_type', 'producer', 'studio', 'genre', 'weighted_rating']].sort_values(ascending=False, by=col)[:15]

Unnamed: 0,feature_3,name,avg_rating,title_english,type,rating_type,producer,studio,genre,weighted_rating
5633,2.957978,The Embryo Develops Into A Fetus,3.76,,ONA,G,Mirai Film,Unknown,Dementia,6.34791
2436,2.294081,G-Senjou No Higeki,3.68,,Movie,PG-13,Kuri Jikken Manga Kobo,Unknown,"Comedy, Dementia",6.372566
6771,2.258823,Doggy Poo,5.82,Doggy Poo,Movie,G,Itasca Studio,Unknown,"Kids, Fantasy",6.389536
108,2.25848,Gun-Dou Musashi,6.23,Gun Samurai,TV,PG-13,TBS,ACC Production,"Adventure, Samurai, Fantasy",6.42283
4158,2.255039,Mini Moni The Tv,5.84,,TV,G,Unknown,Unknown,Comedy,6.462521
1184,2.121452,Ten Little Gall Force,6.44,,OVA,R+,"AIC, Movic, CBS","Artmic, Animate Film","Action, Military, Sci-Fi, Space, Comedy, Parod...",6.472382
183,2.089012,Guy: Youma Kakusei,5.84,Guy: Double Target,OVA,RX,Unknown,Unknown,"Action, Space, Hentai, Sci-Fi",6.438322
1128,2.065415,Doutei Kawaiya,5.67,,ONA,RX,Kojiro Shishido Animation Works,Unknown,Hentai,6.433655
9041,2.002912,Shinpi No Hou,5.37,The Mystical Laws,Movie,G,"Half H.P Studio, Nikkatsu",HS Pictures Studio,"Adventure, Supernatural",6.398164
4441,1.951987,Inazuma Eleven Go: Tcg Cm Ng-Shuu,6.81,,Special,G,Unknown,OLM,Comedy,6.492967


Explore latent features:
<br>0: RX/mature
<br>1: Unclear, mix of random genres, mature
<br>2: Action/Adventure
<br>3: Dementia, fantasy, sci-fi, magic
<br>4: Unclear, random mix of everything
<br>5: Rated G, comedy, kids
<br>6: Romance, Yaoi
<br>7: Supernatural, PG-13 & G, adventure, sci-fi
<br>8: Fantasy, action, PG-13 & G
<br>9: Comedy, romance, drama & school PG-13 & G
<br>10: Super natural, superpower, action, sci-fi PG-13 & G
<br>11: Horror, dementia, mature content with random kids content
<br>12: Sports, action, slice of life, sci-fi
<br>13: Music/drama
<br>14: Mature content with sports theme 
<br>Has majority of genre in each latent feature, with random sprinkles that don't seem to match