In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
pd.set_option('display.max_columns', None)
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from src.data_funcs import *
from src.model_funcs import *

In [3]:
anime_df = pd.read_csv('data/anime.csv')
rating_df = pd.read_csv('data/rating.csv')
anime_meta = pd.read_csv('data/AnimeList_meta.csv')
users_meta = pd.read_csv('data/UserList_Meta.csv')

## Model Based Matrix Factorization Recommenders System

In [177]:
filt = rating_df.groupby('user_id').count()['rating']
user_ids = filt[filt>2].reset_index()['user_id'].values
over_2_df = rating_df[rating_df['user_id'].isin(user_ids)]
remaining_df = rating_df[~rating_df['user_id'].isin(user_ids)]
over_2_df.groupby('user_id').count()['rating'].sort_values()
over_2_df

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
156,3,20,8
...,...,...,...
7813730,73515,13659,8
7813731,73515,14345,7
7813732,73515,16512,7
7813733,73515,17187,9


## Train Test Split

In [178]:
y=over_2_df['user_id']
X=over_2_df.drop(columns=['user_id'])
anime_train, anime_test, user_train, user_test = train_test_split(X, y, test_size = 0.25, random_state = 0, stratify=y)

In [179]:
train_over2_split = pd.concat([anime_train, user_train],axis=1)
train = pd.concat([train_over2_split, remaining_df], axis=0)
train

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,anime_id,rating,user_id
6381259,199,8,59188
3843912,31704,7,35976
3658105,18897,8,34222
3560929,1250,7,32998
3113278,2476,9,28781
...,...,...,...
7813057,6880,8,73505
7813341,199,10,73509
7813538,512,10,73514
7813735,790,9,73516


In [180]:
test = pd.concat([anime_test, user_test],axis=1)
test

Unnamed: 0,anime_id,rating,user_id
3228697,21437,7,29831
393203,1723,10,4063
464724,3712,8,4787
1542021,527,10,14884
1252512,20159,6,11837
...,...,...,...
1700046,635,8,16458
3363340,440,9,30972
1657553,14289,7,16106
5523090,59,10,51976


## Using Spark ALS

In [186]:
from pyspark.sql import SparkSession

# Setup a SparkSession
spark = SparkSession.builder.getOrCreate()

# Convert a Pandas DF to a Spark DF
#spark_df = spark.createDataFrame(pandas_df) 

# Convert a Spark DF to a Pandas DF
#pandas_df = spark_df.toPandas()

In [187]:
train_spark = spark.createDataFrame(train)
test_spark = spark.createDataFrame(test)
#Note to self: Add some cross validation

In [219]:
from pyspark.ml.recommendation import ALS
als_model = ALS(
    itemCol='anime_id',
    userCol='user_id',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    rank=10) 
als_model.setColdStartStrategy("drop")

ALS_9b3bd4d65bd4

In [220]:
recommender = als_model.fit(train_spark)

In [221]:
os.system("say 'cao ni ma'") 

0

In [222]:
predictions = recommender.transform(test_spark)

0

In [223]:
predictions.show()
os.system("say 'Complete'") 

+--------+------+-------+----------+
|anime_id|rating|user_id|prediction|
+--------+------+-------+----------+
|     148|     6|  49607| 6.2774863|
|     148|     4|  49405| 5.9500914|
|     148|     7|  57661|  7.069685|
|     148|     7|  49418|  6.751053|
|     148|     7|    916|  6.811444|
|     148|    10|   7266|   7.09548|
|     148|     8|  11493| 6.5894094|
|     148|     4|  12431| 4.6443233|
|     148|     7|  55121|  6.000699|
|     148|    10|   6223|  8.358974|
|     148|     6|  61257| 6.5924067|
|     148|     8|  58606| 7.2540507|
|     148|     2|   1497|  6.046259|
|     148|     7|  61445| 6.9690657|
|     148|     8|  13341| 6.7941628|
|     148|     7|  59311|  7.173847|
|     148|     6|  19994| 6.4708133|
|     148|     7|  41536| 6.2356434|
|     148|     4|  58142|    7.0969|
|     148|     6|  30476|  6.423863|
+--------+------+-------+----------+
only showing top 20 rows



In [224]:
predictions_df = predictions.toPandas()
predictions_df

Unnamed: 0,anime_id,rating,user_id,prediction
0,148,6,49607,6.277486
1,148,4,49405,5.950091
2,148,7,57661,7.069685
3,148,7,49418,6.751053
4,148,7,916,6.811444
...,...,...,...,...
1582153,33372,7,35486,6.959330
1582154,33372,7,72872,6.631885
1582155,33372,7,39039,7.066941
1582156,33372,9,71196,6.825010


In [225]:
predictions_df.isnull().sum()

anime_id      0
rating        0
user_id       0
prediction    0
dtype: int64

In [226]:
rmse = np.sqrt(mean_squared_error(predictions_df['rating'],predictions_df['prediction']))
rmse

1.1653227752840312

## Surprise Package

In [198]:
from surprise import SVDpp, NormalPredictor, Dataset, Reader, accuracy
from surprise import KNNBaseline, NMF, SVD
from surprise.model_selection import KFold, cross_validate, GridSearchCV, train_test_split

In [227]:
reader = Reader(rating_scale=(1, 10))
all_data = Dataset.load_from_df(rating_df, reader)
train_surprise = Dataset.load_from_df(train, reader)
test_surprise = Dataset.load_from_df(test, reader)

In [228]:
kf = KFold(n_splits=3)

algo = SVD()

for trainset, testset in kf.split(train_surprise):

    # train and test algorithm.
    algo.fit(trainset)
    preds_surprise = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(preds_surprise, verbose=True)

RMSE: 42352.0111
RMSE: 42312.8853
RMSE: 42306.0748


In [229]:
os.system("say 'Model Complete'") 

0

In [None]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])