In [15]:


import os
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
import hashlib
import numpy as np

# recommendations
import surprise
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise.prediction_algorithms import BaselineOnly, KNNBaseline, Prediction
from surprise import SVD

# metrics
import ml_metrics as metrics

# eda
from data_utils import describe, transformers

# visualization
from matplotlib import pyplot as plt



### Explore

In [36]:
!ls -lah data/

total 453776
drwxr-xr-x  11 sofiacardita  staff   374B Aug 26 10:04 [1m[36m.[m[m
drwxr-xr-x   9 sofiacardita  staff   306B Aug 26 10:34 [1m[36m..[m[m
-rw-r--r--@  1 sofiacardita  staff   6.0K Aug 26 10:04 .DS_Store
-rw-r--r--   1 sofiacardita  staff   6.6K Aug 26 09:55 example_output.csv
-rw-r--r--   1 sofiacardita  staff    77M Jul 28 12:30 song_tag.csv
-rw-r--r--   1 sofiacardita  staff    19M Aug 26 09:55 song_tag.zip
-rw-r--r--   1 sofiacardita  staff   9.5M Aug 26 09:55 songs.txt
-rw-r--r--   1 sofiacardita  staff    12M Aug 26 09:55 tags.csv
-rw-r--r--   1 sofiacardita  staff   400K Aug 26 09:55 test_users.csv
-rw-r--r--   1 sofiacardita  staff    86M Mar 11  2012 train_play_counts.txt
-rw-r--r--   1 sofiacardita  staff    18M Aug 26 09:55 train_play_counts.zip


In [3]:
!head data/train_play_counts.txt

fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SOBONKR12A58A7A7E0	1
fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SOEGIYH12A6D4FC0E3	1
fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SOFLJQZ12A6D4FADA6	1
fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SOHTKMO12AB01843B0	1
fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SODQZCY12A6D4F9D11	1
fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SOXLOQG12AF72A2D55	1
d7083f5e1d50c264277d624340edaaf3dc16095b	SOUVUHC12A67020E3B	1
d7083f5e1d50c264277d624340edaaf3dc16095b	SOUQERE12A58A75633	1
d7083f5e1d50c264277d624340edaaf3dc16095b	SOIPJAX12A8C141A2D	1
d7083f5e1d50c264277d624340edaaf3dc16095b	SOEFCDJ12AB0185FA0	2


In [4]:
!head data/songs.txt

SOAAADD12AB018A9DD 1
SOAAADE12A6D4F80CC 2
SOAAADF12A8C13DF62 3
SOAAADZ12A8C1334FB 4
SOAAAFI12A6D4F9C66 5
SOAAAGK12AB0189572 6
SOAAAGN12AB017D672 7
SOAAAGO12A67AE0A0E 8
SOAAAGP12A6D4F7D1C 9
SOAAAGQ12A8C1420C8 10


In [4]:
def make_ratings(path):

    users = make_users()
    items = make_items()

    #users_ = read_array_from_csv(path, 'object', 0, delimiter="\t")
    items_ = read_array_from_csv(path, 'object', 1)

    rows = make_rows(users)
    cols = make_cols(items, items_)

    nrows = users.shape[0]
    ncols = items.shape[0]

    shape = (nrows, ncols)

    data = np.ones(rows.size)

    return coo_matrix((data, (rows, cols)), shape=shape)


def make_users():
    path = os.path.join('data', 'train_play_counts.txt')
    users = read_array_from_csv(path, 'object', 0, delimiter="\t")
    return users[users.argsort()]


def make_items():
    path = os.path.join('data', 'songs.txt')
    items = read_array_from_csv(path, 'object', 0, delimiter=" ")
    return items[items.argsort()]


def read_array_from_csv(path, dtype, column, delimiter=" "):
    return np.genfromtxt(path, dtype=dtype, skip_header=False, usecols=[column],
                         delimiter=delimiter)

def make_rows(users):
    rows = np.unique(users)
    #rows = [np.argwhere(users == u)[0, 0] for u in users_]
    return rows


def make_cols(items, items_):
    cols = [np.argwhere(items == i)[0, 0] for i in items_]
    return np.array(cols)

In [None]:
R = make_ratings("data/train_play_counts.txt")

In [6]:
path = os.path.join('data', 'songs.txt')
items = read_array_from_csv(path, 'object', 0, delimiter=" ")

b'SOAAADD12AB018A9DD'

### With surprise


In [27]:
def load_file(file, explore=False, delimeter=" ", names=[]):
    ds = pd.read_csv(file, sep=delimeter, names=names, header=0)
    if explore:
        describe.describe_data(ds, file)
    return ds

In [28]:
items = load_file("data/songs.txt", explore=True, delimeter=" ", names=["iid", "item_index"])
ratings = load_file("data/train_play_counts.txt", explore=True, delimeter="\t", names=["uid", "iid", "totals"])

Describing 'data/songs.txt'
type:  class 'pandas.core.frame.DataFrame'
shape:  (386212, 2)
.dtypes:
0. iid : object (total_values: 386212; values: -) (nans: 0) -------- Object!!
1. item_index : int64 (total_values: 386212; values: -) (nans: 0)
Total nans 0
.index.names:  [None]
Describing 'data/train_play_counts.txt'
type:  class 'pandas.core.frame.DataFrame'
shape:  (1450932, 3)
.dtypes:
0. uid : object (total_values: 110000; values: -) (nans: 0) -------- Object!!
1. iid : object (total_values: 163206; values: -) (nans: 0) -------- Object!!
2. totals : int64 (total_values: 299; values: -) (nans: 0)
Total nans 0
.index.names:  [None]


In [29]:
ratings.head()

Unnamed: 0,uid,iid,totals
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOXLOQG12AF72A2D55,1


In [20]:
ratings = ratings.drop_duplicates()
items = items.drop_duplicates()


In [36]:
rscale = (ratings.totals.min(),ratings.totals.max())
rscale

(1, 923)

In [38]:
def prep_dataset(ratings, rscale):
    # A reader is still needed but only the rating_scale param is requiered.
    reader = Reader(rating_scale=rscale) # see df.col.hist()

    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(ratings[['uid', 'iid', 'totals']], reader)
    return data

In [40]:
data = prep_dataset(ratings, rscale)

In [39]:
def baseline_cross_validate(data, bsl_options=None, cv=5):
    if not bsl_options:
        bsl_options = {'method': 'sgd', 'learning_rate': 0.00005, 'reg': 0.05}
    
    baseline = BaselineOnly(bsl_options=bsl_options)
    
    res = cross_validate(baseline, data, measures=['RMSE', 'MAE'], cv=cv, n_jobs=-1)
    print("rmse: %s" % res["test_rmse"].mean())
    print("mae: %s" % res["test_mae"].mean())
    return baseline, res


# You must decide whether or not to use a `Dataset` or a `Trainset`.
baseline, baseline_results = baseline_cross_validate(data)

rmse: 7.004295561788455
mae: 2.909209271160712


In [None]:
def make_predictions(algo, ratings_train):
    
    
    algo.fit(ratings_train)
    
    test_set = ratings_train.build_anti_testset()
    
    preds = algo.test(test_set)
    return algo, preds

    
model, preds = make_predictions(baseline, data.build_full_trainset())

Estimating biases using sgd...


In [None]:
def get_top_n(predictions, n=500):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [None]:
def to_csv(top_n, filename="preds.csv"):
    users = []
    for uid, recs in top_n.items():
        items = [rec[0] for rec in recs]
        urec = (uid, items)
        users.append(urec)
    # sort by user
    recommendations = sorted(users, key=lambda x: x[0])
    
    # do csv
    # User, songid songid 
    s = "User: \n"
    for rec in recommendations:
        user = str(rec[0])
        recs = " ".join(str(r) for r in rec[1])
        s += "{user},{recs}\n".format(user=user, recs=recs)
    #print(s)
    with open(filename,'w') as f:
        f.write(s)
    return recommendations