In [2]:


import os
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
import hashlib
import numpy as np

# recommendations
import surprise
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise.prediction_algorithms import BaselineOnly, KNNBaseline, Prediction
from surprise import SVD

# metrics
import ml_metrics as metrics

# eda
from data_utils import describe, transformers

# visualization
from matplotlib import pyplot as plt

from numba import jit

### Explore

In [2]:
!ls -lah data/

total 295736
drwxr-xr-x  10 sofiacardita  staff   340B Aug 26 11:32 [1m[36m.[m[m
drwxr-xr-x  10 sofiacardita  staff   340B Aug 26 12:59 [1m[36m..[m[m
-rw-r--r--@  1 sofiacardita  staff   6.0K Aug 26 11:32 .DS_Store
-rw-r--r--   1 sofiacardita  staff   6.6K Aug 26 11:01 example_output.csv
-rw-r--r--   1 sofiacardita  staff    19M Aug 26 11:01 song_tag.zip
-rw-r--r--   1 sofiacardita  staff   9.5M Aug 26 11:01 songs.txt
-rw-r--r--   1 sofiacardita  staff    12M Aug 26 11:01 tags.csv
-rw-r--r--   1 sofiacardita  staff   400K Aug 26 11:01 test_users.csv
-rw-r--r--   1 sofiacardita  staff    86M Mar 11  2012 train_play_counts.txt
-rw-r--r--   1 sofiacardita  staff    18M Aug 26 11:01 train_play_counts.zip


In [3]:
!head data/train_play_counts.txt

fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SOBONKR12A58A7A7E0	1
fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SOEGIYH12A6D4FC0E3	1
fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SOFLJQZ12A6D4FADA6	1
fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SOHTKMO12AB01843B0	1
fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SODQZCY12A6D4F9D11	1
fd50c4007b68a3737fe052d5a4f78ce8aa117f3d	SOXLOQG12AF72A2D55	1
d7083f5e1d50c264277d624340edaaf3dc16095b	SOUVUHC12A67020E3B	1
d7083f5e1d50c264277d624340edaaf3dc16095b	SOUQERE12A58A75633	1
d7083f5e1d50c264277d624340edaaf3dc16095b	SOIPJAX12A8C141A2D	1
d7083f5e1d50c264277d624340edaaf3dc16095b	SOEFCDJ12AB0185FA0	2


In [4]:
!head data/songs.txt

SOAAADD12AB018A9DD 1
SOAAADE12A6D4F80CC 2
SOAAADF12A8C13DF62 3
SOAAADZ12A8C1334FB 4
SOAAAFI12A6D4F9C66 5
SOAAAGK12AB0189572 6
SOAAAGN12AB017D672 7
SOAAAGO12A67AE0A0E 8
SOAAAGP12A6D4F7D1C 9
SOAAAGQ12A8C1420C8 10


### With surprise


In [3]:
def load_file(file, explore=False, delimeter=" ", names=[]):
    ds = pd.read_csv(file, sep=delimeter, names=names, header=0)
    if explore:
        describe.describe_data(ds, file)
    return ds

In [4]:
items = load_file("data/songs.txt", explore=False, delimeter=" ", names=["iid", "item_index"])
ratings = load_file("data/train_play_counts.txt", explore=False, delimeter="\t", names=["uid", "iid", "totals"])

In [None]:
ratings.head()

In [5]:
ratings = ratings.drop_duplicates()
items = items.drop_duplicates()
# random
items = items.sample(frac=1)

In [6]:
test_users = pd.read_csv("data/test_users.csv",sep=" ", names=['uid'])

Unnamed: 0,uid
0,0007140a3796e901f3190f12e9de6d7548d4ac4a
1,000b22f91d4992dba3a80025493059f972a7850e
2,001eed159e8a038e5c5c63fe439ea5acead124f4


In [7]:
ratings.head()

Unnamed: 0,uid,iid,totals
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOXLOQG12AF72A2D55,1


In [8]:
def keep_only_interesting_users(ratings, users):
    users_with_songs = ratings.groupby('uid').totals.sum()
    users_with_songs = users_with_songs[users_with_songs > 30]
    return ratings[ np.array([element in users_with_songs.index for element in ratings.uid])  | np.array([element in users.index for element in ratings.uid])]

In [9]:
ratings_filtered = keep_only_interesting_users(ratings,test_users)

In [10]:
rscale = (ratings_filtered.totals.min(),ratings.totals.max())
rscale

(1, 923)

In [11]:
def prep_dataset(ratings, rscale):
    # A reader is still needed but only the rating_scale param is requiered.
    reader = Reader(rating_scale=rscale) # see df.col.hist()

    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(ratings[['uid', 'iid', 'totals']], reader)
    return data

In [12]:
data = prep_dataset(ratings_filtered, rscale)

In [13]:
def baseline_cross_validate(data, bsl_options=None, cv=5):
    if not bsl_options:
        bsl_options = {'method': 'sgd', 'learning_rate': 0.001, 'reg': 0.05}
    
    baseline = BaselineOnly(bsl_options=bsl_options)
    
    res = cross_validate(baseline, data, measures=['RMSE', 'MAE'], cv=cv, n_jobs=-1)
    print("rmse: %s" % res["test_rmse"].mean())
    print("mae: %s" % res["test_mae"].mean())
    return baseline, res


# You must decide whether or not to use a `Dataset` or a `Trainset`.
baseline, baseline_results = baseline_cross_validate(data)

rmse: 8.553156653229696
mae: 3.7291974355902098


In [14]:
@jit 
def f1():
    test = []
    for uid in list(test_users.index):
        for iid in list(items.iid)[:1000]:
            test.append((uid,iid,1))
    return test

In [15]:

def make_predictions(algo, ratings_train, test_users):
    
    
    algo.fit(ratings_train)
    print('end fit')
    #test_set = ratings_train.build_anti_testset()
    #uuid, iid, fill
    test = f1()
    print('list')
    #test = [item for item in test_set if item[0] in list(test_users.index)]

    preds = algo.test(test)
    return algo, preds


model, preds = make_predictions(baseline, data.build_full_trainset(), list(test_users))

Estimating biases using sgd...
end fit
list


In [16]:
items_2 = items.set_index('iid')

In [17]:
def get_top_n(predictions, n=500):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [34]:
def to_csv(top_n, filename="preds.csv"):
    users = []
    for uid, recs in top_n.items():
        items = [rec[0] for rec in recs]
        urec = (uid, items)
        users.append(urec)
    # sort by user
    recommendations = sorted(users, key=lambda x: x[0])
    
    # do csv
    # User, songid songid 
    s = "User: \n"
    for rec in recommendations:
        user = str(rec[0])
        recs = " ".join(str(r) for r in rec[1])
        s += "{user},{recs}\n".format(user=user, recs=recs)
    #print(s)
    with open(filename,'w') as f:
        f.write(s)
    return recommendations

In [19]:
def get_output():
    with open('output_2.txt','w') as f:
        for key, item in a.items():
            f.write(str(key)+','+",".join([str(items_2.loc[e[0]].values[0]) for e in item]) + '\n')

9999

In [20]:
get_output()

In [33]:
with open('output_2.txt') as f:
    with open('output_3.txt','w') as f2:
        lines = f.readlines()
        for line in lines:
            l_split = line.split(',')
            f2.write(str(test_users.loc[int(l_split[0])].values[0])+','+",".join(l_split[1:]))
    

In [87]:
t = ratings.groupby('iid').totals.sum().nlargest(3000).to_frame().join(items_2)

In [92]:
with open('output_5.txt','w') as f:
    for element in test_users.values:
        f.write(str(element[0])+','+','.join([str(iid) for iid in t.sample(n=500).values]) + "\n")

AssertionError: Failed at object (analyzing bytecode)
SETUP_WITH(arg=78, lineno=3)