# Load the required modules

In [1]:

import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split


# load the data

In [2]:

transactions = pd.read_csv('transaction_data.csv')

In [3]:
data = transactions

In [4]:
customer = data['customerid'].unique()

In [5]:
customers = pd.DataFrame(customer, columns=['customerid'])

# create dummy data

In [6]:

def create_data_dummy(datas):
    data_dummy = datas.copy()
    data_dummy ['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)
    

In [7]:
data_dummy

Unnamed: 0,customerid,productid,purchase_count,purchase_dummy
0,11000,344,1,1
1,11000,353,1,1
2,11000,485,1,1
3,11000,488,1,1
4,11000,530,1,1
5,11000,541,1,1
6,11000,573,1,1
7,11000,214,1,1
8,11001,217,1,1
9,11001,350,1,1


# Normalise item values across users

In [8]:

df_matrix = pd.pivot_table(data, values='purchase_count', index ='customerid', columns='productid')

In [9]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

# create a table for input to the modeling

In [10]:

d = df_matrix_norm.reset_index()
d.index.names = ['scaled_purchased-freq']
data_norm = pd.melt(d,id_vars=['customerid'],
value_name ='scaled_purchase_freq').dropna()

print(data_norm.shape)
data_norm.head()

(40031, 3)


Unnamed: 0,customerid,productid,scaled_purchase_freq
0,11000,214,0.0
4,11004,214,0.0
7,11007,214,0.0
8,11008,214,0.0
19,11019,214,0.0


# Split data into training and testing set

In [11]:
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [12]:
data

Unnamed: 0,customerid,productid,purchase_count
0,11000,344,1
1,11000,353,1
2,11000,485,1
3,11000,488,1
4,11000,530,1
5,11000,541,1
6,11000,573,1
7,11000,214,1
8,11001,217,1
9,11001,350,1


In [13]:
train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [14]:
# constant variables to define field names include:
user_id = 'customerid'
item_id = 'productid'
users_to_recommend = list(customers[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

# training data using 3 different algorithms

In [15]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                        user_id=user_id, 
                                                        item_id=item_id, 
                                                        target=target, 
                                                        similarity_type='pearson')

    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

# training data in each target using popularity algorithm

In [16]:
name = 'popularity'
target = 'purchase_count'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerid | productid |       score        | rank |
+------------+-----------+--------------------+------+
|   11000    |    480    | 1.086067319982957  |  1   |
|   11000    |    529    | 1.0600109110747409 |  2   |
|   11000    |    528    | 1.046402724563644  |  3   |
|   11000    |    539    | 1.0446927374301676 |  4   |
|   11000    |    477    | 1.043200486766048  |  5   |
|   11000    |    222    | 1.0374923171481254 |  6   |
|   11000    |    484    | 1.0363372093023255 |  7   |
|   11000    |    535    | 1.0308823529411764 |  8   |
|   11000    |    217    | 1.0291201982651796 |  9   |
|   11000    |    225    | 1.0267538644470868 |  10  |
|   11001    |    480    | 1.086067319982957  |  1   |
|   11001    |    529    | 1.0600109110747409 |  2   |
|   11001    |    528    | 1.046402724563644  |  3   |
|   11001    |    539    | 1.0446927374301676 |  4   |
|   11001    |    477    | 1.043200486766048  |  5   |
|   11001 

In [17]:
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)


+------------+-----------+-------+------+
| customerid | productid | score | rank |
+------------+-----------+-------+------+
|   11000    |    477    |  1.0  |  1   |
|   11000    |    484    |  1.0  |  2   |
|   11000    |    465    |  1.0  |  3   |
|   11000    |    478    |  1.0  |  4   |
|   11000    |    333    |  1.0  |  5   |
|   11000    |    536    |  1.0  |  6   |
|   11000    |    579    |  1.0  |  7   |
|   11000    |    390    |  1.0  |  8   |
|   11000    |    540    |  1.0  |  9   |
|   11000    |    217    |  1.0  |  10  |
|   11001    |    537    |  1.0  |  1   |
|   11001    |    484    |  1.0  |  2   |
|   11001    |    465    |  1.0  |  3   |
|   11001    |    333    |  1.0  |  4   |
|   11001    |    536    |  1.0  |  5   |
|   11001    |    579    |  1.0  |  6   |
|   11001    |    390    |  1.0  |  7   |
|   11001    |    488    |  1.0  |  8   |
|   11001    |    540    |  1.0  |  9   |
|   11001    |    214    |  1.0  |  10  |
|   11002    |    465    |  1.0  |

In [18]:
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerid | productid |        score         | rank |
+------------+-----------+----------------------+------+
|   11000    |    486    | 0.02197802197802198  |  1   |
|   11000    |    539    |         0.02         |  2   |
|   11000    |    540    | 0.016511867905056758 |  3   |
|   11000    |    467    | 0.014450867052023121 |  4   |
|   11000    |    561    | 0.014285714285714285 |  5   |
|   11000    |    463    | 0.013157894736842105 |  6   |
|   11000    |    465    | 0.013089005235602094 |  7   |
|   11000    |    481    | 0.013043478260869565 |  8   |
|   11000    |    491    | 0.012422360248447204 |  9   |
|   11000    |    483    | 0.011857707509881422 |  10  |
|   11001    |    486    | 0.02197802197802198  |  1   |
|   11001    |    539    |         0.02         |  2   |
|   11001    |    540    | 0.016511867905056758 |  3   |
|   11001    |    467    | 0.014450867052023121 |  4   |
|   11001    |    561    | 0.01

# Collaborative filtering models

In [19]:
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerid | productid |        score         | rank |
+------------+-----------+----------------------+------+
|   11000    |    480    | 0.10125453770160675  |  1   |
|   11000    |    477    | 0.09121781587600708  |  2   |
|   11000    |    478    | 0.08351795375347137  |  3   |
|   11000    |    528    | 0.08085720986127853  |  4   |
|   11000    |    222    | 0.07712101936340332  |  5   |
|   11000    |    529    | 0.07613804936408997  |  6   |
|   11000    |    225    | 0.07359151542186737  |  7   |
|   11000    |    537    | 0.07198265194892883  |  8   |
|   11000    |    217    | 0.06905513256788254  |  9   |
|   11000    |    487    | 0.058402761816978455 |  10  |
|   11001    |    477    | 0.22427197865077428  |  1   |
|   11001    |    528    | 0.09053138324192592  |  2   |
|   11001    |    214    | 0.08361170121601649  |  3   |
|   11001    |    222    | 0.07805293798446655  |  4   |
|   11001    |    537    | 0.07

In [20]:
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerid | productid |        score         | rank |
+------------+-----------+----------------------+------+
|   11000    |    477    | 0.07888364046812057  |  1   |
|   11000    |    478    |  0.078419528901577   |  2   |
|   11000    |    480    | 0.07045881450176239  |  3   |
|   11000    |    225    | 0.06927888095378876  |  4   |
|   11000    |    537    | 0.06604594737291336  |  5   |
|   11000    |    222    | 0.06296557188034058  |  6   |
|   11000    |    528    | 0.061501286923885345 |  7   |
|   11000    |    479    | 0.05382126569747925  |  8   |
|   11000    |    487    | 0.05250490456819534  |  9   |
|   11000    |    217    | 0.04830383509397507  |  10  |
|   11001    |    479    | 0.11889280378818512  |  1   |
|   11001    |    528    | 0.07885675877332687  |  2   |
|   11001    |    537    | 0.07849996536970139  |  3   |
|   11001    |    222    | 0.07350622862577438  |  4   |
|   11001    |    214    |  0.0

In [21]:
name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-----------------------+------+
| customerid | productid |         score         | rank |
+------------+-----------+-----------------------+------+
|   11000    |    228    |          0.0          |  1   |
|   11000    |    480    |          0.0          |  2   |
|   11000    |    490    |          0.0          |  3   |
|   11000    |    529    |          0.0          |  4   |
|   11000    |    538    |          0.0          |  5   |
|   11000    |    222    |          0.0          |  6   |
|   11000    |    465    |          0.0          |  7   |
|   11000    |    477    |          0.0          |  8   |
|   11000    |    540    |          0.0          |  9   |
|   11000    |    528    |          0.0          |  10  |
|   11001    |    540    |  0.009339610735575357 |  1   |
|   11001    |    529    |  0.00846711794535319  |  2   |
|   11001    |    217    | 0.0075136224428812666 |  3   |
|   11001    |    535    | 0.0074531833330790205 |  4   |
|   11001    |

In [22]:
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerid | productid |       score        | rank |
+------------+-----------+--------------------+------+
|   11000    |    480    | 1.080391661399314  |  1   |
|   11000    |    529    | 1.0533782179628457 |  2   |
|   11000    |    528    | 1.041674399852322  |  3   |
|   11000    |    477    | 1.0406209467517833 |  4   |
|   11000    |    539    | 1.0384664213357693 |  5   |
|   11000    |    222    | 1.0335896955808759 |  6   |
|   11000    |    484    | 1.0335470413745838 |  7   |
|   11000    |    217    | 1.0264670986686795 |  8   |
|   11000    |    535    | 1.0255460906554668 |  9   |
|   11000    |    225    | 1.0240146435894661 |  10  |
|   11001    |    480    | 1.0833192818953827 |  1   |
|   11001    |    529    | 1.058114546726467  |  2   |
|   11001    |    528    | 1.0443920810814364 |  3   |
|   11001    |    539    | 1.041689804336688  |  4   |
|   11001    |    477    | 1.0415250257462618 |  5   |
|   11001 

In [23]:
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerid | productid | score | rank |
+------------+-----------+-------+------+
|   11000    |    477    |  0.0  |  1   |
|   11000    |    484    |  0.0  |  2   |
|   11000    |    465    |  0.0  |  3   |
|   11000    |    478    |  0.0  |  4   |
|   11000    |    333    |  0.0  |  5   |
|   11000    |    536    |  0.0  |  6   |
|   11000    |    579    |  0.0  |  7   |
|   11000    |    390    |  0.0  |  8   |
|   11000    |    540    |  0.0  |  9   |
|   11000    |    217    |  0.0  |  10  |
|   11001    |    537    |  0.0  |  1   |
|   11001    |    484    |  0.0  |  2   |
|   11001    |    465    |  0.0  |  3   |
|   11001    |    333    |  0.0  |  4   |
|   11001    |    536    |  0.0  |  5   |
|   11001    |    579    |  0.0  |  6   |
|   11001    |    390    |  0.0  |  7   |
|   11001    |    488    |  0.0  |  8   |
|   11001    |    540    |  0.0  |  9   |
|   11001    |    214    |  0.0  |  10  |
|   11002    |    465    |  0.0  |

In [24]:
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerid | productid |        score         | rank |
+------------+-----------+----------------------+------+
|   11000    |    486    | 0.021947981237055186 |  1   |
|   11000    |    539    | 0.018843276500701906 |  2   |
|   11000    |    540    | 0.015141092205441284 |  3   |
|   11000    |    561    | 0.014285106318337577 |  4   |
|   11000    |    467    | 0.014211470956747236 |  5   |
|   11000    |    481    | 0.013025334606999938 |  6   |
|   11000    |    465    | 0.01278133798020049  |  7   |
|   11000    |    463    | 0.01254470599325079  |  8   |
|   11000    |    491    | 0.012365866966129086 |  9   |
|   11000    |    483    | 0.011752374181634344 |  10  |
|   11001    |    486    | 0.024872244714380626 |  1   |
|   11001    |    539    | 0.02477716326713562  |  2   |
|   11001    |    540    | 0.02462709989587097  |  3   |
|   11001    |    535    | 0.018592558402583427 |  4   |
|   11001    |    463    | 0.01

In [25]:
popularity_model =popularity

In [26]:
models_w_counts = [popularity_model, cos, pear]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]
names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

In [27]:
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.08354372274500814 | 0.06291713571665368 |
|   2    | 0.07005967408767494 | 0.10465079000768696 |
|   3    | 0.06939025323234643 | 0.15576016153535094 |
|   4    |  0.0562887307780583 | 0.16641246889697678 |
|   5    | 0.06359880651824623 | 0.23536896473836996 |
|   6    | 0.06112768724657635 | 0.27162648319981353 |
|   7    | 0.05939539001278737 |  0.3089449198698683 |
|   8    | 0.05687686481523985 |  0.3365829845058675 |
|   9    | 0.05289062300767605 |  0.3500940832304155 |
|   10   | 0.05078035345421155 |  0.3740868553067317 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.2671344715660716

Per User RMSE (best)
+------------+------+-------+
| customerid | rmse | count |
+------------+------+-------+
|   12004 


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.26807436309387234 | 0.20951155957754505 |
|   2    | 0.19061280697727795 | 0.29094678878360325 |
|   3    | 0.15167163950730647 |  0.3451084461785633 |
|   4    | 0.12901652513197137 |  0.3912179269994281 |
|   5    |  0.1136102823043378 |  0.4283719229410075 |
|   6    | 0.10224925407390402 | 0.46159664615597634 |
|   7    |  0.093003049280304  | 0.48818280927243546 |
|   8    | 0.08669956392012851 |  0.5199988797446906 |
|   9    | 0.08024124652538694 |  0.5386237162602785 |
|   10   | 0.07458113380766593 |  0.5553804132011606 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9632051196852778

Per User RMSE (best)
+------------+---------------------+-------+
| customerid |         rmse        | count |
+---------


Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.06873995868717012  | 0.05283073944136597 |
|   2    | 0.05399357355978886  | 0.08081799585414454 |
|   3    | 0.053285900084155645 | 0.11897694277772326 |
|   4    | 0.05144021115446409  | 0.15347934904969565 |
|   5    | 0.05327059903603376  | 0.19772783989274695 |
|   6    | 0.05053171142223246  |  0.2243153691742168 |
|   7    | 0.04895242466966143  | 0.25304367902772734 |
|   8    | 0.04624741794812945  |  0.2728407762003132 |
|   9    | 0.044079769464208295 | 0.29285099511459406 |
|   10   | 0.04213908652742716  | 0.31126894870870103 |
+--------+----------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.2466098741004129

Per User RMSE (best)
+------------+------+-------+
| customerid | rmse | count |
+------------+------+-----


Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.051757700675598296 | 0.03763147993652973 |
|   2    | 0.04935302874155502  |  0.0727953084359818 |
|   3    | 0.03820756517424325  | 0.08470797685844064 |
|   4    | 0.030602313065384235 | 0.09068912117692612 |
|   5    | 0.025420817588457434 | 0.09435720011123659 |
|   6    | 0.02242452001984806  |  0.0996503410708152 |
|   7    | 0.02241088809278418  | 0.11663872121618611 |
|   8    | 0.02161342035955568  |  0.1267803296745243 |
|   9    | 0.023512347799534657 | 0.15322790400942263 |
|   10   | 0.02337112103515381  | 0.16907297443195748 |
+--------+----------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+------+-------+
| customerid | rmse | count |
+------------+------+-------+
|   16436  


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    |  0.2816901408450708 | 0.21896282846128212 |
|   2    | 0.20130539333562353 | 0.30793309995474244 |
|   3    |  0.1595099049582048 |  0.3645948863915204 |
|   4    | 0.13403183327607932 | 0.40768077298479294 |
|   5    | 0.11805794114279174 |  0.448540429569286  |
|   6    | 0.10611092026413223 |  0.4830608311113295 |
|   7    | 0.09695571804812578 |  0.5139695898971062 |
|   8    | 0.08865796404442913 |  0.5360841198955254 |
|   9    |  0.0820896471875516 |  0.5569686411149819 |
|   10   | 0.07615939539677095 |  0.5735047139203787 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9125091421482661

Per User RMSE (best)
+------------+-------------------+-------+
| customerid |        rmse       | count |
+------------+


Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.05175770067559829  | 0.03763147993652976 |
|   2    | 0.049353028741555005 | 0.07279530843598184 |
|   3    | 0.03820756517424327  | 0.08470797685844077 |
|   4    | 0.030602313065384155 | 0.09068912117692612 |
|   5    | 0.025420817588457455 | 0.09435720011123667 |
|   6    | 0.022424520019848057 | 0.09965034107081518 |
|   7    | 0.02241088809278413  | 0.11663872121618617 |
|   8    | 0.021613420359555723 | 0.12678032967452413 |
|   9    | 0.023512347799534684 | 0.15322790400942213 |
|   10   | 0.023371121035153826 |  0.1690729744319577 |
+--------+----------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+------+-------+
| customerid | rmse | count |
+------------+------+-------+
|   16436  


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.009737248840803707 | 0.007677927430632224 |
|   2    | 0.018701700154559502 | 0.028995731213660128 |
|   3    | 0.021432251416795455 | 0.04958820931773018  |
|   4    | 0.019204018547140624 | 0.05982777655111503  |
|   5    | 0.016754250386398894 | 0.06436409803488609  |
|   6    | 0.01656362699639364  | 0.07596121292411867  |
|   7    | 0.01651578714948111  | 0.08819864576433363  |
|   8    | 0.015996908809891797 |  0.097302200632958   |
|   9    | 0.015936802335566037 | 0.10913777875910796  |
|   10   | 0.015996908809891783 | 0.12220210495326414  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.06612971082198302

Per User RMSE (best)
+------------+----------------------+-------+
| customerid |         rm


Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.08315301391035552  | 0.06451203356149252 |
|   2    | 0.06754250386398775  | 0.10229300066239788 |
|   3    | 0.08083462132921146  | 0.19089570913373083 |
|   4    | 0.06649922720247314  | 0.20735519246338385 |
|   5    | 0.06380216383307549  | 0.24955140943548962 |
|   6    | 0.06164348274085522  | 0.28770847133289146 |
|   7    | 0.06228748068006197  | 0.34202583351733273 |
|   8    | 0.06163060278207113  | 0.38826746154412284 |
|   9    | 0.06172076249355999  |  0.4410973724884082 |
|   10   | 0.060927357032457594 | 0.48828181349819677 |
+--------+----------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.06183381961328656

Per User RMSE (best)
+------------+------+-------+
| customerid | rmse | count |
+------------+------+----


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.01282843894899537  | 0.009112754839184518 |
|   2    | 0.01924265842349305  | 0.02772576727754472  |
|   3    | 0.01885625965996905  | 0.03978803267829542  |
|   4    | 0.018083462132921155 |  0.0499698240965629  |
|   5    | 0.017496136012364837 |  0.060780157503496   |
|   6    | 0.018031942297784695 | 0.07528188709796141  |
|   7    | 0.01761978361669239  | 0.08638073158165892  |
|   8    |  0.0178323029366306  | 0.10092478104070073  |
|   9    | 0.017757169843723334 |  0.1148145285935085  |
|   10   | 0.017851622874806714 | 0.12979392065945417  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.06158153457025342

Per User RMSE (best)
+------------+------------------------+-------+
| customerid |         

# we select Cosine similarity on Purchase dummy approach as our final model

In [28]:
final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', similarity_type='cosine')
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+----------------------+------+
| customerid | productid |        score         | rank |
+------------+-----------+----------------------+------+
|   11000    |    478    | 0.10096639394760132  |  1   |
|   11000    |    477    |  0.0984770879149437  |  2   |
|   11000    |    225    | 0.08770759403705597  |  3   |
|   11000    |    480    | 0.08636465668678284  |  4   |
|   11000    |    537    | 0.08083739131689072  |  5   |
|   11000    |    528    | 0.07649999111890793  |  6   |
|   11000    |    217    | 0.07149147987365723  |  7   |
|   11000    |    479    | 0.06456053256988525  |  8   |
|   11000    |    487    | 0.06324635446071625  |  9   |
|   11000    |    467    | 0.05609729140996933  |  10  |
|   11001    |    537    | 0.09024447798728943  |  1   |
|   11001    |    214    | 0.08879245519638061  |  2   |
|   11001    |    528    | 0.08163455724716187  |  3   |
|   11001    |    487    | 0.07803999781608581  |  4   |
|   11001    |    222    | 0.07

In [29]:
#csv output file
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(184840, 4)


Unnamed: 0,customerid,productid,score,rank
0,11000,478,0.100966,1
1,11000,477,0.098477,2
2,11000,225,0.087708,3
3,11000,480,0.086365,4
4,11000,537,0.080837,5


In [30]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['customerid', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customerid').set_index('customerid')
    if print_csv:
        df_output.to_csv('option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [31]:
df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(18484, 1)


Unnamed: 0_level_0,recommendedProducts
customerid,Unnamed: 1_level_1
11000,486|539|540|561|467|481|465|463|491|483
11001,486|539|540|535|463|529|231|481|528|467
11002,486|539|540|467|481|465|491|463|483|231
11003,486|539|561|467|540|481|465|491|463|483
11004,486|539|540|561|467|481|465|491|463|483
