### Purchase Prediction Problem - Collaborative Filtering

After some difficulty getting matrix factorization to work, I am now attempting to follow a tutorial I found online [here](https://medium.com/datadriveninvestor/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6). Note that the `turicreate` package only works on Linux. Fortunately, Google Colab's VMs run Linux.

To further help, I have cut the training data in half by removing users who do not appear in the test data.

In [1]:
# Mount Google Drive at /content/drive (Colab only); the mount call asks for an
# authorization code interactively before it completes.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
!pip install turicreate

In [1]:
import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

In [2]:
# Load the reduced training interactions from the mounted Drive folder.
data = pd.read_csv('/content/drive/My Drive/train_reduced.csv') 
# Show the loaded frame as the cell output.
data

Unnamed: 0,ReviewerID,ItemID,TestReviewCount
0,U490934656,I402344648,1
1,U490934656,I330290793,1
2,U490934656,I296399509,1
3,U361187730,I773829721,1
4,U361187730,I781019543,1
...,...,...,...
103532,U804031374,I581484959,1
103533,U916222473,I681131803,3
103534,U884804801,I848045217,1
103535,U874043847,I063345855,3


In [3]:
# Import test data: every entry of the 'reviewerID-itemID' column is a
# "<user>-<item>" string that is split into its two halves.

test_allcol = pd.read_csv('/content/drive/My Drive/pairs_Purchase.txt')
pairings = list(test_allcol['reviewerID-itemID'])

# One [user, item] pair per test row; unpacking below requires exactly two parts.
split_pairs = [pairing.split('-') for pairing in pairings]
test_users = [user for user, _ in split_pairs]
test_items = [item for _, item in split_pairs]

In [4]:
# Sanity check: both lists are built from the same pairs, so the two counts should match.
print(len(test_users))
print(len(test_items))

28000
28000


In [5]:
# Convert the two parallel lists into a pandas DataFrame, one row per test pair.

test_df = pd.DataFrame({'ReviewerID': test_users, 'ItemID': test_items})

test_df

Unnamed: 0,ReviewerID,ItemID
0,U938994110,I529819131
1,U181459539,I863471064
2,U941668816,I684585522
3,U768449391,I782253949
4,U640450168,I232683472
...,...,...
27995,U337041888,I763827121
27996,U457455307,I242828364
27997,U052546714,I111529174
27998,U566804667,I857242737


In [6]:
# Create purchase count: every row of the training data counts as one purchase.
data = data.assign(PurchaseCount=1)

In [7]:
# Create the user x item interaction matrix (rows: ReviewerID, columns: ItemID)
# with a pandas pivot table.
df_matrix = data.pivot_table(index='ReviewerID', columns='ItemID', values='PurchaseCount')


In [None]:
# Preview the interaction matrix; user/item pairs without a purchase
# show up as NaN rather than 0.
df_matrix.head()

In [8]:
# Normalize purchase counts. All counts are 1 in this data, so the step is left commented out here; it is kept because it can be useful for other datasets.
# df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

In [8]:
# Reshape the wide user x item matrix into a long table for modeling:
# one row per (ReviewerID, ItemID) pair that has a value.
# Move ReviewerID out of the index so it can serve as the id column below.
d = df_matrix.reset_index() 
# NOTE(review): this only names the row index, which does not appear among the
# data_norm columns shown in the next cell's output - looks removable; confirm.
d.index.names = ['scaled_purchase_freq'] 
# Unpivot the item columns into rows, then drop the NaN (no purchase) pairs.
data_norm = pd.melt(d, id_vars=['ReviewerID'], value_name='scaled_purchase_freq').dropna()

In [9]:
# Report the size of the long-format table, then preview its first rows.
print(data_norm.shape)
data_norm.head()

(103537, 3)


Unnamed: 0,ReviewerID,ItemID,scaled_purchase_freq
6591,U328848199,I000024906,1.0
7760,U388298041,I000024906,1.0
16031,U804451000,I000024906,1.0
16369,U822597978,I000024906,1.0
25004,U257785428,I000030838,1.0


In [10]:
# Split into test and train

def split_data(data, test_size=.2, random_state=None):
    '''
    Splits dataset into training and test set.

    Args:
        data (pandas.DataFrame): rows to split
        test_size (float): share of the rows placed in the test set;
            defaults to .2, the value that used to be hard-coded
        random_state (int or None): forwarded to train_test_split; pass an
            integer to get the same split on every run. The default (None)
            keeps the original unseeded behaviour.

    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size=test_size, random_state=random_state)
    # Turi Create models take SFrames, so both parts are converted before returning.
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [11]:
# Only the long-format purchase table is split. The two variants below are
# disabled (data_dummy is not defined in the cells shown here).
# train_data, test_data = split_data(data)
# train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

#### Define models using Turicreate library

In [13]:
# Notebook-level settings shared by the modeling cells: column names,
# the users to score, and display/recommendation sizes.
user_id = 'ReviewerID'
item_id = 'ItemID'
# NOTE(review): taken row-by-row from test_df, so a user that occurs in several
# test pairs is listed more than once - confirm whether de-duplication is wanted.
users_to_recommend = list(test_df['ReviewerID'])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [15]:
# Define turicreate model

def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    '''
    Fits the requested Turi Create recommender, prints a preview of its
    recommendations and returns the fitted model.

    Args:
        train_data (tc.SFrame): training interactions
        name (str): which recommender to build - 'popularity', 'cosine' or 'pearson'
        user_id (str): name of the user column
        item_id (str): name of the item column
        target (str): name of the column holding the interaction value
        users_to_recommend (list): users the preview is generated for
        n_rec (int): number of items to recommend per user
        n_display (int): number of recommendation rows to print

    Returns
        the fitted recommender

    Raises
        ValueError: if name is not one of the supported recommender names
    '''
    if name == 'popularity':
        fitted = tc.popularity_recommender.create(train_data,
                                                  user_id=user_id,
                                                  item_id=item_id,
                                                  target=target)
    elif name in ('cosine', 'pearson'):
        # Both item-similarity variants differ only in the similarity measure,
        # and the accepted names match the similarity_type values.
        fitted = tc.item_similarity_recommender.create(train_data,
                                                       user_id=user_id,
                                                       item_id=item_id,
                                                       target=target,
                                                       similarity_type=name)
    else:
        # Without this branch an unknown name would leave the model unbound and
        # fail later with an UnboundLocalError.
        raise ValueError(
            "Unknown model name %r; expected 'popularity', 'cosine' or 'pearson'" % (name,))

    recom = fitted.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return fitted

In [29]:
# First model is a baseline based on item popularity.
# model() prints a preview of the recommendations and returns the fitted recommender.

name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+------------+-------+------+
| ReviewerID |   ItemID   | score | rank |
+------------+------------+-------+------+
| U938994110 | I023839718 |  1.0  |  1   |
| U938994110 | I388274815 |  1.0  |  2   |
| U938994110 | I937416759 |  1.0  |  3   |
| U938994110 | I123527101 |  1.0  |  4   |
| U938994110 | I642025511 |  1.0  |  5   |
| U938994110 | I742902290 |  1.0  |  6   |
| U938994110 | I800835871 |  1.0  |  7   |
| U938994110 | I241601795 |  1.0  |  8   |
| U938994110 | I053068338 |  1.0  |  9   |
| U938994110 | I882612283 |  1.0  |  10  |
| U938994110 | I039860703 |  1.0  |  11  |
| U938994110 | I882303474 |  1.0  |  12  |
| U938994110 | I838606136 |  1.0  |  13  |
| U938994110 | I468654789 |  1.0  |  14  |
| U938994110 | I469015649 |  1.0  |  15  |
| U938994110 | I765041785 |  1.0  |  16  |
| U938994110 | I631190904 |  1.0  |  17  |
| U938994110 | I391372257 |  1.0  |  18  |
| U938994110 | I565589505 |  1.0  |  19  |
| U938994110 | I347182602 |  1.0  |  20  |
| U93899411

In [57]:
# Cosine similarity model
# NOTE: the two assignments below rebind the notebook-level n_rec / n_display,
# so every cell executed after this one also sees 100.
n_rec = 100 # number of items to recommend
n_display = 100 # to display the first few rows in an output dataset
name = 'cosine'
target = 'scaled_purchase_freq'
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+------------+----------------------+------+
| ReviewerID |   ItemID   |        score         | rank |
+------------+------------+----------------------+------+
| U938994110 | I523841570 | 0.13608276844024658  |  1   |
| U938994110 | I784962548 |  0.1111111044883728  |  2   |
| U938994110 | I087170658 | 0.08333333333333333  |  3   |
| U938994110 | I067562933 |  0.0785674254099528  |  4   |
| U938994110 | I431497617 |  0.0785674254099528  |  5   |
| U938994110 | I231543290 |  0.0785674254099528  |  6   |
| U938994110 | I711587757 |  0.0785674254099528  |  7   |
| U938994110 | I815941427 |  0.0785674254099528  |  8   |
| U938994110 | I919863519 | 0.06804138422012329  |  9   |
| U938994110 | I779531888 | 0.06299408276875813  |  10  |
| U938994110 | I622918777 | 0.060858070850372314 |  11  |
| U938994110 | I423313696 | 0.05555556217829386  |  12  |
| U938994110 | I710137751 | 0.05555556217829386  |  13  |
| U938994110 | I093496736 | 0.05555556217829386  |  14  |
| U938994110 |

In [31]:
# Pearson similarity model; uses whatever n_rec / n_display currently hold at notebook level.

name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+------------+-------+------+
| ReviewerID |   ItemID   | score | rank |
+------------+------------+-------+------+
| U938994110 | I644212601 |  0.0  |  1   |
| U938994110 | I131510648 |  0.0  |  2   |
| U938994110 | I701458428 |  0.0  |  3   |
| U938994110 | I618465544 |  0.0  |  4   |
| U938994110 | I280047268 |  0.0  |  5   |
| U938994110 | I191422181 |  0.0  |  6   |
| U938994110 | I874084019 |  0.0  |  7   |
| U938994110 | I936227431 |  0.0  |  8   |
| U938994110 | I696744431 |  0.0  |  9   |
| U938994110 | I188783781 |  0.0  |  10  |
| U938994110 | I732304083 |  0.0  |  11  |
| U938994110 | I496133188 |  0.0  |  12  |
| U938994110 | I889608598 |  0.0  |  13  |
| U938994110 | I156499362 |  0.0  |  14  |
| U938994110 | I479651031 |  0.0  |  15  |
| U938994110 | I774764129 |  0.0  |  16  |
| U938994110 | I150073598 |  0.0  |  17  |
| U938994110 | I974018949 |  0.0  |  18  |
| U938994110 | I197972217 |  0.0  |  19  |
| U938994110 | I121097836 |  0.0  |  20  |
| U93899411

The Pearson coefficient model doesn't seem to be working, but the cosine similarity model is perhaps promising.

### Model Evaluation

We will use our test data to evaluate RMSE, Recall, and Precision

In [32]:
# Pair every fitted recommender with the label used in the comparison report,
# then split the pairs into the two parallel lists used for the comparison.
labelled_models = [
    (pop_norm, 'Popularity Model on Scaled Purchase Counts'),
    (cos_norm, 'Cosine Similarity on Scaled Purchase Counts'),
    (pear_norm, 'Pearson Similarity on Scaled Purchase Counts'),
]
models_w_norm = [fitted for fitted, _ in labelled_models]
names_w_norm = [label for _, label in labelled_models]

In [33]:
# Evaluate the three recommenders on the held-out split; the printed report lists
# precision/recall by cutoff and RMSE for each model in turn.
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Scaled Purchase Counts



Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    | 0.0002406545804588474  | 0.00013035456441520945 |
|   2    | 0.00016043638697256537 | 0.0002105727579014921  |
|   3    | 0.00016043638697256537 | 0.00033090004813091616 |
|   4    | 0.00012032729022942399 |  0.000330900048130916  |
|   5    |  9.62618321835391e-05  | 0.0003309000481309164  |
|   6    | 8.021819348628259e-05  | 0.0003309000481309162  |
|   7    | 6.875845155967101e-05  | 0.0003309000481309162  |
|   8    | 8.021819348628272e-05  | 0.00045122733836034106 |
|   9    | 7.130506087669572e-05  | 0.0004512273383603407  |
|   10   | 8.021819348628279e-05  |  0.00054035866445621   |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+------+-------+
|


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0028076367720198917 |  0.001579449541907428 |
|   2    | 0.0021658912241296346 | 0.0023960622629000663 |
|   3    | 0.0019252366436707843 |  0.003261378887080795 |
|   4    | 0.0018249639018129305 |  0.004126398406841206 |
|   5    | 0.0017327129793037036 |  0.004937939130944096 |
|   6    | 0.0016578426653831783 |  0.005679957420692218 |
|   7    | 0.0015814443858724288 |  0.006231086120385007 |
|   8    |  0.001484036579496231 |  0.006692340732931139 |
|   9    | 0.0014171880849243254 |  0.007372285420576775 |
|   10   |  0.001419862024707201 |  0.008186500084462537 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9972773192274832

Per User RMSE (best)
+------------+---------------------+-------+


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |  4.01090967431413e-05  |  8.02181934862826e-05  |
|   3    | 2.673939782876088e-05  | 8.021819348628268e-05  |
|   4    | 4.0109096743141294e-05 | 0.00010027274185785332 |
|   5    | 4.813091609176967e-05  | 0.00018049093534413605 |
|   6    | 4.010909674314128e-05  | 0.00018049093534413627 |
|   7    | 5.729870963305911e-05  | 0.00028744852665917953 |
|   8    | 8.021819348628283e-05  | 0.0004345152147173647  |
|   9    | 9.804445870545652e-05  | 0.0005548425049467881  |
|   10   | 8.824001283491107e-05  | 0.0005548425049467881  |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+------+-------+
|

The cosine model is most promising, so this is the one we will output.

### Output recommendations

In [34]:
def create_output(model, users_to_recommend, n_rec, print_csv=True,
                  output_path='option1_recommendation.csv'):
    '''
    Builds one row per user holding that user's recommended items as a
    '|'-separated string, optionally saving the table as CSV.

    Relies on the notebook-level `user_id` and `item_id` column names.

    Args:
        model: fitted recommender exposing recommend(users=..., k=...) whose
            result offers to_dataframe()
        users_to_recommend (list): users to produce recommendations for
        n_rec (int): number of items requested per user
        print_csv (bool): when True, also write the table to output_path
        output_path (str): CSV destination used when print_csv is True;
            defaults to the file name that used to be hard-coded

    Returns
        df_output (pandas.DataFrame): indexed by the user column, sorted by
            user, with a single 'recommendedProducts' column
    '''
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    # One aggregation per user replaces transform + drop_duplicates + sort + set_index;
    # groupby sorts the user keys and keeps each user's rows in their original order.
    df_output = (
        df_rec.groupby(user_id)[item_id]
        .agg(lambda items: '|'.join(items.astype(str)))
        .to_frame('recommendedProducts')
    )
    if print_csv:
        df_output.to_csv(output_path)
        # Report the real destination; the file is not placed in an 'output' folder.
        print(f"Recommendations written to '{output_path}'")
    return df_output

In [56]:
# Ask the cosine model for 500 items per user and save the result as CSV.
# NOTE: this rebinds the notebook-level n_rec for any cell executed afterwards.
n_rec = 500
df_output = create_output(cos_norm, users_to_recommend, n_rec, print_csv=True)
# One row per distinct user is expected in the output table.
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(19871, 1)


Unnamed: 0_level_0,recommendedProducts
ReviewerID,Unnamed: 1_level_1
U000005569,I719317451|I012888581|I762528631|I583032838|I5...
U000089279,I253550853|I757860157|I514848204|I085366280|I5...
U000132800,I506779656|I095216173|I244225428|I239620091|I4...
U000198945,I993511079|I784478206|I165988717|I114789956|I6...
U000243198,I154965338|I727650189|I964796229|I053010930|I3...


### Recommendation Function

In [36]:
def customer_recomendation(customer_id):
    '''
    Looks up the recommendations stored for one customer in the notebook-level
    df_output table.

    Args:
        customer_id (str): index label to look up in df_output

    Returns
        the matching df_output row when the customer is present; otherwise
        prints 'Customer not found.' and hands back customer_id unchanged
    '''
    if customer_id in df_output.index:
        return df_output.loc[customer_id]
    print('Customer not found.')
    return customer_id

In [38]:
# Peek at the first ten users that recommendations are generated for.
users_to_recommend[:10]

['U938994110',
 'U181459539',
 'U941668816',
 'U768449391',
 'U640450168',
 'U087574132',
 'U885457860',
 'U319023404',
 'U535965656',
 'U883645154']

In [41]:
# Example lookup for one of the users listed above.
customer_recomendation('U941668816')

recommendedProducts    I570024894|I838334135|I894561336|I265097718|I1...
Name: U941668816, dtype: object

### Merge recommendations with test set

In [43]:
# Left-join the per-user recommendation strings onto the test pairs so every
# (ReviewerID, ItemID) row carries that reviewer's recommended products.
predict_merge = test_df.merge(df_output, how='left', on='ReviewerID')
predict_merge

Unnamed: 0,ReviewerID,ItemID,recommendedProducts
0,U938994110,I529819131,I523841570|I784962548|I087170658|I067562933|I8...
1,U181459539,I863471064,I027568869|I789773992|I083820442|I858440673|I4...
2,U941668816,I684585522,I570024894|I838334135|I894561336|I265097718|I1...
3,U768449391,I782253949,I508226886|I875371733|I906119867|I637892892|I0...
4,U640450168,I232683472,I151628573|I977179834|I698468913|I025792869|I3...
...,...,...,...
27995,U337041888,I763827121,I420094292|I373149902|I626722636|I716738158|I5...
27996,U457455307,I242828364,I375062920|I793323176|I863622263|I055598794|I2...
27997,U052546714,I111529174,I144498143|I235916809|I337738261|I775251811|I0...
27998,U566804667,I857242737,I461350380|I389764281|I003808545|I777457632|I6...
