# MODEL


Based on the nature of the data and what we are trying to accomplish, creating a recommendation engine is the most appropriate solution.

First step is to import libraries needed:

In [345]:
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from data_processing import portfolio_transform, transcript_transform, profile_transform
import timeit
import pickle
from random import randrange
import operator

After importing libraries, read in the files:

In [2]:
# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

# TRANSFORMATION FUNCTIONS FROM EDA

In [3]:
year_curr = datetime.today().year

In [4]:
# clean data
portfolio = portfolio_transform(portfolio)
transcript = transcript_transform(transcript, portfolio)
profile = profile_transform(profile)

Preprocess data so that we can identify successful offer distribution with a target variable

In [5]:
# make a target variable column to update as we go
# this new column will identify if offer is effective
# will be 1 if meets criteria of a successful offer
transcript['target_var'] = np.nan

In [6]:
# test on one person
# sort by time and offer id for a single person
test = transcript.sort_values(['person', 'time'])[transcript['person'] == '78afa995795e4d85b5d9ceeca43f5fef'].reset_index()

  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# get list of row values where the value is offer received
viewed_list = test[test['event'] == 'offer viewed'].index.to_list()
print(viewed_list)

# check other rows below this row
for index in viewed_list:
    # see how far down the dataframe to check for values
    # MAY NOT NEED THIS VARIABLE
    #length = len(test) - index
    
    # get list of index values to check
    row_check = list(range((index+1), len(test)))
    
    
    for row in row_check:
        if (test.loc[index, 'offer_id'] == test.loc[row,'offer_id']) & (test.loc[row,'event'] == 'offer completed'):
            test.loc[row, 'target_var'] = 1

[1, 6, 11, 17]


In [8]:
# find people to test this on. want people with a high number of viewed offers
transcript[transcript['event'] == 'offer viewed']['person'].value_counts().head()

5dcea5d448b34b1d99ee161cc76b51aa    6
fe1122a36f5e462f82d489c65609408b    6
e3ae6c6c555e404f9f4172b1efd3017f    6
2ada553b48184508bad9b4bca1ff13f5    6
4018381562d645bca291b2ea87413258    6
Name: person, dtype: int64

In [9]:
# try test on a person who completed an offer without viewing it. want to weed these out

person = '9cfa8a152539446b8384a215eb7db2fe'    
test = transcript.sort_values(['person', 'time'])[transcript['person'] == person].reset_index()

viewed_list = test[test['event'] == 'offer viewed'].index.to_list()
#print(viewed_list)


for index in viewed_list:
    row_check = list(range((index+1), len(test)))
    for row in row_check:
        if (test.loc[index, 'offer_id'] == test.loc[row,'offer_id']) & (test.loc[row,'event'] == 'offer completed'):
            test.loc[row, 'target_var'] = 1
            break
test

  after removing the cwd from sys.path.


Unnamed: 0,index,person,event,time,amount,reward_x,offer_id,reward_y,difficulty,duration,bogo,discount,informational,email,mobile,social,web,target_var
0,1208,9cfa8a152539446b8384a215eb7db2fe,offer received,0,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,96.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,
1,18292,9cfa8a152539446b8384a215eb7db2fe,offer viewed,12,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,96.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,
2,54379,9cfa8a152539446b8384a215eb7db2fe,offer received,168,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,96.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,
3,77425,9cfa8a152539446b8384a215eb7db2fe,offer viewed,192,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,96.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,
4,96494,9cfa8a152539446b8384a215eb7db2fe,transaction,258,24.24,,,,,,,,,,,,,
5,97808,9cfa8a152539446b8384a215eb7db2fe,transaction,264,19.74,,,,,,,,,,,,,
6,106011,9cfa8a152539446b8384a215eb7db2fe,transaction,306,11.95,,,,,,,,,,,,,
7,112032,9cfa8a152539446b8384a215eb7db2fe,offer received,336,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,240.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,
8,123903,9cfa8a152539446b8384a215eb7db2fe,offer viewed,336,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,240.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,
9,145552,9cfa8a152539446b8384a215eb7db2fe,transaction,390,26.37,,,,,,,,,,,,,


Looking at the slice of the dataframe below, we can see that only row 19 was marked as the target variable. This is because the person viewed the offer before completing the offer. 

Another scenario is played out in row 23. Although this row has a 'offer completed' value, it was not marked as effective in the target_Var column. This is because the person did not view the offer before completing it. An effective offer follows the flow of:
- 'offer received' ---> 'offer viewed' ---> 'transaction' (one or multiple) ---> 'offer completed'

Any other flow is not considered successful in this situation

In [10]:
# example
test.loc[16:26,['person', 'event', 'time', 'offer_id', 'target_var']]

Unnamed: 0,person,event,time,offer_id,target_var
16,9cfa8a152539446b8384a215eb7db2fe,offer received,504,2298d6c36e964ae4a3e7e9706d1fb8c2,
17,9cfa8a152539446b8384a215eb7db2fe,offer viewed,504,2298d6c36e964ae4a3e7e9706d1fb8c2,
18,9cfa8a152539446b8384a215eb7db2fe,transaction,510,,
19,9cfa8a152539446b8384a215eb7db2fe,offer completed,510,2298d6c36e964ae4a3e7e9706d1fb8c2,1.0
20,9cfa8a152539446b8384a215eb7db2fe,transaction,534,,
21,9cfa8a152539446b8384a215eb7db2fe,offer received,576,2906b810c7d4411798c6938adc9daaa5,
22,9cfa8a152539446b8384a215eb7db2fe,transaction,588,,
23,9cfa8a152539446b8384a215eb7db2fe,offer completed,588,2906b810c7d4411798c6938adc9daaa5,
24,9cfa8a152539446b8384a215eb7db2fe,offer viewed,594,2906b810c7d4411798c6938adc9daaa5,
25,9cfa8a152539446b8384a215eb7db2fe,transaction,642,,


Now we can do this for all rows in the dataset. There are 17,000 users so this can take awhile, so will time the execution to see how long this takes

In [11]:
# first get user list
user_list = transcript['person'].value_counts().index.to_list()

In [None]:
start = timeit.default_timer()

# create an empty dataframe to append to when running through all users' data to identify
# which offers are effective
new_transcript_big = pd.DataFrame()

# perform effectiveness logic
for user in user_list:
    test = transcript.sort_values(['person', 'time'])[transcript['person'] == user].reset_index()
    test['target_var'] = np.nan

    viewed_list = test[test['event'] == 'offer viewed'].index.to_list()


    for index in viewed_list:
        row_check = list(range((index+1), len(test)))
        for row in row_check:
            if (test.loc[index, 'offer_id'] == test.loc[row,'offer_id']) & (test.loc[row,'event'] == 'offer completed'):
                test.loc[row, 'target_var'] = 1
                break
    new_transcript_big = pd.concat([new_transcript_big, test], ignore_index=True)
    

new_transcript_big.drop(columns='index', inplace = True)
new_transcript_big['target_var'].fillna(0, inplace = True)

    
stop = timeit.default_timer()

print('Time: ', stop - start)
# took about 200  minutes (1200 seconds) on last attempt

Join with the Profile dataset and save to CSV

In [None]:
new_transcript_big = pd.concat([new_transcript_big, profile], axis = 1)

In [None]:
new_transcript_big.to_csv('transcript_large', index = False)

#### Now, try this without the transaction rows to see how long it takes

In [12]:
start = timeit.default_timer()

# create an empty dataframe to append to when running through all users' data to identify
# which offers are effective
new_transcript = pd.DataFrame()

# cut down on number of rows for processing
# can do this assuming that getting an event = offer complete means that the system
# automatically judged eligibility upon data collection
transcript_lite = transcript[transcript['event'] != 'transaction']

# perform effectiveness logic
for user in user_list:
    test = transcript_lite.sort_values(['person', 'time'])[transcript_lite['person'] == user].reset_index()
    test['target_var'] = np.nan

    viewed_list = test[test['event'] == 'offer viewed'].index.to_list()


    for index in viewed_list:
        row_check = list(range((index+1), len(test)))
        for row in row_check:
            if (test.loc[index, 'offer_id'] == test.loc[row,'offer_id']) & (test.loc[row,'event'] == 'offer completed'):
                test.loc[row, 'target_var'] = 1
                break
    new_transcript = pd.concat([new_transcript, test], ignore_index=True)
    

new_transcript.drop(columns='index', inplace = True)
new_transcript['target_var'].fillna(0, inplace = True)

stop = timeit.default_timer()

print('Time: ', stop - start)
# took about 114 (6840 seconds)minutes on last attempt

  


Time:  6451.9553717


Join with the Profile dataset and save to CSV

In [13]:
new_transcript = pd.concat([new_transcript, profile], axis = 1)

In [14]:
new_transcript.to_csv('new_transcript', index = False)

#### Test transformation:

In [15]:
# verify that new dataframe matches original dataframes length
if len(transcript_lite) == len(new_transcript):
    print('Pass')
else:
    print("Lengths aren't the same, check again")

Pass


In [16]:
# what are the proportion of positive and negative targets in entire dataset
print('Proportion of Positive Targets: ' + str(new_transcript['target_var'].value_counts().values[0] / len(new_transcript)))
print('Proportion of Negative Targets: ' + str((len(new_transcript) - new_transcript['target_var'].value_counts().values[0])/len(new_transcript)))

Proportion of Positive Targets: 0.8581581444197135
Proportion of Negative Targets: 0.14184185558028656


In [17]:
# proportions of 1's and 0's in target variable when event = offer completed?
# this represents the proportion of people who actually saw the offer and made transactions after seeing it
# this could indicate an effective offer
new_transcript[new_transcript['event'] == 'offer completed']
print('Proportion of Positive Targets: ' + str(new_transcript[new_transcript['event'] == 'offer completed']['target_var'].value_counts().values[0] \
                                               / len(new_transcript[new_transcript['event'] == 'offer completed'])))
print('Proportion of Negative Targets: ' + str((len(new_transcript[new_transcript['event'] == 'offer completed']) - \
                                                new_transcript[new_transcript['event'] == 'offer completed']\
                                                ['target_var'].value_counts().values[0])/len(new_transcript[new_transcript['event'] == 'offer completed'])))



Proportion of Positive Targets: 0.7078829030048542
Proportion of Negative Targets: 0.29211709699514576


#### This demonstrates that there is an imbalanced class distribtution for the target variable. This may need to be taken into consideration prior to building the model

# BUILD RECOMMENDATION ENGINE

1) Create a user-item matrix

2) Choose number of latent features to use

3) Split into training and test sets

4) Train Model

5) Predict and Assess

6) Try to recommenda coupon for a particular user

In [18]:
# if user wants to just import data instead of running steps above, uncomment below to import data
new_transcript = pd.read_csv('new_transcript')

  interactivity=interactivity, compiler=compiler, result=result)


In [19]:
# create user item matrix
def create_user_item_matrix(df):
    '''
    INPUT:
    df - customer transaction log
    
    OUTPUT:
    user_item - user item matrix 
    
    Description:
    Return a matrix with user ids as rows and article ids on the columns with 1 values where a user interacted with 
    an article and a 0 otherwise
    '''
    # user item matrix based on criteria outlined in the markdown cell above
    # code from mentor: https://knowledge.udacity.com/questions/140813, made own notes to
    # understand each line
    
    # get count of interactions for each user and article
    df_new = df.groupby(['offer_id', 'person']).count().reset_index()
    # use pivot to create a dataframe with counts for each corresponding pair
    df_new=df_new.pivot_table(index='person',columns='offer_id',values='target_var')
    # replace NaNs with 0's
    df_new = df_new.replace(np.nan, 0)
    # the counts include values greater than 1. change values greater than 1 to 1
    #user_item=df_new.applymap(lambda x: 1 if x > 0 else x)

    df_new_np = np.matrix(df_new)
    
    return df_new, df_new_np

In [20]:
def FunkSVD(score_mat, latent_features = 4, learning_rate = .0001, iters = 100):
    '''
    INPUT:
    
    score_mat - (numpy array) matrix with users as rows, coupon offers as columns, and interactions as values
    latent_features - (int) the number of latent features used
    learning_rate - (float) the learning rate 
    iters - (int) the number of iterations
    
    OUTPUT:
    user_mat - (numpy array) a user by latent feature matrix
    movie_mat - (numpy array) a latent feature by portfolio matrix
    
    '''
    # set up values to be used throughout function
    n_users = score_mat.shape[0]
    n_offers = score_mat.shape[1]
    # can do n_users * n_offers, but issues if have blanks
    num_ratings = np.count_nonzero(~np.isnan(score_mat))
    
    #initialize user and portfolio matricies with random variables
    # user matrix is filled with random values of shape (user x latent_features)
    user_mat = np.random.rand(n_users, latent_features)
    portfolio_mat = np.random.rand(latent_features, n_users)
    
    # initialize sse at 0 to keep track of errors
    sse_accum = 0
    
    # keep track of the iteration number and mean squared error
    print('Optimization Statistics')
    print('Iterations | Mean Squared Error')
        
    # compute error for each iteration
    for iteration in range(iters):
        old_sse = sse_accum
        sse_accum = 0
        
        # for each user-movie pair, we will do the following:
        # if a rating score exists, then compute error
        for i in range(n_users):
            for j in range(n_offers):
                # check if score is greater than 0, which indicates user liked the offer
                if score_mat[i,j] > 0:
                    
                    # compute error
                    diff = score_mat[i, j] - np.dot(user_mat[i, :], portfolio_mat[:, j])
                    
                    # Keep track of the sum of squared errors for the matrix
                    sse_accum += diff**2
                    
                    # update the values in each matrix in the direction of the gradient
                    for k in range(latent_features):
                        user_mat[i, k] += learning_rate * (2*diff*portfolio_mat[k, j])
                        portfolio_mat[k, j] += learning_rate * (2*diff*user_mat[i, k])
                    
        # print results for iteration
        print("%d \t\t %f" % (iteration+1, sse_accum / num_ratings))
    
    return user_mat, portfolio_mat

In [33]:
# fit FunkSVD using training data

# first create user_item matrix
user_item, user_item_np = create_user_item_matrix(new_transcript)

# next, split data into training and test sets
train_user_item, test_user_item = train_test_split(user_item)
train_user_item_np = np.matrix(train_user_item)
test_user_item_np = np.matrix(test_user_item)

# then run FunkSVD (default latent features = 4)
user_mat, portfolio_mat = FunkSVD(train_user_item_np, learning_rate = .005, iters = 100)

# use result of FunkSVD function to predict any user item combo
# do this by dot product of a row in user matrix by a column in portfolio matrix

Optimization Statistics
Iterations | Mean Squared Error
1 		 0.755833
2 		 0.613713
3 		 0.529479
4 		 0.478152
5 		 0.446982
6 		 0.427764
7 		 0.415474
8 		 0.407073
9 		 0.400681
10 		 0.395074
11 		 0.389374
12 		 0.382881
13 		 0.374987
14 		 0.365155
15 		 0.352964
16 		 0.338200
17 		 0.320953
18 		 0.301685
19 		 0.281178
20 		 0.260373
21 		 0.240149
22 		 0.221152
23 		 0.203740
24 		 0.188017
25 		 0.173918
26 		 0.161302
27 		 0.150001
28 		 0.139861
29 		 0.130752
30 		 0.122565
31 		 0.115212
32 		 0.108616
33 		 0.102709
34 		 0.097424
35 		 0.092695
36 		 0.088462
37 		 0.084662
38 		 0.081242
39 		 0.078150
40 		 0.075341
41 		 0.072777
42 		 0.070425
43 		 0.068257
44 		 0.066249
45 		 0.064383
46 		 0.062642
47 		 0.061015
48 		 0.059490
49 		 0.058058
50 		 0.056712
51 		 0.055447
52 		 0.054255
53 		 0.053134
54 		 0.052077
55 		 0.051083
56 		 0.050146
57 		 0.049264
58 		 0.048434
59 		 0.047652
60 		 0.046917
61 		 0.046225
62 		 0.045574
63 		 0.044962
64 		 0.

In [34]:
# verify NaN values don't break the Funk SVD function
verify = user_item_np
test[0, 0] = np.nan
user_mat_verify, portfolio_mat_test_verify = FunkSVD(verify, learning_rate = .005, iters = 5)

Optimization Statistics
Iterations | Mean Squared Error
1 		 0.750968
2 		 0.609596
3 		 0.525292
4 		 0.474081
5 		 0.443145


In [266]:
# create function to predict any user-item combo
def predict_score(user_mat, portfolio_mat, user_id, offer_id):
    '''
    INPUT:
    user_mat - matrix with users as rows and latent factor as column
    portfolio_mat - matrix with latent factor as rows and offer as column
    user_id - specific user_id from the fact dataframe
    offer_id - offer_id according to offer dataframe
        
    OUTPUT:
    pred - predicted interaction with offer
    
    '''
    
    user_ids_series = np.array(user_item.index)
    portfolio_ids_series = np.array(user_item.columns)
    
    #index into the user and portfolio matricies to retrieve user ID and offer ID
    portfolio_col = np.where(portfolio_ids_series == offer_id)[0][0]
    user_row = np.where(user_ids_series == user_id)[0][0]

    # Take dot product of that row and column in U and V to make prediction
    pred = np.dot(user_mat[user_row, :], portfolio_mat[:, portfolio_col])
    
    return pred

In [340]:
# Test function with user offer pair to compare pred and actual

# get a random index from the list to select customer and coupon
user_index = randrange(0, len(test_user_item.index)-1)
user = user_item.iloc[user_index-1:user_index,].index[0]
offer_index = randrange(0, len(test_user_item.columns)-1)
offer = user_item.iloc[:,[offer_index]].columns[0]

# print results
print('User: ', user)
print('Offer: ',offer)
print('Actual Value: ', user_item.loc[user,offer])
print('Predicted Value: ')
predict_score(user_mat, portfolio_mat, user, offer)

User:  3eceb9642019459cbe7dbe9b169cda9d
Offer:  f19421c1d4aa40978ebb69ca19b0e20d
Actual Value:  0.0
Predicted Value: 


1.4571699204914932

In [343]:
all_offers

['0b1e1539f2cc45b7b9fa7c272da2e1d7',
 '2298d6c36e964ae4a3e7e9706d1fb8c2',
 '2906b810c7d4411798c6938adc9daaa5',
 '3f207df678b143eea3cee63160fa8bed',
 '4d5c57ea9a6940dd891ad53e9dbe8da0',
 '5a8bc65990b245e5a138643cd4eb9837',
 '9b98b8c7a33c4b65b9aebfe6a799e6d9',
 'ae264e3637204a6fb9bb56bc8210ddfd',
 'f19421c1d4aa40978ebb69ca19b0e20d',
 'fafdcd668e3743c1bb461111dcafc2a4']

In [437]:
# select highest score (max function)

# store all offers as a list go through all offers
all_offers = user_item.columns.to_list()

# choose a user at random
user_index = randrange(0, len(test_user_item.index)-1)
user_pred = user_item.iloc[user_index-1:user_index,].index[0]

# predict score for all offers for the user
# use dictionary to store what offer index to use when selecting largest predicted score
offer_dict = {}
for index, offer in enumerate(all_offers):
    score = predict_score(user_mat, portfolio_mat, user_pred, offer) 
    offer_dict[index] = score

# index of largest/best predicted score
col_index = max(offer_dict.items(), key=operator.itemgetter(1))[0]
best_score = max(offer_dict.items(), key=operator.itemgetter(1))[1]


#print(sorted(offer_dict.values(), reverse = True))


target_user = user_item.index.to_list()[user_index]
target_offer = user_item.columns.to_list()[col_index]

# now see predicted vs actual
#print('Actual value in user item matrix = ', user_item.loc[target_user, target_offer])
#print('Predicted value in user item matrix = ', best_score)
print('The best offer for user' , target_user, 'is', target_offer)

# can us the "target_offer" variable in other apps

The best offer for user 3e1bceba9b6e4dd2964bc101a58baec9 is ae264e3637204a6fb9bb56bc8210ddfd
