# MODEL


Based on the nature of the data and what we are trying to accomplish, creating a recommendation engine is the most appropriate solution.

First step is to import libraries needed:

In [2]:
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from data_processing import portfolio_transform, transcript_transform, profile_transform
import timeit
import pickle


  import pandas.util.testing as tm


After importing libraries, read in the files:

In [2]:
# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

# TRANSFORMATION FUNCTIONS FROM EDA

In [3]:
year_curr = datetime.today().year

In [4]:
# clean data
portfolio = portfolio_transform(portfolio)
transcript = transcript_transform(transcript, portfolio)
profile = profile_transform(profile)

Preprocess data so that we can identify successful offer distribution with a target variable

In [5]:
# make a target variable column to update as we go
# this new column will identify if offer is effective
# will be 1 if meets criteria of a successful offer
transcript['target_var'] = np.nan

In [6]:
# test on one person
# sort by time and offer id for a single person
test = transcript.sort_values(['person', 'time'])[transcript['person'] == '78afa995795e4d85b5d9ceeca43f5fef'].reset_index()

  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# get list of row values where the value is offer received
viewed_list = test[test['event'] == 'offer viewed'].index.to_list()
print(viewed_list)

# check other rows below this row
for index in viewed_list:
    # see how far down the dataframe to check for values
    # MAY NOT NEED THIS VARIABLE
    #length = len(test) - index
    
    # get list of index values to check
    row_check = list(range((index+1), len(test)))
    
    
    for row in row_check:
        if (test.loc[index, 'offer_id'] == test.loc[row,'offer_id']) & (test.loc[row,'event'] == 'offer completed'):
            test.loc[row, 'target_var'] = 1

[1, 6, 11, 17]


In [8]:
# find people to test this on. want people with a high number of viewed offers
transcript[transcript['event'] == 'offer viewed']['person'].value_counts().head()

48eb3f18d6b5411d90b7dee36741979e    6
30157b7fc37f43f2ae7ccd9edfe0b672    6
6fd0fff2a0df4c8da26b5658d220a96a    6
24e1795cb9894e078644210adf514d18    6
cbf24f9d89e546c7872f28326a7821d3    6
Name: person, dtype: int64

In [9]:
# try test on a person who completed an offer without viewing it. want to weed these out

person = '9cfa8a152539446b8384a215eb7db2fe'    
test = transcript.sort_values(['person', 'time'])[transcript['person'] == person].reset_index()

viewed_list = test[test['event'] == 'offer viewed'].index.to_list()
#print(viewed_list)


for index in viewed_list:
    row_check = list(range((index+1), len(test)))
    for row in row_check:
        if (test.loc[index, 'offer_id'] == test.loc[row,'offer_id']) & (test.loc[row,'event'] == 'offer completed'):
            test.loc[row, 'target_var'] = 1
            break
test

  after removing the cwd from sys.path.


Unnamed: 0,index,person,event,time,amount,reward_x,offer_id,reward_y,difficulty,duration,bogo,discount,informational,email,mobile,social,web,target_var
0,1208,9cfa8a152539446b8384a215eb7db2fe,offer received,0,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,96.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,
1,18292,9cfa8a152539446b8384a215eb7db2fe,offer viewed,12,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,96.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,
2,54379,9cfa8a152539446b8384a215eb7db2fe,offer received,168,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,96.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,
3,77425,9cfa8a152539446b8384a215eb7db2fe,offer viewed,192,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,96.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,
4,96494,9cfa8a152539446b8384a215eb7db2fe,transaction,258,24.24,,,,,,,,,,,,,
5,97808,9cfa8a152539446b8384a215eb7db2fe,transaction,264,19.74,,,,,,,,,,,,,
6,106011,9cfa8a152539446b8384a215eb7db2fe,transaction,306,11.95,,,,,,,,,,,,,
7,112032,9cfa8a152539446b8384a215eb7db2fe,offer received,336,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,240.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,
8,123903,9cfa8a152539446b8384a215eb7db2fe,offer viewed,336,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,240.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,
9,145552,9cfa8a152539446b8384a215eb7db2fe,transaction,390,26.37,,,,,,,,,,,,,


Looking at the slice of the dataframe below, we can see that only row 19 was marked as the target variable. This is because the person viewed the offer before completing the offer. 

Another scenario is played out in row 23. Although this row has a 'offer completed' value, it was not marked as effective in the target_Var column. This is because the person did not view the offer before completing it. An effective offer follows the flow of:
- 'offer received' ---> 'offer viewed' ---> 'transaction' (one or multiple) ---> 'offer completed'

Any other flow is not considered successful in this situation

In [10]:
# example
test.loc[16:26,['person', 'event', 'time', 'offer_id', 'target_var']]

Unnamed: 0,person,event,time,offer_id,target_var
16,9cfa8a152539446b8384a215eb7db2fe,offer received,504,2298d6c36e964ae4a3e7e9706d1fb8c2,
17,9cfa8a152539446b8384a215eb7db2fe,offer viewed,504,2298d6c36e964ae4a3e7e9706d1fb8c2,
18,9cfa8a152539446b8384a215eb7db2fe,transaction,510,,
19,9cfa8a152539446b8384a215eb7db2fe,offer completed,510,2298d6c36e964ae4a3e7e9706d1fb8c2,1.0
20,9cfa8a152539446b8384a215eb7db2fe,transaction,534,,
21,9cfa8a152539446b8384a215eb7db2fe,offer received,576,2906b810c7d4411798c6938adc9daaa5,
22,9cfa8a152539446b8384a215eb7db2fe,transaction,588,,
23,9cfa8a152539446b8384a215eb7db2fe,offer completed,588,2906b810c7d4411798c6938adc9daaa5,
24,9cfa8a152539446b8384a215eb7db2fe,offer viewed,594,2906b810c7d4411798c6938adc9daaa5,
25,9cfa8a152539446b8384a215eb7db2fe,transaction,642,,


Now we can do this for all rows in the dataset. There are 17,000 users so this can take awhile, so will time the execution to see how long this takes

In [11]:
# first get user list
user_list = transcript['person'].value_counts().index.to_list()

In [12]:
start = timeit.default_timer()

# create an empty dataframe to append to when running through all users' data to identify
# which offers are effective
new_transcript_big = pd.DataFrame()

# perform effectiveness logic
for user in user_list:
    test = transcript.sort_values(['person', 'time'])[transcript['person'] == user].reset_index()
    test['target_var'] = np.nan

    viewed_list = test[test['event'] == 'offer viewed'].index.to_list()


    for index in viewed_list:
        row_check = list(range((index+1), len(test)))
        for row in row_check:
            if (test.loc[index, 'offer_id'] == test.loc[row,'offer_id']) & (test.loc[row,'event'] == 'offer completed'):
                test.loc[row, 'target_var'] = 1
                break
    new_transcript_big = pd.concat([new_transcript_big, test], ignore_index=True)
    

new_transcript_big.drop(columns='index', inplace = True)
new_transcript_big['target_var'].fillna(0, inplace = True)

    
stop = timeit.default_timer()

print('Time: ', stop - start)
# took about 200  minutes (1200 seconds) on last attempt

  if __name__ == '__main__':


Time:  12087.382896199999


Join with the Profile dataset and save to CSV

In [13]:
new_transcript_big = pd.concat([new_transcript_big, profile], axis = 1)

In [14]:
new_transcript_big.to_csv('transcript_large', index = False)

#### Now, try this without the transaction rows to see how long it takes

In [15]:
start = timeit.default_timer()

# create an empty dataframe to append to when running through all users' data to identify
# which offers are effective
new_transcript = pd.DataFrame()

# cut down on number of rows for processing
# can do this assuming that getting an event = offer complete means that the system
# automatically judged eligibility upon data collection
transcript_lite = transcript[transcript['event'] != 'transaction']

# perform effectiveness logic
for user in user_list:
    test = transcript_lite.sort_values(['person', 'time'])[transcript_lite['person'] == user].reset_index()
    test['target_var'] = np.nan

    viewed_list = test[test['event'] == 'offer viewed'].index.to_list()


    for index in viewed_list:
        row_check = list(range((index+1), len(test)))
        for row in row_check:
            if (test.loc[index, 'offer_id'] == test.loc[row,'offer_id']) & (test.loc[row,'event'] == 'offer completed'):
                test.loc[row, 'target_var'] = 1
                break
    new_transcript = pd.concat([new_transcript, test], ignore_index=True)
    

new_transcript.drop(columns='index', inplace = True)
new_transcript['target_var'].fillna(0, inplace = True)

stop = timeit.default_timer()

print('Time: ', stop - start)
# took about 114 (6840 seconds)minutes on last attempt

  


Time:  5940.3534783


Join with the Profile dataset and save to CSV

In [16]:
new_transcript = pd.concat([new_transcript, profile], axis = 1)

In [17]:
new_transcript.to_csv('new_transcript', index = False)

#### Test transformation:

In [18]:
# verify that new dataframe matches original dataframes length
if len(transcript_lite) == len(new_transcript):
    print('Pass')
else:
    print("Lengths aren't the same, check again")

Pass


In [19]:
# what are the proportion of positive and negative targets in entire dataset
print('Proportion of Positive Targets: ' + str(new_transcript['target_var'].value_counts().values[0] / len(new_transcript)))
print('Proportion of Negative Targets: ' + str((len(new_transcript) - new_transcript['target_var'].value_counts().values[0])/len(new_transcript)))

Proportion of Positive Targets: 0.8581581444197135
Proportion of Negative Targets: 0.14184185558028656


In [20]:
# proportions of 1's and 0's in target variable when event = offer completed?
# this represents the proportion of people who actually saw the offer and made transactions after seeing it
# this could indicate an effective offer
new_transcript[new_transcript['event'] == 'offer completed']
print('Proportion of Positive Targets: ' + str(new_transcript[new_transcript['event'] == 'offer completed']['target_var'].value_counts().values[0] \
                                               / len(new_transcript[new_transcript['event'] == 'offer completed'])))
print('Proportion of Negative Targets: ' + str((len(new_transcript[new_transcript['event'] == 'offer completed']) - \
                                                new_transcript[new_transcript['event'] == 'offer completed']\
                                                ['target_var'].value_counts().values[0])/len(new_transcript[new_transcript['event'] == 'offer completed'])))



Proportion of Positive Targets: 0.7078829030048542
Proportion of Negative Targets: 0.29211709699514576


#### This demonstrates that there is an imbalanced class distribtution for the target variable. This may need to be taken into consideration prior to building the model

# BUILD RECOMMENDATION ENGINE

1) Create a user-item matrix

2) Choose number of latent features to use

3) Split into training and test sets

4) Train Model

5) Predict and Assess

6) Try to recommenda coupon for a particular user

In [22]:
# if user wants to just import data instead of running steps above, uncomment below to import data
#new_transcript = pd.read_csv('new_transcript')

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,person,event,time,amount,reward_x,offer_id,reward_y,difficulty,duration,bogo,...,web,target_var,gender,age,id,became_member_on,income,Age_Gen,new_became_member_on,member_len
0,94de646f7b6041228ca7dec82adb97d2,offer received,0,,,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,120.0,1.0,...,1.0,0.0,,,68be06ca386d4c31939f3a4f0e3dd783,20170212.0,,,2017-02-12,1422.0
1,94de646f7b6041228ca7dec82adb97d2,offer viewed,6,,,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,120.0,1.0,...,1.0,0.0,F,55.0,0610b486422d4921ae7d2bf64640c50b,20170715.0,112000.0,Generation X,2017-07-15,1269.0
2,94de646f7b6041228ca7dec82adb97d2,offer completed,30,,5.0,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,120.0,1.0,...,1.0,1.0,,,38fe809add3b4fcf9315a9694bb96ff5,20180712.0,,,2018-07-12,907.0
3,94de646f7b6041228ca7dec82adb97d2,offer received,168,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,168.0,1.0,...,1.0,0.0,F,75.0,78afa995795e4d85b5d9ceeca43f5fef,20170509.0,100000.0,Boomers,2017-05-09,1336.0
4,94de646f7b6041228ca7dec82adb97d2,offer viewed,186,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,168.0,1.0,...,1.0,0.0,,,a03223e636434f42ac4c3df47e8bac43,20170804.0,,,2017-08-04,1249.0


In [26]:
def FunkSVD(score_mat, latent_features = 4, learning_rate = .0001, iters = 100):
    '''
    INPUT:
    
    score_mat - (numpy array) matrix with users as rows, coupon offers as columns, and interactions as values
    latent_features - (int) the number of latent features used
    learning_rate - (float) the learning rate 
    iters - (int) the number of iterations
    
    OUTPUT:
    user_mat - (numpy array) a user by latent feature matrix
    movie_mat - (numpy array) a latent feature by portfolio matrix
    
    '''
    # set up values to be used throughout function
    n_users = score_mat.shape[0]
    n_offers = score_mat.shape[1]
    # can do n_users * n_offers, but issues if have blanks
    num_ratings = np.count_nonzero(~np.isnan(score_mat))
    
    #initialize user and portfolio matricies with random variables
    # user matrix is filled with random values of shape (user x latent_features)
    user_mat = np.random.rand(n_users, latent_features)
    portfolio_mat = np.random.rand(latent_features, n_users)
    
    # initialize sse at 0 to keep track of errors
    sse_accum = 0
    
    # keep track of the iteration number and mean squared error
    print('Optimization Statistics')
    print('Iterations | Mean Squared Error')
        
    # compute error for each iteration
    for iteration in range(iters):
        old_sse = sse_accum
        sse_accum = 0
        
        # for each user-movie pair, we will do the following:
        # if a rating score exists, then compute error
        for i in range(n_users):
            for j in range(n_offers):
                # check if score is greater than 0, which indicates user liked the offer
                if score_mat[i,j] > 0:
                    
                    # compute error
                    diff = score_mat[i, j] - np.dot(user_mat[i, :], portfolio_mat[:, j])
                    
                    # Keep track of the sum of squared errors for the matrix
                    sse_accum += diff**2
                    
                    # update the values in each matrix in the direction of the gradient
                    for k in range(latent_features):
                        user_mat[i, k] += learning_rate * (2*diff*portfolio_mat[k, j])
                        portfolio_mat[k, j] += learning_rate * (2*diff*user_mat[i, k])
                    
        # print results for iteration
        print("%d \t\t %f" % (iteration+1, sse_accum / num_ratings))
    
    return user_mat, portfolio_mat

In [45]:
# create user item matrix
def create_user_item_matrix(df):
    '''
    INPUT:
    df - pandas dataframe with article_id, title, user_id columns
    
    OUTPUT:
    user_item - user item matrix 
    
    Description:
    Return a matrix with user ids as rows and article ids on the columns with 1 values where a user interacted with 
    an article and a 0 otherwise
    '''
    # user item matrix based on criteria outlined in the markdown cell above
    # code from mentor: https://knowledge.udacity.com/questions/140813, made own notes to
    # understand each line
    
    # get count of interactions for each user and article
    df_new = df.groupby(['offer_id', 'person']).count().reset_index()
    # use pivot to create a dataframe with counts for each corresponding pair
    df_new=df_new.pivot_table(index='person',columns='offer_id',values='target_var')
    # replace NaNs with 0's
    df_new = df_new.replace(np.nan, 0)
    # the counts include values greater than 1. change values greater than 1 to 1
    #user_item=df_new.applymap(lambda x: 1 if x > 0 else x)
    #return user_item # return the user_item matrix
    
    df_new_np = np.matrix(df_new)
    
    return df_new, df_new_np

In [65]:
# fit FunkSVD using training data

# first create user_item matrix for training data and test data
user_item, user_item_np = create_user_item_matrix(new_transcript)

# next, split data into training and test sets
train_user_item, test_user_item = train_test_split(user_item)
train_user_item_np = np.matrix(train_user_item)
test_user_item_np = np.matrix(test_user_item)

# then run FunkSVD (default latent features = 4)
user_mat_train, portfolio_mat_train = FunkSVD(train_user_item_np, learning_rate = .005, iters = 100)

# use result of FunkSVD function to prediction of any user item combo
# do this by dot product of a row in user matrix by a column in portfolio matrix

Optimization Statistics
Iterations | Mean Squared Error
1 		 0.740986
2 		 0.601980
3 		 0.520633
4 		 0.471175
5 		 0.441182
6 		 0.422744
7 		 0.411059
8 		 0.403240
9 		 0.397530
10 		 0.392811
11 		 0.388317
12 		 0.383462
13 		 0.377751
14 		 0.370735
15 		 0.362010
16 		 0.351257
17 		 0.338298
18 		 0.323164
19 		 0.306135
20 		 0.287716
21 		 0.268555
22 		 0.249321
23 		 0.230595
24 		 0.212805
25 		 0.196215
26 		 0.180948
27 		 0.167023
28 		 0.154395
29 		 0.142981
30 		 0.132685
31 		 0.123403
32 		 0.115034
33 		 0.107488
34 		 0.100680
35 		 0.094538
36 		 0.088996
37 		 0.083998
38 		 0.079492
39 		 0.075434
40 		 0.071781
41 		 0.068494
42 		 0.065539
43 		 0.062882
44 		 0.060492
45 		 0.058341
46 		 0.056402
47 		 0.054652
48 		 0.053070
49 		 0.051637
50 		 0.050336
51 		 0.049152
52 		 0.048072
53 		 0.047085
54 		 0.046181
55 		 0.045351
56 		 0.044587
57 		 0.043883
58 		 0.043232
59 		 0.042630
60 		 0.042072
61 		 0.041554
62 		 0.041073
63 		 0.040624
64 		 0.

In [66]:
# verify NaN values don't break the Funk SVD function
test = user_item_np
test[0, 0] = np.nan
user_mat_test, portfolio_mat_test = FunkSVD(test, learning_rate = .005, iters = 5)

Optimization Statistics
Iterations | Mean Squared Error
1 		 0.746768
2 		 0.605861
3 		 0.522418
4 		 0.472046
5 		 0.441806


In [187]:
# TODO:  
# create predict function with:
# input = user and portfolio item
# output = recommended coupon offer

''' first, index into the user and portfolio matricies to retrieve user ID and offer ID
# do this by using np.where

# second, 
'''
def predict_score(user_mat, portfolio_mat, user_id, offer_id):
    '''
    INPUT:
    user_mat - matrix with users as rows and latent factor as column
    portfolio_mat - matrix with latent factor as rows and offer as column
    user_id - specific user_id from the fact dataframe
    offer_id - offer_id according to offer dataframe
        
    OUTPUT:
    pred - predicted interaction with offer
    
    '''
    
    user_ids_series = np.array(train_user_item.index)
    portfolio_ids_series = np.array(train_user_item.columns)
    
    user_row = np.where(user_ids_series == user_item.index[user_id])[0][0]
    portfolio_col = np.where(portfolio_ids_series == user_item.columns[offer_id])[0][0]
 
    # Take dot product of that row and column in U and V to make prediction
    pred = np.dot(user_mat[user_row, :], portfolio_mat[:, portfolio_col])
    
    return pred


In [140]:
offer

'0b1e1539f2cc45b7b9fa7c272da2e1d7'

In [188]:
# Test function with user offer pair

# get a random index from the list
from random import randrange
user = randrange(0, len(user_item.index)-1)
offer = randrange(0, len(user_item.columns)-1)
print(user)
print(offer)
predict_score(user_mat, portfolio_mat, user, offer)

14574
5


2.5161178818507195

In [218]:
# TODO

# select highest score (max function)

# store all offers as a list go through all offers
all_offers = user_item.columns.to_list()

# choose a user at random
user_pred = randrange(0, len(user_item.index)-1)

# predict score for all offers for the user
# use dictionary to store what offer index to use when selecting largest predicted score
offer_dict = {}
for index, offer in enumerate(all_offers):
    score = predict_score(user_mat, portfolio_mat, user_pred, index) 
    offer_dict[index] = score

# index of largest/best predicted score
col_index = max(offer_dict.items(), key=operator.itemgetter(1))[0]
best_score = max(offer_dict.items(), key=operator.itemgetter(1))[1]

best_score

print(sorted(list(offer_dict.values()), reverse = True))


target_user = user_item.index.to_list()[user_pred]
target_column = user_item.columns.to_list()[col_index]



[3.759261702069482, 2.969991112702784, 2.2210988588142877, 1.9073700170114216, 1.661482507349894, 1.4634161447082112, 1.0475117938079155, -0.12228630608502189, -0.2197547483237723, -0.29658621583353967]
Actual value in user item matrix =  3.0
Predicted value in user item matrix =  3.759261702069482


In [221]:
# now see predicted vs actual
print('Actual value in user item matrix = ', user_item.loc[target_user, target_column])
print('Predicted value in user item matrix = ', best_score)
print('The best offer for user ' , target_user, ' is ', target_column)

Actual value in user item matrix =  3.0
Predicted value in user item matrix =  3.759261702069482
The best offer for user  b766eeeed5b64f778917e5872e8422ce  is  2298d6c36e964ae4a3e7e9706d1fb8c2


In [87]:
# TODO

In [232]:
new_transcript

Unnamed: 0,person,event,time,amount,reward_x,offer_id,reward_y,difficulty,duration,bogo,...,web,target_var,gender,age,id,became_member_on,income,Age_Gen,new_became_member_on,member_len
0,94de646f7b6041228ca7dec82adb97d2,offer received,0,,,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,120.0,1.0,...,1.0,0.0,,,68be06ca386d4c31939f3a4f0e3dd783,20170212.0,,,2017-02-12,1422.0
1,94de646f7b6041228ca7dec82adb97d2,offer viewed,6,,,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,120.0,1.0,...,1.0,0.0,F,55.0,0610b486422d4921ae7d2bf64640c50b,20170715.0,112000.0,Generation X,2017-07-15,1269.0
2,94de646f7b6041228ca7dec82adb97d2,offer completed,30,,5.0,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,120.0,1.0,...,1.0,1.0,,,38fe809add3b4fcf9315a9694bb96ff5,20180712.0,,,2018-07-12,907.0
3,94de646f7b6041228ca7dec82adb97d2,offer received,168,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,168.0,1.0,...,1.0,0.0,F,75.0,78afa995795e4d85b5d9ceeca43f5fef,20170509.0,100000.0,Boomers,2017-05-09,1336.0
4,94de646f7b6041228ca7dec82adb97d2,offer viewed,186,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,168.0,1.0,...,1.0,0.0,,,a03223e636434f42ac4c3df47e8bac43,20170804.0,,,2017-08-04,1249.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167576,3045af4e98794a04a5542d3eac939b1f,offer viewed,576,,,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,120.0,1.0,...,1.0,0.0,,,,,,,,
167577,912b9f623b9e4b4eb99b6dc919f09a93,offer received,576,,,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,120.0,1.0,...,1.0,0.0,,,,,,,,
167578,912b9f623b9e4b4eb99b6dc919f09a93,offer viewed,594,,,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,120.0,1.0,...,1.0,0.0,,,,,,,,
167579,7ecfc592171f4844bdc05bdbb48d3847,offer received,336,,,5a8bc65990b245e5a138643cd4eb9837,0.0,0.0,72.0,0.0,...,0.0,0.0,,,,,,,,


## TESTING ABOVE
#### USER ITEM MATRIX

In [27]:
# create user item matrix from IBM Exercise
def create_user_item_matrix(df):
    '''
    INPUT:
    df - pandas dataframe with article_id, title, user_id columns
    
    OUTPUT:
    user_item - user item matrix 
    
    Description:
    Return a matrix with user ids as rows and article ids on the columns with 1 values where a user interacted with 
    an article and a 0 otherwise
    '''
    # user item matrix based on criteria outlined in the markdown cell above
    # code from mentor: https://knowledge.udacity.com/questions/140813, made own notes to
    # understand each line
    
    # get count of interactions for each user and article
    df_new = df.groupby(['offer_id', 'person']).count().reset_index()
    # use pivot to create a dataframe with counts for each corresponding pair
    df_new=df_new.pivot_table(index='person',columns='offer_id',values='target_var')
    # replace NaNs with 0's
    df_new = df_new.replace(np.nan, 0)
    # the counts include values greater than 1. change values greater than 1 to 1
    #user_item=df_new.applymap(lambda x: 1 if x > 0 else x)
    #return user_item # return the user_item matrix

user_item = create_user_item_matrix(new_transcript)

In [28]:
# the counts include values greater than 1. change values greater than 1 to 1
# this was done in the IBM project, but may not be applicable here?
user_item=df_new.applymap(lambda x: 1 if x > 0 else x)


NameError: name 'df_new' is not defined

In [None]:
# create user item matrix from IBM Exercise
def create_user_item_matrix(df):
    '''
    INPUT:
    df - pandas dataframe with article_id, title, user_id columns
    
    OUTPUT:
    user_item - user item matrix 
    
    Description:
    Return a matrix with user ids as rows and article ids on the columns with 1 values where a user interacted with 
    an article and a 0 otherwise
    '''
    # user item matrix based on criteria outlined in the markdown cell above
    # code from mentor: https://knowledge.udacity.com/questions/140813, made own notes to
    # understand each line
    
    # get count of interactions for each user and article
    df_new = df.groupby(['offer_id', 'person']).count().reset_index()
    # use pivot to create a dataframe with counts for each corresponding pair
    df_new=df_new.pivot_table(index='person',columns='offer_id',values='target_var')
    # replace NaNs with 0's
    df_new = df_new.replace(np.nan, 0)
    # the counts include values greater than 1. change values greater than 1 to 1
    #user_item=df_new.applymap(lambda x: 1 if x > 0 else x)
    #return user_item # return the user_item matrix

user_item = create_user_item_matrix(new_transcript)

#### Perform SVD on User Item Matrix

In [None]:
# Perform SVD on the User-Item Matrix Here
# get user, sigma, and v transpose matrices. use built in to get the three matrices 
u, s, vt = np.linalg.svd(user_item)

In [None]:
# test expected outcomes from matrix creation
print('Expected shape of u: ' + str(len(new_transcript['person'].value_counts())))
if len(u) == len(new_transcript['person'].value_counts()):
    print('Pass')
else:
    print('Result is now what was expected. Try again')

    
print('Expected shape of s: ' + str(len(new_transcript['offer_id'].value_counts())))
if len(s) == len(new_transcript['offer_id'].value_counts()):
    print('Pass')
else:
    print('Result is now what was expected. Try again')

print()
print(u.shape)
print(s.shape)
vt.shape

#### CHOOSE NUMBER OF LATENT FEATURES

In [None]:
num_latent_feats = np.arange(1,11)
num_latent_feats

In [None]:
num_latent_feats = np.arange(1,11)
sum_errs = []

for k in num_latent_feats:
    # restructure with k latent features
    s_new, u_new, vt_new = np.diag(s[:k]), u[:, :k], vt[:k, :]
    
    # take dot product
    user_item_est = np.around(np.dot(np.dot(u_new, s_new), vt_new))

    # compute error for each prediction to actual value
    diffs = np.subtract(user_item, user_item_est)
    
    # total errors and keep track of them
    err = np.sum(np.sum(np.abs(diffs)))
    sum_errs.append(err)   

In [None]:
plt.plot(num_latent_feats, 1 - np.array(sum_errs)/new_transcript.shape[0]);
plt.xlabel('Number of Latent Features');
plt.ylabel('Accuracy');
plt.title('Accuracy vs. Number of Latent Features');

#### SPLIT INTO TRAIN AND TEST

Testing code

In [None]:
def create_user_item_matrix(transcript, portfolio, filename):
    '''
    Return the user item matrix that indicate the number of offer complete of a particular user
    
    INPUT:
    offer - a cleaned transcript dataframe
    filename(string) - the file name that save the user item matrix
    
    OUTPUT:
    user_item_matrix - the user item matrix which 
        - row is user 
        - column is offer
        - value is the number of offer complete by the user (NaN means no offer given)
    
    '''
    # create an empty user item matrix
    user_item_matrix = transcript.groupby(['person', 'offer_id'])['event'].agg(lambda x: np.nan).unstack()
    # uncomment if want to focus on just bogo and discount
    # user_item_matrix.drop(list(portfolio[portfolio['offer_type']=='informational']['id']), axis=1, inplace=True)
    
    for offer_id in user_item_matrix.columns:
        print("Now processing: ", offer_id)
        num = 0
        for person in user_item_matrix.index:
            num += 1
            if num % 1000 == 0:
                print("Progress: ", round(num/16994*100, 2), '%')
            events = []
            for event in transcript[(transcript['offer_id']==offer_id) & (transcript['person']==person)]['event']:
                events.append(event)
            if len(events) >= 3:
                user_item_matrix.loc[person, offer_id] = 0
                for i in range(len(events)-2):
                    # check if the transaction sequence is offer received -> offer viewed -> offer completed
                    # if yes, we assume the user reacted positively with the offer we provided.
                    if (events[i] == 'offer received') & (events[i+1] == 'offer viewed') & (events[i+2] == 'offer completed'):
                        user_item_matrix.loc[person, offer_id] += 1
            elif len(events) > 0:
                user_item_matrix.loc[person, offer_id] = 0
    
    # store the large martix into file
    fh = open(filename, 'wb')
    pickle.dump(user_item_matrix,fh)
    fh.close()
    
    return user_item_matrix

In [None]:
start = timeit.default_timer()

create_user_item_matrix(new_transcript, portfolio, 'user_item_matrix.p')

stop = timeit.default_timer()
print('Time: ', stop - start)

In [None]:
# split dataframe into train and test sets
# 70% training, 30% test
train = new_transcript.head(int(round(len(new_transcript)*.70, 0)))
test = new_transcript.tail(len(new_transcript) - int(round(len(new_transcript)*.70, 0)))

In [None]:
def create_test_and_train_user_item(df_train, df_test):
    '''
    INPUT:
    df_train - training dataframe
    df_test - test dataframe
    
    OUTPUT:
    user_item_train - a user-item matrix of the training dataframe 
                      (unique users for each row and unique articles for each column)
    user_item_test - a user-item matrix of the testing dataframe 
                    (unique users for each row and unique articles for each column)
    test_idx - all of the test user ids
    test_arts - all of the test article ids
    
    '''
        
    # use function from above to create user_item matricies for the train and test data
    user_item_train = create_user_item_matrix(df_train)
    user_item_test = create_user_item_matrix(df_test)
    
    # get user ids in training and test sets
    train_idx = set(user_item_train.index)
    test_idx = set(user_item_test.index)
    
    # get all offers for the training and test sets
    train_offers = set(user_item_train.columns)
    test_offers =  set(user_item_test.columns)
    
    # find out which user_ids are in both the training and test sets
    # match_idx = list(train_idx.intersection(test_idx))
    
    
    return user_item_train, user_item_test, test_idx, test_offers

user_item_train, user_item_test, test_idx, test_offers = create_test_and_train_user_item(train, test)

In [None]:
# fit SVD on the user_item_train matrix
u_train, s_train, vt_train = np.linalg.svd(user_item_train)# fit svd similar to above then use the cells below

In [None]:
# get row and column indices for user ids and articles for the test set
row_idxs = user_item_train.index.isin(test_idx)
col_idxs = user_item_train.columns.isin(test_offers)

In [None]:
# get u and vt for the test data
u_test = u_train[row_idxs, :]
vt_test = vt_train[:, col_idxs]

In [None]:
u_test

In [None]:
vt_test

In [None]:
# Use these cells to see how well you can use the training 
# decomposition to predict on test data
print(u_train.shape, s_train.shape, vt_train.shape)

# u matrix columns are the ideal number of latent features to keep (10334)
# however, we have contraints, so vt is the maximum number of latent features we can keep
# therefore, will change the u matrix to only include what we are constrained by (10)
u_train_new = u_train[:, : len(s_train)]

# also want s to be a diagnoal. currently just a regular array
s_train_new = np.diag(s_train)

# vt stays the same since already 10x10
vt_train_new = vt_train

# verify that new u matrix has the number of columns changed to fit constraint
# expect a 10334x10 matrix
u_train_new.shape #matched

In [None]:
num_latent_feats = np.arange(1,11)

sum_errs_train = []
sum_errs_test = []
all_errs = []

for k in num_latent_feats:
    # restructure with k latent features
    s_train_k, u_train_k, vt_train_k = np.diag(s_train[:k]), u_train[:, :k], vt_train[:k, :]
    u_test_k, vt_test_k = u_test[:, :k], vt_test[:k, :]
    
    # take dot product
    user_item_train_preds = np.around(np.dot(np.dot(u_train_k, s_train_k), vt_train_k))
    user_item_test_preds =  np.around(np.dot(np.dot(u_test_k, s_train_k), vt_test_k))
    all_errs.append(1 - ((np.sum(user_item_test_preds)+np.sum(np.sum(user_item_test)))/(user_item_test.shape[0]*user_item_test.shape[1])))
    
    # compute error for each prediction to actual value
    diffs_train = np.subtract(user_item_train, user_item_train_preds)
    diffs_test = np.subtract(user_item_test, user_item_test_preds)
    
    # squared error for predicted vs. actual
    sum_square_errs_train = np.sum(np.sum(diffs_train))
    sum_square_errs_test = np.sum(np.sum(diffs_test))
    
    
    # total errors and keep track of them
    train_errors = np.sum(np.sum(np.abs(diffs_train)))
    sum_errs_train.append(train_errors)
    
    test_errors = np.sum(np.sum(np.abs(diffs_test)))
    sum_errs_test.append(test_errors)
    
    
plt.plot(num_latent_feats, 1 - np.array(sum_errs_train)/(user_item_train.shape[0]*user_item_test.shape[1]), label = 'Train');
plt.plot(num_latent_feats, 1 - np.array(sum_errs_test)/(user_item_test.shape[0]*user_item_test.shape[1]), label='Test');
#plt.plot(num_latent_feats, all_errs, label='All Data');
plt.xlabel('Number of Latent Features');
plt.ylabel('Accuracy');
plt.title('Accuracy vs. Number of Latent Features');
plt.legend();

## Logistic Regression Model

Why Logistic Regression?
- This problem can be approached with a binary target outcome variable: Was the offer successful of was it not successful? If a person received, then viewed, then completed the offer, then it would be considered successful


Logistic regression also takes in many input variables to determine if a new offer will be successful or not when a customer is introduced to a new offer.
 

See columns to determinie what needs to be changed for logistic regression

In [None]:
# drop became member on since will use member_len as input
# member_len derived from new_became_member_on, which was derived from became_member_on
logistic_data = new_transcript
#logistic_data.drop(columns =['became_member_on', 'id', 'new_became_member_on'], inplace = True)
logistic_data = pd.concat([logistic_data, pd.get_dummies(new_transcript['Age_Gen'])], axis = 1)
logistic_data.drop(columns=['Age_Gen'], inplace = True)

In [None]:
#encode gender
logistic_data = pd.concat([logistic_data, pd.get_dummies(new_transcript['gender'])], axis = 1)
logistic_data.drop(columns=['gender'], inplace = True)

In [None]:
#encode event
logistic_data = pd.concat([logistic_data, pd.get_dummies(new_transcript['event'])], axis = 1)
logistic_data.drop(columns=['event'], inplace = True)

In [None]:
#encode offer
logistic_data = pd.concat([logistic_data, pd.get_dummies(new_transcript['offer_id'])], axis = 1)
logistic_data.drop(columns=['offer_id'], inplace = True)

Prepare data

In [None]:
logistic_data.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#fill target_var with 0
logistic_data['target_var'].fillna(0, inplace = True)

# get x var
x_vars = logistic_data.fillna(-9999).drop(columns=['target_var', 'person'])
# get y/target var
y_var = logistic_data.fillna(-9999)['target_var']

Build Model and Split data

In [None]:
log = LogisticRegression()

X_train, X_test, y_train, y_test = train_test_split(x_vars, y_var, train_size = .7, random_state = 50)

In [None]:
log.fit(X_train, y_train)

In [None]:
predictions = log.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))