# Modeling
We use a graphical model to understand the 'sequence' of user actions.
- We compute a transition graph for each user.
    - nodes: 128 actions
    - edges: transition counts between consecutive actions, normalized per user (each count is divided by that user's total number of actions).
- Training: We build 2 training matrices: the average transition graph of users with bookings and the average transition graph of users without bookings.
- Prediction: We take a new user's sequence of actions and compare it to the two trained matrices. The comparison metric could be 'simple count', 'Euclidean distance', 'correlation', etc.

# TODO
- Performance is still not great. We haven't got a good metric yet.

Reference:
- https://pkghosh.wordpress.com/2015/07/06/customer-conversion-prediction-with-markov-chain-classifier/

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Load the two input tables: the users file (one row per user, used below via
# its `id` / `country_destination` columns) and the sessions file (keyed by `user_id`).
df = pd.read_csv(filepath_or_buffer='train_users_2.csv')
df_s = pd.read_csv(filepath_or_buffer='sessions.csv')

In [4]:
# Restrict the problem to two classes: 'NDF' (treated as "no booking" further
# down) and 'US'. `.copy()` gives an independent frame, so the column
# assignments below never write into a slice of the originally loaded table.
df = df[df.country_destination.isin(['NDF', 'US'])].copy()

# Missing ages get the sentinel value 1000, which falls in the last
# (100, 10000] bin of the categorisation below.
df['age'] = df['age'].fillna(1000)
df['ageCat'] = pd.cut(df['age'], 
                        bins=[0,10,20,30,40,50,60,70,80,90,100,10000])

# Assign the filled columns back instead of `fillna(..., inplace=1)`:
# `inplace` must be a real bool, and an in-place fill on a column selection is
# chained assignment that is not guaranteed to reach the parent frame.
df_s['action_detail'] = df_s['action_detail'].fillna('-unknown-')
df_s['action_type'] = df_s['action_type'].fillna('-unknown-')

In [5]:
# Keep only the users that have at least one session row, and only the
# session rows that belong to those users.
ids_with_session = df.loc[df['id'].isin(df_s['user_id']), 'id']
df = df.loc[df['id'].isin(ids_with_session)]
df_s = df_s.loc[df_s['user_id'].isin(ids_with_session)]

In [6]:
# Vocabulary of distinct `action_detail` values and the matrix row/column
# index assigned to each of them (position in `actions`).
actions = list(df_s['action_detail'].unique())
hash_action_idx = dict(zip(actions, range(len(actions))))
n = len(actions)
n

128

In [7]:
# Lookup table: user id -> country_destination label.
hash_id_book = dict(zip(df['id'].values, df['country_destination'].values))

# Build average adjacency matrices

In [8]:
import pyprind

# Main adjacency matrices we want to build. Entry [i, j] accumulates, over all
# users of a class, the count of "action i immediately followed by action j",
# with each user's counts divided by that user's number of actions.
# Sums are accumulated in float64 and cast to float32 once at the end, so that
# adding many small per-user contributions does not lose precision.
mat_book = np.zeros((n, n), dtype=np.float64)
mat_nobook = np.zeros((n, n), dtype=np.float64)

gp = df_s.groupby('user_id')
pbar = pyprind.ProgBar(len(gp))
# `user_id` (not `id`) so the builtin `id` is not shadowed in later cells.
for user_id, df_id in gp:
    # Look the column up by name rather than by position, so the cell does not
    # depend on the column order of sessions.csv. An unknown action still
    # raises KeyError, as before.
    idx = np.array([hash_action_idx[a] for a in df_id['action_detail']],
                   dtype=np.intp)
    n_action = len(idx)

    # 'NDF' users feed the no-booking matrix, everyone else the booking matrix.
    target = mat_nobook if hash_id_book[user_id] == 'NDF' else mat_book

    # Consecutive pairs (idx[k], idx[k+1]) are the transitions. np.add.at is
    # unbuffered, so a pair that occurs several times is added several times
    # (plain fancy-index `+=` would only add it once). A user with a single
    # action has no pairs and contributes nothing.
    np.add.at(target, (idx[:-1], idx[1:]), 1.0 / n_action)
    pbar.update()

# Average over the users of each class and scale by 100.
class_counts = df.country_destination.value_counts()
mat_book = (mat_book / class_counts['US'] * 100).astype(np.float32)
mat_nobook = (mat_nobook / class_counts['NDF'] * 100).astype(np.float32)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:26


In [9]:
mat_book

array([[ 0.67418551,  0.11415691,  0.5486362 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.10299428,  0.01673771,  0.06297693, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.55753076,  0.05978298,  0.02143256, ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [10]:
mat_nobook

array([[  5.50632298e-01,   1.07522182e-01,   3.83264452e-01, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  9.62136313e-02,   2.04707794e-02,   6.28685057e-02, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  3.86171341e-01,   6.79408908e-02,   1.43057769e-02, ...,
          4.82652067e-05,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  4.62541539e-05,   0.00000000e+00,   0.00000000e+00, ...,
          2.07495268e-05,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   7.23191943e-06,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   2.96026592e-05]], dtype=float32)

# Train accuracy

In [11]:
def simple_count_score(df_user, book_mat=None, nobook_mat=None, action_idx=None):
    """Score one user's action sequence against the two class matrices.

    Each transition between consecutive actions votes +1 when its weight in
    the booking matrix is strictly greater than in the no-booking matrix, and
    -1 otherwise. Ties, including transitions that are zero in both matrices,
    therefore count towards "no booking". The vote total is divided by the
    user's number of actions.

    Parameters
    ----------
    df_user : pandas.DataFrame
        Session rows of a single user, in order, with an 'action_detail'
        column.
    book_mat, nobook_mat : 2-D array, optional
        Transition matrices for users with / without a booking. Default to
        the notebook-level ``mat_book`` / ``mat_nobook``.
    action_idx : dict, optional
        Maps an action_detail value to its matrix index. Defaults to the
        notebook-level ``hash_action_idx``.

    Returns
    -------
    float
        Score in [-1, 1); 0.0 for a user with zero or one action.

    Raises
    ------
    KeyError
        If an action is not present in ``action_idx``.
    """
    if book_mat is None:
        book_mat = mat_book
    if nobook_mat is None:
        nobook_mat = mat_nobook
    if action_idx is None:
        action_idx = hash_action_idx

    # Select the column by name instead of positional index 3, so the function
    # does not depend on the column order of the sessions table.
    sequence = df_user['action_detail']
    n_action = len(sequence)
    if n_action == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0.0

    idx = np.array([action_idx[a] for a in sequence], dtype=np.intp)
    prev_idx, cur_idx = idx[:-1], idx[1:]

    # Compare all transitions at once instead of one Python iteration each.
    book_votes = int(np.count_nonzero(
        book_mat[prev_idx, cur_idx] > nobook_mat[prev_idx, cur_idx]))
    nobook_votes = (n_action - 1) - book_votes
    return (book_votes - nobook_votes) / float(n_action)

In [12]:
import pyprind

gp = df_s.groupby('user_id')
pbar = pyprind.ProgBar(len(gp))

# One (label, score) pair per user. The loop variable is `user_id` rather than
# `id`, so the builtin `id` is not shadowed for every later cell.
scores = []
for user_id, df_user in gp:
    scores.append((hash_id_book[user_id], simple_count_score(df_user)))
    pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:17


In [16]:
# Tabulate the per-user (label, score) pairs.
df_book_score = pd.DataFrame(data=scores, columns=['book', 'book_score'])

In [18]:
df_book_score.head()

Unnamed: 0,book,book_score
0,US,0.375
1,NDF,-0.9
2,NDF,-0.967742
3,US,-0.286439
4,US,-0.515337


In [37]:
def map_book(s):
    """Encode a destination label as a binary target: 1 for 'US', 0 otherwise."""
    return 1 if s == 'US' else 0
    
def map_book_score(t, threshold=0.1):
    """Turn a booking score into a binary prediction.

    Parameters
    ----------
    t : float
        Score to classify.
    threshold : float, optional
        Minimum score (inclusive) that is predicted as a booking. Defaults to
        0.1, the cut-off this notebook used before it became a parameter.

    Returns
    -------
    int
        1 when ``t >= threshold``, otherwise 0 (this includes NaN scores,
        since any comparison with NaN is false).
    """
    return 1 if t >= threshold else 0
    
# Binary ground truth and thresholded prediction, one value per user row.
df_book_score['y_true'] = df_book_score['book'].map(map_book)
df_book_score['y_pred'] = df_book_score['book_score'].map(map_book_score)

In [38]:
df_book_score.head()

Unnamed: 0,book,book_score,y_true,y_pred
0,US,0.375,1,1
1,NDF,-0.9,0,0
2,NDF,-0.967742,0,0
3,US,-0.286439,1,0
4,US,-0.515337,1,0


In [39]:
# Plain arrays of the true and predicted labels for the metric functions.
y_true = df_book_score['y_true'].values
y_pred = df_book_score['y_pred'].values

In [40]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Accuracy as a percentage, then the confusion matrix and the per-class report.
print('accuracy: %.2f %%' % (100 * accuracy_score(y_true=y_true, y_pred=y_pred)))
print(confusion_matrix(y_true=y_true, y_pred=y_pred))
print(classification_report(y_true=y_true, y_pred=y_pred))

accuracy: 62.66 %
[[27971 17070]
 [ 7255 12840]]
             precision    recall  f1-score   support

          0       0.79      0.62      0.70     45041
          1       0.43      0.64      0.51     20095

avg / total       0.68      0.63      0.64     65136

