In [372]:
# Importing libraries
import pandas as pd
from os import listdir
from os.path import isfile, join
import xgboost as xgb

# Reading the data
data_path = 'Data/'
log = pd.read_csv(data_path + 'LOG_WEB_201708.csv', sep=';')
clients = pd.read_csv(data_path + 'clients.csv', sep = ';')
cmd = pd.read_csv(data_path + 'CMD.csv', sep = ';')
cmd = cmd[cmd['ORDER_DATE'] >= '2017-08-01']
table = pd.read_csv(data_path + 'TABLE_CONVERSION.csv', sep = ';')

# Remove unamed column
log = log.loc[:, ~log.columns.str.contains('^Unnamed')]
clients = clients.loc[:, ~clients.columns.str.contains('^Unnamed')]
cmd = cmd.loc[:, ~cmd.columns.str.contains('^Unnamed')]
table = table.loc[:, ~table.columns.str.contains('^Unnamed')]

# Adjust data types - wrong types were causing issues
clients.CLIENT_NUMBER = [str(clients) for clients in clients.CLIENT_NUMBER]
cmd.CLIENT_NUMBER = [str(cmd) for cmd in cmd.CLIENT_NUMBER]
table.CLIENT_NUMBER = [str(table) for table in table.CLIENT_NUMBER]
table.VISITOR_ID = [str(table) for table in table.VISITOR_ID]
log.VISITOR_ID = [str(log) for log in log.VISITOR_ID]

# Cannot distinguish between multiple orders on the same day, so the table has been simplified.
cmd = cmd.groupby(['CLIENT_NUMBER', 'ORDER_DATE', 'CVIC']).sum()
# Convert back to typical DataFrame type
cmd = cmd.reset_index()

# Join cmd + table + log

In [374]:
cmd.CLIENT_NUMBER = [str(cmd) for cmd in cmd.CLIENT_NUMBER]
log['EVENT_SHORT'] = log.EVENT_DATE.str.slice(0,10)
data = log[log.VISITOR_ID != '0']
data = pd.merge(data, table, left_on='VISITOR_ID', right_on='VISITOR_ID', how='left')
data = pd.merge(data, cmd, left_on=['CLIENT_NUMBER', 'EVENT_SHORT'], right_on=['CLIENT_NUMBER', 'ORDER_DATE'], how='left')

In [376]:
data

Unnamed: 0,VISITOR_ID,CONNECTED_SESSION,ID_SESSION,SESSION_START_DATE,EVENT_DATE,PAGES,GLOBAL_SOURCES,DEVICE_TYPE,DEVICE_MODEL,LOADINGS,EVENT_SHORT,CLIENT_NUMBER,ORDER_DATE,CVIC,ORDER_NUMBER,PRE_TAX_AMOUNT
0,381225,OUI,1,2017-08-08 00:02:06,2017-08-08 00:02:06,accueil,Liens Sponsorisés,Smartphone,,1,2017-08-08,2912426551072989703,,,,
1,563053,OUI,13,2017-08-08 00:01:32,2017-08-08 00:03:27,accueil,Accès Direct,Ordinateur,,1,2017-08-08,-5859629033836166974,2017-08-08,False,20279565.0,280.34
2,563053,OUI,13,2017-08-08 00:01:32,2017-08-08 00:22:04,accueil,Accès Direct,Ordinateur,,1,2017-08-08,-5859629033836166974,2017-08-08,False,20279565.0,280.34
3,706588,OUI,23,2017-08-08 00:01:29,2017-08-08 00:01:32,accueil,Accès Direct,Ordinateur,,1,2017-08-08,-7174350147193854740,2017-08-08,False,10139782.0,19.08
4,572702,OUI,64,2017-08-08 00:03:19,2017-08-08 00:03:19,accueil,Moteurs,Tablette,101 Oxygen,1,2017-08-08,866042539231455965,2017-08-08,True,10139780.0,97.50
5,572702,OUI,64,2017-08-08 00:03:19,2017-08-08 00:09:42,accueil,Moteurs,Tablette,101 Oxygen,1,2017-08-08,866042539231455965,2017-08-08,True,10139780.0,97.50
6,572702,OUI,64,2017-08-08 00:03:19,2017-08-08 00:10:36,accueil,Moteurs,Tablette,101 Oxygen,1,2017-08-08,866042539231455965,2017-08-08,True,10139780.0,97.50
7,698042,OUI,201,2017-08-08 00:14:41,2017-08-08 00:14:41,accueil,Liens Sponsorisés,Smartphone,sm-j510fn,1,2017-08-08,3565018203965887144,,,,
8,489527,OUI,203,2017-08-08 00:15:00,2017-08-08 00:15:00,accueil,Liens Sponsorisés,Ordinateur,,1,2017-08-08,5954669164526364513,,,,
9,489527,OUI,203,2017-08-08 00:15:00,2017-08-08 00:15:40,accueil,Liens Sponsorisés,Ordinateur,,1,2017-08-08,5954669164526364513,,,,


# Visitor Journey

Looking at the logs of **August 2017** <br>
**Visitor 261647** first connects at **11:25 AM**
<br> <br>
Connected through **Sponsored Links**, on a **Computer** <br>
**19 minutes:** Address entered for delivery.  <br>
**22 minutes:** Accessed via phone to view account <br>
**28 minutes:** Puchase complete.<br>
**28 minutes:** Disconnected from website
<br> <br>
**187.50 Euro Spent**

# What do I want to classify?
If a customer was convinced to purchase through magazine or online.

# Useful Features
- PAGES: Are they browsing the wares or going straight to the purchase?<br>
- GLOBAL_SOURCES: What caused them to connect to the website? <br>
- DEVICE_TYPE: Maybe magazine users use different devices <br>
- CVIC - Did they use a magazine coupon <br>
- PRE_TAX_AMOUNT <br>

# Feature Engineering

- time connected <br>
- number of unique pages visited
- T/F: Placed order 

# Generating the Data Matrix
- VISITOR_ID <br>
- CLIENT_NUMBER <br>
- EVENT_SHORT <br>
- GLOBAL_SOURCES <br>
- DEVICE TYPE <br>
- CVIC <br>
- PRE_TAX_AMOUNT <br>


In [419]:
tmp = data[['VISITOR_ID', 'CLIENT_NUMBER', 'EVENT_SHORT', 'GLOBAL_SOURCES', 'DEVICE_TYPE', 'CVIC', 'PRE_TAX_AMOUNT', 'EVENT_DATE', 'PAGES']].copy()
tmp = tmp.fillna(0)
tmp['PURCHASE'] = True
tmp.loc[tmp.PRE_TAX_AMOUNT == 0, 'PURCHASE'] = False
tmp.columns = ['VISITOR_ID', 'CLIENT_NUMBER', 'DATE', 'SOURCE', 'DEVICE', 'PROMO', 'EXPENSE', 'EVENT_DATE', 'PAGES', 'PURCHASE']
tmp = tmp[['VISITOR_ID', 'CLIENT_NUMBER', 'DATE', 'SOURCE', 'DEVICE', 'PROMO', 'EXPENSE', 'PURCHASE', 'EVENT_DATE', 'PAGES']]

In [433]:
len(set(tmp.CLIENT_NUMBER))

22451

In [421]:
tmp.head()

Unnamed: 0,VISITOR_ID,CLIENT_NUMBER,DATE,SOURCE,DEVICE,PROMO,EXPENSE,PURCHASE
0,381225,2912426551072989703,2017-08-08,Liens Sponsorisés,Smartphone,0,0.0,False
1,563053,-5859629033836166974,2017-08-08,Accès Direct,Ordinateur,False,280.34,True
2,563053,-5859629033836166974,2017-08-08,Accès Direct,Ordinateur,False,280.34,True
3,706588,-7174350147193854740,2017-08-08,Accès Direct,Ordinateur,False,19.08,True
4,572702,866042539231455965,2017-08-08,Moteurs,Tablette,True,97.5,True


# Testing

In [431]:
tmp[tmp.PURCHASE == True]

In [432]:
tmp

Unnamed: 0,VISITOR_ID,CLIENT_NUMBER,DATE,SOURCE,DEVICE,PROMO,EXPENSE,PURCHASE,EVENT_DATE,PAGES
0,381225,2912426551072989703,2017-08-08,Liens Sponsorisés,Smartphone,0,0.00,False,2017-08-08 00:02:06,accueil
1,563053,-5859629033836166974,2017-08-08,Accès Direct,Ordinateur,False,280.34,True,2017-08-08 00:03:27,accueil
2,563053,-5859629033836166974,2017-08-08,Accès Direct,Ordinateur,False,280.34,True,2017-08-08 00:22:04,accueil
3,706588,-7174350147193854740,2017-08-08,Accès Direct,Ordinateur,False,19.08,True,2017-08-08 00:01:32,accueil
4,572702,866042539231455965,2017-08-08,Moteurs,Tablette,True,97.50,True,2017-08-08 00:03:19,accueil
5,572702,866042539231455965,2017-08-08,Moteurs,Tablette,True,97.50,True,2017-08-08 00:09:42,accueil
6,572702,866042539231455965,2017-08-08,Moteurs,Tablette,True,97.50,True,2017-08-08 00:10:36,accueil
7,698042,3565018203965887144,2017-08-08,Liens Sponsorisés,Smartphone,0,0.00,False,2017-08-08 00:14:41,accueil
8,489527,5954669164526364513,2017-08-08,Liens Sponsorisés,Ordinateur,0,0.00,False,2017-08-08 00:15:00,accueil
9,489527,5954669164526364513,2017-08-08,Liens Sponsorisés,Ordinateur,0,0.00,False,2017-08-08 00:15:40,accueil


# VISITOR JOURNEY

In [230]:
v_17 = log[log.VISITOR_ID == '175752']
v_17.head()

Unnamed: 0,VISITOR_ID,CONNECTED_SESSION,ID_SESSION,SESSION_START_DATE,EVENT_DATE,PAGES,GLOBAL_SOURCES,DEVICE_TYPE,DEVICE_MODEL,LOADINGS
159645,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:27:43,accueil,Email marketing,Ordinateur,,1
159646,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:28:14,accueil,Email marketing,Ordinateur,,1
159647,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:41:45,accueil,Email marketing,Ordinateur,,1
182418,175752,OUI,14817,2017-08-27 10:17:25,2017-08-27 10:17:25,accueil,Accès Direct,Ordinateur,,1
184863,175752,OUI,33484,2017-08-27 14:46:12,2017-08-27 14:46:12,accueil,Accès Direct,Ordinateur,,1


In [235]:
v_17.sort_values(by=['EVENT_DATE'])

Unnamed: 0,VISITOR_ID,CONNECTED_SESSION,ID_SESSION,SESSION_START_DATE,EVENT_DATE,PAGES,GLOBAL_SOURCES,DEVICE_TYPE,DEVICE_MODEL,LOADINGS
159645,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:27:43,accueil,Email marketing,Ordinateur,,1
2091135,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:27:51,ete-indien-2017,Email marketing,Ordinateur,,1
159646,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:28:14,accueil,Email marketing,Ordinateur,,1
278822,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:28:18,nouvelle-collection-automne-hiver-2017,Email marketing,Ordinateur,,1
334433,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:28:30,category 21::fiche produit::6037865839848578407,Email marketing,Ordinateur,,1
667488,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:31:09,ajout au panier,Email marketing,Ordinateur,,1
445710,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:31:30,mon panier,Email marketing,Ordinateur,,1
445711,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:31:55,mon panier,Email marketing,Ordinateur,,1
607990,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:32:10,mes informations,Email marketing,Ordinateur,,1
547217,175752,OUI,28914,2017-08-24 13:27:43,2017-08-24 13:32:37,ma livraison,Email marketing,Ordinateur,,1
