In [1]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# First look at the raw file using read_csv's default ',' separator.
# The preview shows every field fused into a single ';'-joined column,
# i.e. the file is ';'-delimited and has to be re-read with sep=';'.
tm_sales_3 = pd.read_csv('tm_sales_3.csv')
tm_sales_3.head()

Unnamed: 0,FILIAL_ID;PROD_ID;SUBS_ID;ACT_DTTM
0,3;1499;id5764122;18-03-2020 15:44
1,6;3020;id7642700;15-03-2020 14:21
2,2;5677;id1374509;17-03-2020 11:48


In [3]:
def read_sales_file(path):
    """Read one of the ';'-separated tm_sales_*.csv files.

    ACT_DTTM is parsed as a day-first datetime (the raw values look like
    '18-03-2020 15:44'). SUBS_ID is forced to ``str``: some ids are written
    without the 'id' prefix, and a file containing only such ids would
    otherwise be inferred as numeric, which breaks both the later prefix
    normalisation and the string merge key. Missing SUBS_ID values stay NaN.
    """
    return pd.read_csv(
        path,
        sep=';',
        parse_dates=['ACT_DTTM'],
        dayfirst=True,
        dtype={'SUBS_ID': str},
    )


# Activation log: START_DTTM / END_DTTM are the connection and disconnection
# timestamps; SUBS_ID is kept as a string so it matches the sales key dtype.
prod_activations_logs = pd.read_csv(
    'prod_activations_logs.csv',
    sep=';',
    parse_dates=['START_DTTM', 'END_DTTM'],
    dayfirst=True,
    dtype={'SUBS_ID': str},
)

# The three sales files share one layout, so they go through the same reader.
tm_sales_1 = read_sales_file('tm_sales_1.csv')
tm_sales_2 = read_sales_file('tm_sales_2.csv')
tm_sales_3 = read_sales_file('tm_sales_3.csv')

In [4]:
# Preview the first rows of the activation log to confirm that the columns
# were split correctly and the two timestamp columns were parsed.
prod_activations_logs.head()

Unnamed: 0,SUBS_ID,PROD_ID,START_DTTM,END_DTTM
1,id4651830,1954,2020-03-20 14:59:00,2020-12-01 00:00:00
5,id7646509,6431,2020-03-19 13:00:00,2020-03-19 13:03:00
2,id7461794,3310,2020-03-20 17:25:00,2020-12-01 00:00:00
3,id5416547,1743,2020-03-17 10:17:00,2020-03-25 11:00:00
4,id8238421,1859,2020-03-01 11:42:00,2020-03-01 11:43:00


In [5]:
# Preview the first sales file; note that SUBS_ID is not always written with
# the 'id' prefix.
tm_sales_1.head()

Unnamed: 0,SUBS_ID,FILIAL_ID,PROD_ID,ACT_DTTM
0,id4651830,1,1954,2020-03-20 14:59:00
1,7646509,5,6431,2020-03-19 13:00:00
2,id7412683,4,3313,2020-03-22 17:25:00


Check product connections for specific users by merging sales files with system connection logs

In [6]:
# Stack the three sales files into a single table with a fresh 0..n-1 index.
sales_parts = [tm_sales_1, tm_sales_2, tm_sales_3]
tm_sales = pd.concat(sales_parts, ignore_index=True)

In [7]:
# (rows, columns) of the combined sales table before any cleaning.
tm_sales.shape

(10, 4)

If a row in the sales file does not have a specified *SUBS_ID*, it is skipped


In [8]:
# Drop only the rows whose SUBS_ID is missing; gaps in other columns do not
# cause a row to be removed. The original index labels are kept.
tm_sales = tm_sales.dropna(subset = ['SUBS_ID'])

In [9]:
# (rows, columns) after removing the rows without SUBS_ID.
tm_sales.shape

(9, 4)

If ‘id’ is not at the beginning of *SUBS_ID*, it needs to be added

In [10]:
# Normalise subscriber ids to the 'id<digits>' form used by the activation
# log: values that already start with 'id' are kept, every other value gets
# the prefix prepended. Vectorised string operations replace the row-wise
# apply/lambda; a missing value stays missing instead of raising
# AttributeError. Re-running the cell does not add a second prefix.
subs_id = tm_sales['SUBS_ID']
has_prefix = subs_id.str.startswith('id', na=False)
tm_sales['SUBS_ID'] = subs_id.where(has_prefix, 'id' + subs_id)

In [11]:
# Show the full cleaned sales table (it is small enough to display whole).
tm_sales

Unnamed: 0,SUBS_ID,FILIAL_ID,PROD_ID,ACT_DTTM
0,id4651830,1,1954,2020-03-20 14:59:00
1,id7646509,5,6431,2020-03-19 13:00:00
2,id7412683,4,3313,2020-03-22 17:25:00
3,id5416547,3,1743,2020-03-17 10:17:00
5,id8362218,7,9879,2020-03-05 11:42:00
6,id2185490,2,3210,2020-03-16 16:28:00
7,id5764122,3,1499,2020-03-18 15:44:00
8,id7642700,6,3020,2020-03-15 14:21:00
9,id1374509,2,5677,2020-03-17 11:48:00


In [12]:
# Inspect the activation log's column dtypes before it is joined to the sales
# table on SUBS_ID and PROD_ID.
prod_activations_logs.dtypes

SUBS_ID               object
PROD_ID               object
START_DTTM    datetime64[ns]
END_DTTM      datetime64[ns]
dtype: object

In [13]:
# PROD_ID is stored as object in the activation log; convert it to an integer
# column so it can be used as a merge key against the sales table.
prod_activations_logs = prod_activations_logs.astype({'PROD_ID': int})

In [14]:
# Keep only the (subscriber, product) pairs that appear both in the
# activation log and in the sales table (inner join on the two keys).
sales_logs = pd.merge(
    prod_activations_logs,
    tm_sales,
    how='inner',
    on=['SUBS_ID', 'PROD_ID'],
)

In [15]:
# Sales rows that have a matching activation record (same SUBS_ID and PROD_ID).
sales_logs

Unnamed: 0,SUBS_ID,PROD_ID,START_DTTM,END_DTTM,FILIAL_ID,ACT_DTTM
0,id4651830,1954,2020-03-20 14:59:00,2020-12-01 00:00:00,1,2020-03-20 14:59:00
1,id7646509,6431,2020-03-19 13:00:00,2020-03-19 13:03:00,5,2020-03-19 13:00:00
2,id5416547,1743,2020-03-17 10:17:00,2020-03-25 11:00:00,3,2020-03-17 10:17:00
3,id2185490,3210,2020-03-16 16:28:00,2020-12-01 00:00:00,2,2020-03-16 16:28:00
4,id7642700,3020,2020-03-15 14:21:00,2020-03-15 23:42:00,6,2020-03-15 14:21:00


In [16]:
# How long each product stayed connected: disconnection minus connection time.
sales_logs['difference'] = sales_logs['END_DTTM'].sub(sales_logs['START_DTTM'])

In [17]:
# Merged table including the new 'difference' column (END_DTTM - START_DTTM).
sales_logs

Unnamed: 0,SUBS_ID,PROD_ID,START_DTTM,END_DTTM,FILIAL_ID,ACT_DTTM,difference
0,id4651830,1954,2020-03-20 14:59:00,2020-12-01 00:00:00,1,2020-03-20 14:59:00,255 days 09:01:00
1,id7646509,6431,2020-03-19 13:00:00,2020-03-19 13:03:00,5,2020-03-19 13:00:00,0 days 00:03:00
2,id5416547,1743,2020-03-17 10:17:00,2020-03-25 11:00:00,3,2020-03-17 10:17:00,8 days 00:43:00
3,id2185490,3210,2020-03-16 16:28:00,2020-12-01 00:00:00,2,2020-03-16 16:28:00,259 days 07:32:00
4,id7642700,3020,2020-03-15 14:21:00,2020-03-15 23:42:00,6,2020-03-15 14:21:00,0 days 09:21:00


A sale is not counted if the disconnection *(END_DTTM)* occurred less than 5 minutes after the connection *(START_DTTM)*

In [18]:
# A sale counts only when the product stayed connected for at least five
# minutes; rows with a shorter connection are filtered out.
min_connected = pd.Timedelta(minutes=5)
long_enough = sales_logs['difference'] >= min_connected
result = sales_logs.loc[long_enough]

In [19]:
# Sales that remain after the 5-minute connection filter.
result

Unnamed: 0,SUBS_ID,PROD_ID,START_DTTM,END_DTTM,FILIAL_ID,ACT_DTTM,difference
0,id4651830,1954,2020-03-20 14:59:00,2020-12-01 00:00:00,1,2020-03-20 14:59:00,255 days 09:01:00
2,id5416547,1743,2020-03-17 10:17:00,2020-03-25 11:00:00,3,2020-03-17 10:17:00,8 days 00:43:00
3,id2185490,3210,2020-03-16 16:28:00,2020-12-01 00:00:00,2,2020-03-16 16:28:00,259 days 07:32:00
4,id7642700,3020,2020-03-15 14:21:00,2020-03-15 23:42:00,6,2020-03-15 14:21:00,0 days 09:21:00


In [20]:
# Subscriber ids of the counted sales, in ascending order, as a plain array.
result.sort_values('SUBS_ID')['SUBS_ID'].values

array(['id2185490', 'id4651830', 'id5416547', 'id7642700'], dtype=object)

In [21]:
# Persist the counted sales. The file gets a real '.csv' extension (the
# previous name 'result_csv' had none), and the row index is omitted because
# it only holds leftover row positions from the merge, which would otherwise
# be written as an unnamed first column.
result.to_csv('result.csv', index=False)