> Beautiful is better than ugly.  
Explicit is better than implicit.  
Simple is better than complex.  
Complex is better than complicated.  
Flat is better than nested.  

In [16]:
%reload_ext autoreload
%autoreload 2
import os, sys
sys.path.append('../')
import logging
import random
import numpy as np, pandas as pd
import config
from utilities import *
from _rulesbuilding import *

In [17]:
# Load the ruleset
rules = pd.read_csv(config.inputs['rules']['fullpath'])
pd.options.mode.chained_assignment = None  # default='warn'

In [18]:
"""
Requirement
Add the requirements to the regex builder
1. S (start): if the pattern in ​text_match​ 
column is a prefix of the description string, 
a match is found for the corresponding service 
(both ID and name are included in the table)
2. A (anywhere): similar to above, except 
that the pattern doesn’t have to be in the 
beginning of the description
3. R (regular expression): use ​text_match​ 
as a regular regular expression
"""

_rules = rules
_rules = _rules[(_rules['text_match'].notna() == False) | (_rules['text_exclude'].notna() == False)]
_rules['text_match'].fillna('.*', inplace = True)
_rules['text_exclude'].fillna('a^', inplace = True)

_rules['text_exclude'] = '(?!.*(' + _rules['text_exclude'] + ').*).*$'

prefix = np.where(_rules['matching'] == 'A', '^(?=.*(', '^(?=(') 
_rules['text_match'] = prefix + _rules['text_match'] + ').*)' + _rules['text_exclude']

_rules = _rules.groupby('service_id')['text_match'].apply(
            lambda t: '|'.join([str(i) for i in t])
        ).astype(str)

regexes_dict = {}
inv_map = {v: k for k, v in _rules.items()}
regexes_dict = {**regexes_dict, **inv_map}

output(random.choice(list(regexes_dict.items())))

('^(?=(.*).*)(?!.*(a^).*).*$|^(?=(.*).*)(?!.*(a^).*).*$|^(?=(Lim Commercials '
 '3mo).*)(?!.*(a^).*).*$|^(?=(CBS - Fake Cancel).*)(?!.*(a^).*).*$|^(?=(CBS '
 'All Access).*)(?!.*(a^).*).*$|^(?=(Commercial Free 1 '
 'Week).*)(?!.*(a^).*).*$|^(?=(CBS - Fake New).*)(?!.*(a^).*).*$|^(?=.*(All '
 'Access).*)(?!.*(a^).*).*$|^(?=(CBS - Fake '
 'Cancel).*)(?!.*(a^).*).*$|^(?=(Cancellation Confirmation '
 'CBS).*)(?!.*(a^).*).*$|^(?=(CBS - Fake Cancel Passive '
 'Churn).*)(?!.*(a^).*).*$|^(?=.*(CBS).*)(?!.*(a^).*).*$',
 39)


In [19]:
"""
Build the dictionary to map signals from
description text -
The mapping dictionay of signal keywords 
can be configured in ROOT_DIR/config.py
"""

signals = {'^.*(' + ('|'.join(v)) + ').*$' : k for k, v in config.inputs['signals'].items()}

output(signals)

{'^.*(cancelled|cancel).*$': 'cancellation',
 '^.*(coming|back|signup|signing|joining|welcome).*$': 'signup'}


In [20]:
"""
Build the dictionary to map signals from
description text -
The mapping dictionay of signal keywords 
can be configured in ROOT_DIR/config.py
"""


aservices = rules
aservices = aservices[aservices['service_name'].isin(config.inputs['allowed_services'])]
aservices = dict(zip(aservices['service_id'], aservices['service_name']))

output(aservices)

{1: 'Hulu', 3: 'Netflix', 8: 'Starz', 12: 'Showtime', 39: 'CBS All Access'}


## Data Processing ##     

> Args:  
>  1. `regexes_dict (dict of str: int): Defines the regexes that map to service ids`

>  2. `signals (dict of str: int): Defines the regexes that map to signals from the description`

>  3. `aservices (dict of int: str): Defines the service ids that map to allowed service names`



In [21]:
# Load the dataset
data = pd.read_csv(config.inputs['data']['fullpath'])

In [22]:
"""
Requirement
There 3 types of statuses:
1. N (new): this transaction is new
2. U (update): this transaction is 
updated; discard the old one
3. D (delete): remove this transaction
"""

# Remove transactions with status 'D' as per requirements
data = data[data['status'] != 'D']

# Remove transactions that have an older entry
data = data.sort_values('last_updated').drop_duplicates('item_id',keep='last')

In [23]:
# Copy the description column 
data['service_id'] = data[['description']]

# Replace the description in the new service_id
# column with the matching service_id from
# regexes_dict that we created above
data['service_id'] = data[['service_id']].replace({'service_id':regexes_dict}, regex=True)


In [24]:
"""
Requirement
To simplify the project, you only need to 
consider the following services, although
the data may include many others.
● Netflix -> 3
● Hulu -> 1
● CBS All Access  -> 39
● Starz -> 8
● Showtime -> 12
"""
data = data[data['service_id'].isin(list(aservices.keys()))]

doutput(data.head())



# Add the service names corresponding 
# to the service id

# Copy the description column 

data['service_name'] = data[['service_id']]
data['service_name'] = data[['service_name']].replace({'service_name': aservices})


       item_id     buyer_id  order_date      merchant_id  merchant_name    status    last_updated      description                                                                                    service_id
--  ----------  -----------  ------------  -------------  ---------------  --------  ----------------  -------------------------------------------------------------------------------------------  ------------
 1  1191248937  -1039890773  2016-02-19                6  Netflix          N         2016-02-20 11:18  we've cancelled your Netflix Account. This change will be effective Sunday, March 20, 2016.             1
 2  1221099193   -751620046  2016-02-27                6  Netflix          N         2016-02-28 22:26  Thanks for joining Netflix!                                                                             1
 3  1294618642    -78172962  2016-03-18                6  Netflix          N         2016-03-18 16:18  we've cancelled your Netflix Account. This change will be eff

In [25]:
"""
Requirement
You still need to figure out what kind 
of action (signup versus cancellation) 
the remaining transactions are about.
If there are trial signup and cancellation, 
please make your own judgment as to how 
to treat them.
"""
data['signal_type'] = data[['description']]
data['signal_type'] = data[['signal_type']].replace({'signal_type': signals}, regex = True)

In [26]:
# Save the processed dataset locally
save_file(config.outputs['local']['fullpath'], data.to_csv())

**⭣ This is the local link to the processed file: ⭣**

In [27]:
# Output Local URL
output(config.outputs['local']['fullpath'])

'/home/jupyter/data/processed_data.csv'


In [28]:
"""
Save a copy of the processed dataset 
to Google. Use IAM roles for authentication.

Requirement
Output data is accessible from a common 
cloud storage (i.e. AWS S3 or Google Storage)
"""
from google.cloud import storage

client = storage.Client()
bucket = client.get_bucket(config.outputs['cloud']['bucket_name'])
blob = bucket.blob(config.outputs['local']['filename'])
blob.upload_from_filename(config.outputs['local']['fullpath'])


**⭣ This is the Cloud URL to the processed file: ⭣**

In [29]:
# Output Cloud URL
output(blob.public_url)

'https://storage.googleapis.com/antenna-task/processed_data.csv'


In [30]:
# print(data.to_string())