In [6]:
import pandas as pd
import re
from collections import Counter, OrderedDict
import numpy as np
from sklearn.preprocessing import LabelEncoder
# from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# lemma = WordNetLemmatizer()
# from sklearn.feature_extraction.text import CountVectorizer

# Merchants data preprocessing

In [7]:
# Load the raw merchants table. reset_index() moves the stored index into a regular
# column (presumably `merchant_abn`, which later cells read as a column - TODO confirm).
merchants_df = pd.read_parquet('../data/tables/tbl_merchants.parquet').reset_index()

In [8]:
# Show the raw `tags` string of the first row to see the format that has to be parsed.
merchants_df.head(1).loc[0, 'tags']

'((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))'

### Remove invalid ABNs
All valid ABNs have length 11, so only rows with an 11-character `merchant_abn` are kept.

In [9]:
# Keep only the merchants whose ABN, written out as a string, is exactly 11 characters long.
abn_length = merchants_df['merchant_abn'].astype(str).str.len()
merchants_df = merchants_df.loc[abn_length == 11]

In [10]:
# First separate the raw tag text into 3 features, each wrapped in () or [].
# Inside a character class `|` is a literal, not alternation, so the classes below
# list only the bracket characters themselves.
_TAGS_PATTERN = re.compile(
    r'[(\[][(\[](.*)[)\]],\s[(\[](.*)[)\]],\s[(\[](.*)[)\]][)\]]'
)


def separate_tags(row):
    """Split ``row['tags']`` into ``feature_1``, ``feature_2`` and ``feature_3``.

    The tags text is expected to look like ``((a), (b), (c))``, where every
    bracket may be either round or square. The three inner groups are stored
    on ``row`` under the keys ``feature_1`` .. ``feature_3``.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``)
        Must provide a string under the key ``'tags'``. It is modified in place.

    Returns
    -------
    The same ``row`` object with the three feature keys added.

    Raises
    ------
    ValueError
        If ``row['tags']`` does not contain the expected three-group layout.
    """
    match = _TAGS_PATTERN.search(row['tags'])
    if match is None:
        # Fail loudly with the offending text instead of an opaque IndexError.
        raise ValueError(f"Unrecognised tags format: {row['tags']!r}")
    row['feature_1'], row['feature_2'], row['feature_3'] = match.groups()
    return row
# Run separate_tags on every row so each merchant gains feature_1..feature_3.
merchants_df = merchants_df.apply(separate_tags, axis='columns')

In [11]:
# Convert the take rate feature (feature_3) to float type.
def get_take_rate(x):
    """Extract the number from a ``'take rate: <number>'`` string, divided by 100.

    Parameters
    ----------
    x : str
        Text containing ``take rate: `` followed by a non-negative number,
        with or without a decimal part (e.g. ``'take rate: 0.18'``).

    Returns
    -------
    float
        The parsed number divided by 100 (``'take rate: 0.18'`` -> ``0.0018``).

    Raises
    ------
    ValueError
        If ``x`` contains no ``take rate: <number>`` fragment.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    match = re.search(r'take rate: (\d+(?:\.\d+)?)', x)
    if match is None:
        raise ValueError(f"No take rate found in {x!r}")
    return float(match.group(1)) / 100

# Parse the take-rate text into a number, then give the three parsed features
# descriptive names and discard the raw `tags` column they were extracted from.
take_rates = merchants_df['feature_3'].apply(get_take_rate)
merchants_df['feature_3'] = take_rates

feature_names = {
    'feature_1': 'category',
    'feature_2': 'revenue_level',
    'feature_3': 'take_rate',
}
merchants_df = merchants_df.rename(columns=feature_names).drop(columns='tags')

In [12]:
# 971 Unique instances of the unpreprocessed tags
# This cell turns 'category' from one free-text string into a list of clean labels.
# (A lemmatising step was tried at some point and is left disabled.)

# Lower-case the text and treat the word ' and ' as just another comma separator.
and_split = merchants_df['category'].str.lower().str.split(' and ')
merchants_df['category'] = [', '.join(pieces) for pieces in and_split]

# Break on commas, trim each piece, squeeze runs of whitespace into one space,
# and drop pieces that end up empty (e.g. ", and" leaves two adjacent commas).
new_category = []
for pieces in merchants_df['category'].str.split(','):
    tidied = (re.sub(r'\s{2,}', ' ', piece.strip()) for piece in pieces)
    new_category.append([label for label in tidied if label])

merchants_df['category'] = new_category

In [13]:
# Flatten every merchant's label list into one long list, report how many labels
# there are in total, and tally how often each label occurs.
merged_categories = [
    label
    for labels in merchants_df['category']
    for label in labels
]
print(len(merged_categories))
C = Counter(merged_categories)
# print(len(C))
# print(C)

11731


In [14]:
# Give every distinct category list an integer id, using the list's string form as
# the label. The id is the key used to join against the segments table further down.
le = LabelEncoder()
category_as_text = merchants_df['category'].astype(str)
merchants_df['category_indexed'] = le.fit_transform(category_as_text)

### Assigning Segments

In [15]:
# Load the curated category -> segment lookup and key it by the encoded category id.
segments_df = pd.read_csv('../data/curated/segments.csv').set_index('category_indexed')
segments_df.head(25)

Unnamed: 0_level_0,category,segment
category_indexed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"['antique shops - sales', 'repairs', 'restorat...",Home
1,"['art dealers', 'galleries']",Art
2,"['artist supply', 'craft shops']",Art
3,"['bicycle shops - sales', 'service']",Home
4,"['books', 'periodicals', 'newspapers']",Media and Technology
5,"['cable', 'satellite', 'other pay television',...",Media and Technology
6,"['computer programming', 'data processing', 'i...",Media and Technology
7,"['computers', 'computer peripheral equipment',...",Media and Technology
8,"['digital goods: books', 'movies', 'music']",Media and Technology
9,"['equipment', 'tool', 'furniture', 'appliance ...",Home


In [17]:
# Row/column count of the merchants table before the segments are joined on.
merchants_df.shape

(4026, 6)

In [18]:
# Look up each merchant's segment by its encoded category id. Both tables carry a
# `category` column, so the join suffixes them; only the merchant-side copy is kept
# and it is given back its plain name.
joined = merchants_df.join(
    segments_df,
    on='category_indexed',
    how='left',
    lsuffix='_merchant',
    rsuffix='_segment',
)
output_columns = ['merchant_abn','name','category_merchant','revenue_level','take_rate','category_indexed','segment']
merchants_segments_df = joined[output_columns].rename(columns={'category_merchant': 'category'})
merchants_segments_df.head(10)

Unnamed: 0,merchant_abn,name,category,revenue_level,take_rate,category_indexed,segment
0,10023283211,Felis Limited,"[furniture, home furnishings, equipment shops,...",e,0.0018,11,Home
1,10142254217,Arcu Ac Orci Corporation,"[cable, satellite, other pay television, radio...",b,0.0422,5,Media and Technology
2,10165489824,Nunc Sed Company,"[jewelry, watch, clock, silverware shops]",b,0.044,15,Accessories
3,10187291046,Ultricies Dignissim Lacus Foundation,"[watch, clock, jewelry repair shops]",b,0.0329,24,Accessories
4,10192359162,Enim Condimentum PC,"[music shops - musical instruments, pianos, sh...",a,0.0633,18,Art
5,10206519221,Fusce Company,"[gift, card, novelty, souvenir shops]",a,0.0634,12,Miscellaneous
6,10255988167,Aliquam Enim Incorporated,"[computers, computer peripheral equipment, sof...",b,0.0432,7,Media and Technology
7,10264435225,Ipsum Primis Ltd,"[watch, clock, jewelry repair shops]",c,0.0239,24,Accessories
8,10279061213,Pede Ultrices Industries,"[computer programming, data processing, integr...",a,0.0571,6,Media and Technology
9,10323485998,Nunc Inc.,"[furniture, home furnishings, equipment shops,...",a,0.0661,11,Home


In [19]:
# Row/column count after the left join with the segments table.
merchants_segments_df.shape

(4026, 7)

Export dataframe as parquet

In [20]:
# Write the merchants-with-segments table to the curated folder; index = False leaves
# the DataFrame index out of the file.
merchants_segments_df.to_parquet('../data/curated/merchants.parquet', index = False)

In [21]:
# Read the file that was just exported back in (presumably a round-trip check of the
# export - confirm against how merchants2_df is used afterwards).
merchants2_df = pd.read_parquet('../data/curated/merchants.parquet')