In [151]:
from pyspark.sql import functions as F, SparkSession

import os
os.sys.path.append("../")
from scripts.consumer_model import *

from pyspark.sql import functions as F
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

In [152]:
# Start (or reuse) the Spark session used by the rest of this notebook.
# All tunables live in one table so they are easy to scan and adjust.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
    "spark.executor.memory": "2g",
}

session_builder = SparkSession.builder.appName("segments")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [153]:
# Load the curated merchant information table from parquet.
merchants_info = spark.read.format("parquet").load('../data/curated/merchant_info.parquet')

In [154]:
# Inspect the column names and types of the loaded merchant table.
merchants_info.printSchema()

root
 |-- name: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- category: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: double (nullable = true)



Before we do any tokenization, let's see whether any merchants share the same category description. This would help us categorise their industries more easily and quickly.

In [155]:
# Count the rows per raw (uncleaned) category string.
distinct_category = merchants_info.groupBy("category").count()

In [156]:
# Print up to 100 category strings with their counts, without truncating long descriptions.
distinct_category.show(100, truncate=False)

+---------------------------------------------------------------------------------------+-----+
|category                                                                               |count|
+---------------------------------------------------------------------------------------+-----+
|stationery, office  supplies and printing and writing paper                            |2    |
|health  and beauty spas                                                                |1    |
|opticians, optical goods, and eyeglasses                                               |145  |
|books, periodicals, and  newspapers                                                    |5    |
|watch, clock, and jewelry repair shops                                                 |159  |
|computer programming , data processing, and integrated systems design services         |182  |
|jewelry, watch, clock, and  silverware shops                                           |2    |
|opticians, optical  goods, and eyeglass

# Data preprocessing

In [157]:
# Lower-cased copy of the raw category text; this is what gets tokenised.
category_lowercased = F.lower(F.col("category"))

# Tokeniser: splits `category_clean` on the "\W" pattern into `tokens`.
tokenizer = RegexTokenizer(pattern="\\W", inputCol="category_clean", outputCol="tokens")

# Stop-word filter: copies `tokens` into `filtered_tokens` minus stop words.
stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

# Apply the three steps in order: lowercase -> tokenise -> remove stop words.
merchants_info = stopwords_remover.transform(
    tokenizer.transform(
        merchants_info.withColumn("category_clean", category_lowercased)
    )
)

In [158]:
# Count the rows per cleaned token list, to see how many distinct
# descriptions remain after lowercasing, tokenising and stop-word removal.
preprocessing_distinct_category = merchants_info.groupBy("filtered_tokens").count()
preprocessing_distinct_category.show(50,truncate=False)

+-----------------------------------------------------------------------------------+-----+
|filtered_tokens                                                                    |count|
+-----------------------------------------------------------------------------------+-----+
|[lawn, garden, supply, outlets, including, nurseries]                              |153  |
|[tent, awning, shops]                                                              |178  |
|[hobby, toy, game, shops]                                                          |142  |
|[equipment, tool, furniture, appliance, rent, al, leasing]                         |134  |
|[shoe, shops]                                                                      |185  |
|[furniture, home, furnishings, equipment, shops, manufacturers, except, appliances]|182  |
|[watch, clock, jewelry, repair, shops]                                             |170  |
|[artist, supply, craft, shops]                                                 

# Text classification

In [159]:
# Manually classify every cleaned category description into one of 5 segments.
# Keys are the segment labels; each value lists the exact `filtered_tokens`
# arrays (category text after lowercasing, tokenising and stop-word removal)
# that belong to that segment. A token list must appear exactly once overall.
segments_dict = {
    "Computers, Electronics, and Office Supplies": [
        ['computers', 'computer', 'peripheral', 'equipment', 'software'],
        ['computer', 'programming', 'data', 'processing', 'integrated', 'systems', 'design', 'services'],
        ['telecom'],
        ['cable', 'satellite', 'pay', 'television', 'radio', 'services'],
        ['stationery', 'office', 'supplies', 'printing', 'writing', 'paper']
    ],
    "Home, Garden, and Furnishings": [
        ['florists', 'supplies', 'nursery', 'stock', 'flowers'],
        ['lawn', 'garden', 'supply', 'outlets', 'including', 'nurseries'],
        ['equipment', 'tool', 'furniture', 'appliance', 'rent', 'al', 'leasing'],
        ['furniture', 'home', 'furnishings', 'equipment', 'shops', 'manufacturers', 'except', 'appliances']
    ],
    "Books, Media, Arts, Crafts, and Hobbies": [
        ['books', 'periodicals', 'newspapers'],
        ['hobby', 'toy', 'game', 'shops'],
        ['artist', 'supply', 'craft', 'shops'],
        ['digital', 'goods', 'books', 'movies', 'music'],
        ['music', 'shops', 'musical', 'instruments', 'pianos', 'sheet', 'music'],
        ['art', 'dealers', 'galleries'],
        ['gift', 'card', 'novelty', 'souvenir', 'shops']
    ],
    "Fashion, Personal Accessories, Health, and Beauty": [
        ['opticians', 'optical', 'goods', 'eyeglasses'],
        ['health', 'beauty', 'spas'],
        ['shoe', 'shops'],
        ['watch', 'clock', 'jewelry', 'repair', 'shops'],
        ['jewelry', 'watch', 'clock', 'silverware', 'shops']
    ],
    "Vehicles, Repairs, and Miscellaneous Services": [
        ['motor', 'vehicle', 'supplies', 'new', 'parts'],
        ['antique', 'shops', 'sales', 'repairs', 'restoration', 'services'],
        ['bicycle', 'shops', 'sales', 'service'],
        # NOTE(review): this token list was previously ALSO listed under
        # "Home, Garden, and Furnishings". The segment column is built so that
        # the last matching condition wins, so these merchants were already
        # being labelled with this segment; the shadowed duplicate was removed
        # to keep the labels unchanged. Confirm this is the intended segment.
        ['tent', 'awning', 'shops']
    ]
}


def _check_unique_token_lists(mapping):
    """Raise ValueError if any token list is assigned more than once.

    A token list that appears under two segments (or twice in one segment)
    makes the resulting label depend on evaluation order, so fail loudly
    instead of silently picking one.
    """
    owner_of = {}
    for segment_name, token_lists in mapping.items():
        for token_list in token_lists:
            key = tuple(token_list)
            if key in owner_of:
                raise ValueError(
                    f"Token list {token_list} is listed under both "
                    f"{owner_of[key]!r} and {segment_name!r}"
                )
            owner_of[key] = segment_name


_check_unique_token_lists(segments_dict)

# One (condition, label) pair per token list. Both sides are cast to string so
# the whole array is compared in a single equality test.
segment_conditions = [
    (
        F.col("filtered_tokens").cast("string")
        == F.array([F.lit(token) for token in token_list]).cast("string"),
        F.lit(segment_name),
    )
    for segment_name, token_lists in segments_dict.items()
    for token_list in token_lists
]

In [160]:
# Build the 'segments' label column from the (condition, label) pairs.
# Pairs are tested from the last one defined to the first, so if a row
# satisfies several conditions the pair defined last supplies the label.
# Rows that satisfy no condition get a null segment.
pairs_last_first = list(reversed(segment_conditions))

segments_column = F.lit(None)
if pairs_last_first:
    is_match, label = pairs_last_first[0]
    segments_column = F.when(is_match, label)
    for is_match, label in pairs_last_first[1:]:
        segments_column = segments_column.when(is_match, label)

merchants_info = merchants_info.withColumn("segments", segments_column)


In [161]:
# Spot-check the first 3 rows, including the intermediate text columns and the new segment label.
merchants_info.show(3, truncate=False)

+------------------------+------------+------------------------------------------------------+-------------+---------+------------------------------------------------------+-----------------------------------------------------------+------------------------------------------------------+-------------------------------------------+
|name                    |merchant_abn|category                                              |revenue_level|take_rate|category_clean                                        |tokens                                                     |filtered_tokens                                       |segments                                   |
+------------------------+------------+------------------------------------------------------+-------------+---------+------------------------------------------------------+-----------------------------------------------------------+------------------------------------------------------+-------------------------------------------+
|

In [162]:
# Number of merchants in each segment. A row with a null segment here would
# mean some token lists matched none of the entries in `segments_dict`.
num_merchants = merchants_info.groupBy("segments").count()
num_merchants.show(500, truncate=False)

+-------------------------------------------------+-----+
|segments                                         |count|
+-------------------------------------------------+-----+
|Home, Garden, and Furnishings                    |649  |
|Computers, Electronics, and Office Supplies      |833  |
|Books, Media, Arts, Crafts, and Hobbies          |1155 |
|Vehicles, Repairs, and Miscellaneous Services    |628  |
|Fashion, Personal Accessories, Health, and Beauty|761  |
+-------------------------------------------------+-----+



# Save merchants with their segments

In [163]:
# Keep the original merchant attributes plus the segment label; the
# intermediate text-processing columns are not carried into the output.
output_columns = ("name", "merchant_abn", "category", "revenue_level", "take_rate", "segments")
merchants_info = merchants_info.select(*output_columns)

In [164]:
# Persist the segmented merchants, replacing any previous output at this path.
merchants_info.write.mode("overwrite").parquet("../data/curated/segmented_merchants_info.parquet")