### Pyspark Example
Data wrangling and training of a binary classifier that predicts a pairwise similarity score for a record-linkage model.

In [1]:
import pyspark

from pyspark.sql.types import *
from pyspark.context import SparkContext
from pyspark.sql import Window
from pyspark.sql import SQLContext
from pyspark.sql.functions import col
from pyspark.sql.functions import first
from pyspark.sql.functions  import date_format
from pyspark.sql.functions import lit,StringType

from pyspark.sql.functions import row_number,udf,trim, upper, to_date, substring, length, min, when, format_number, dayofmonth, hour, dayofyear,  month, year, weekofyear, date_format, unix_timestamp
from pyspark import SparkConf
from pyspark.sql.functions import coalesce
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import UserDefinedFunction
import datetime
from pyspark.sql.functions import year
from pyspark.sql.functions import datediff,coalesce,lag
from pyspark.sql.functions import when, to_date
from pyspark.sql.functions import date_add
from pyspark.sql.functions import UserDefinedFunction

import traceback
import sys
import time
import math
import datetime


In [2]:
# Name used for the Spark application and, later, for the saved-model directory.
test = 'pairwise_similarity'

conf = pyspark.SparkConf()

# Build (or reuse) the session with whole-stage code generation switched off.
spark = (
    SparkSession.builder
    .appName(test)
    .config('spark.sql.codegen.wholeStage', False)
    .getOrCreate()
)

# Legacy entry points; the data-loading cell reads through `sqlContext`.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)





In [3]:
# reading in data
# Loads the sample product-name CSV, using the first line as the column names.
# NOTE(review): no schema / inferSchema option is passed, so column types are
# whatever the CSV reader defaults to (presumably all strings — confirm).

productnames_df=sqlContext.read.csv('./sample_30perc.csv',header=True)



In [4]:
# Preview the loaded rows without truncating long product strings.
productnames_df.show(truncate=False)

+---------+---------------------------------------------------------+---+
|source_id|product                                                  |id |
+---------+---------------------------------------------------------+---+
|5        |display Samsung UE-48H6270 LED Televizyon *              |11 |
|1        |BOH SAMSUNG G850 GALAXY  final                           |42 |
|9        |BOH Stdr1000201 25 1tb Bp Usb 3.0 GUMUS #                |33 |
|13       |sample Samsung G950 Fiyatı #                             |7  |
|18       | Samsung G150F Galaxy Alpha Cep Telefonu #               |84 |
|1        | SAMSUNG G850 GALAXY  final                              |88 |
|16       |$$ LG 42LB670V LED TV #                                  |63 |
|6        |$$ Philips 185inç 193V5LSB2/62 5ms Led Monitör final     |31 |
|17       | Samsung SM-T800 TABS 10.5 White Tablet                  |65 |
|2        |BOH HP Pavilion 11-n000nt Pentium N3540 4GB  *           |44 |
|6        |BOH Philips 185inç 193V5LSB

In [8]:

# create blocking_key
# Every record receives the same constant key 'A', so the later self-join on
# blocking_key compares each record with every record in the data set.

source_id_df3=productnames_df.withColumn('blocking_key',lit('A'))



# dev/testing
# Uncomment to work on a small cached sample while developing.
# source_id_df3=source_id_df3.sample(0.0010).cache()

def token_create(rwdf, n_tokens=2, stop_words=None):
    """Add a numeric token and the leading cleaned word tokens of ``product``.

    Steps:
      1. ``product_num``: the first run of digits found in ``product``.
      2. Trim and lower-case ``product``, then strip every character that is
         not an ASCII letter or whitespace.
      3. Tokenize, remove the stop words and drop empty tokens.
      4. Expose the first ``n_tokens`` remaining tokens as the columns
         ``product1`` .. ``product<n_tokens>``; missing positions become "".

    Parameters
    ----------
    rwdf : pyspark.sql.DataFrame
        Input frame; must contain a column named ``product``.
    n_tokens : int, optional
        How many leading tokens to expose as columns (default 2).
    stop_words : list of str, optional
        Tokens to discard. Defaults to the marker words used by this notebook.

    Returns
    -------
    pyspark.sql.DataFrame
        ``rwdf`` with ``product_num`` and ``product1`` .. ``product<n_tokens>``
        appended; all intermediate token columns are dropped.
    """
    # Everything the function needs is imported here so it does not depend on
    # names that happen to be in the notebook's global namespace.
    from pyspark.ml.feature import StopWordsRemover, Tokenizer
    from pyspark.sql.functions import (array_remove, coalesce, col, lit, lower,
                                       regexp_extract, regexp_replace, size, trim)

    if stop_words is None:
        stop_words = ['$$', 'new', 'sample', 'display', 'boh', 'final']

    with_num = rwdf.withColumn("product_num", regexp_extract("product", "([0-9]+)", 1))

    # removing punctuation and converting to lower case
    # NOTE(review): the pattern keeps only ASCII letters and whitespace, so it
    # also strips digits and non-ASCII letters — confirm this is intended.
    cleaned = with_num.withColumn(
        'input_product',
        regexp_replace(lower(trim(col('product'))), "[^a-zA-Z\\s]", ""),
    )

    # Tokenize text
    tokenized = Tokenizer(inputCol='input_product', outputCol='tokens').transform(cleaned)

    # Remove stop words, then the empty tokens left behind by stripped characters
    remover = StopWordsRemover(inputCol='tokens', outputCol='tokens_clean',
                               stopWords=list(stop_words))
    with_tokens = remover.transform(tokenized).withColumn(
        'tokens_full', array_remove(col('tokens_clean'), '')
    )

    # Only index into the array when the position exists, so short arrays give
    # "" instead of raising when ANSI SQL mode is enabled.
    token_cols = [
        coalesce(
            when_present,
            lit(""),
        ).alias('product{}'.format(position + 1))
        for position, when_present in (
            (i, _guarded_item(col, size, i)) for i in range(n_tokens)
        )
    ]

    return (
        with_tokens
        .select("*", *token_cols)
        .drop('input_product', 'tokens', 'tokens_clean', 'tokens_full')
    )


def _guarded_item(col, size, index):
    """Return ``tokens_full[index]`` as a column, or null when out of range."""
    from pyspark.sql.functions import when

    tokens = col('tokens_full')
    return when(size(tokens) > index, tokens.getItem(index))

from pyspark.sql.functions import length,levenshtein

# creating label features
source_id_df4 = token_create(source_id_df3)


# cartesian join issue hack
spark.conf.set("spark.sql.crossJoin.enabled", True)


# create pairs
# Self-join on the blocking key: each record ("a" -> *_L) is paired with every
# record ("b" -> *_R) that shares its key.
# NOTE(review): there is no filter on id, so self-pairs (id_L == id_R) and both
# orderings of every pair are produced — confirm this is wanted for training.
label_df = (
    source_id_df4.alias('a')
    .join(
        source_id_df4.alias('b'),
        col('a.blocking_key') == col('b.blocking_key'),
        how='inner',
    )
    .select(
        col('a.product').alias('product_L'),
        col('a.product_num').alias('product_num_L'),
        col('a.product1').alias('product1_L'),
        col('a.product2').alias('product2_L'),
        col('a.source_id').alias('label_L'),
        col('a.id').alias('id_L'),
        col('b.product').alias('product_R'),
        col('b.product_num').alias('product_num_R'),
        col('b.product1').alias('product1_R'),
        col('b.product2').alias('product2_R'),
        col('b.source_id').alias('label_R'),
        col('b.id').alias('id_R'),
    )
)

# Target: 1 when both sides carry the same source_id, otherwise 0.
match_df = label_df.withColumn(
    'label_target',
    when(col('label_L') == col('label_R'), 1).otherwise(0),
)

#### ADD ANY NEW STRING FEATURES HERE TO CALCULATE EDIT DISTANCE ####

col_list=['product',
          'product1',
          'product2']

# For each feature: Levenshtein distance between the L and R values divided by
# the length of the L value. Rows where that ratio is undefined (L is null or
# empty) fall back to 0.9999.
match_df2 = match_df
for name in col_list:
    left_value = coalesce(col(name + '_L'), lit('N/A'))
    right_value = coalesce(col(name + '_R'), lit('N/A'))
    left_length = length(col(name + '_L'))
    # Divide only when the denominator is positive, so the expression never
    # divides by zero (which raises when ANSI SQL mode is enabled).
    ratio = when(left_length > 0, levenshtein(left_value, right_value) / left_length)
    match_df2 = match_df2.withColumn(name, coalesce(ratio, lit(0.9999)))

### ADD CATEGORICAL OR BINARY FEATURES HERE TO STRING INDEX ###

# Keep the distance features, the raw numeric tokens, the target and the ids.
match_df2 = match_df2.select(
    *(col_list + ['product_num_L', 'product_num_R', 'label_target', 'id_L', 'id_R'])
)


# Train a binary classifier to predict pairwise matches.
# NOTE: fuzzy-match scores (e.g. Levenshtein distance) are calculated between
# the L and R features and fed to the binary classifier as its input features.

# String-index some columns.

def add_string_index(df,index_cols):
    """Replace each column in ``index_cols`` with its StringIndexer encoding.

    A StringIndexer is fitted on ``df`` for every listed column, the fitted
    stages are applied through a Pipeline, the original columns are dropped
    and each ``<name>_index`` column is renamed back to ``<name>``. The
    re-encoded columns therefore end up after the untouched columns.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame containing every column named in ``index_cols``.
    index_cols : list of str
        Names of the columns to index.

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` with the listed columns replaced by their index values.
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer

    suffix = "_index"

    # Fit one indexer per column up front, exactly as many fits as columns.
    fitted_stages = []
    for name in index_cols:
        indexer = StringIndexer(inputCol=name, outputCol=name + suffix)
        fitted_stages.append(indexer.fit(df))

    indexed = Pipeline(stages=fitted_stages).fit(df).transform(df)

    # Drop the raw columns, then give each index column the original name.
    kept_names = [name for name in indexed.columns if name not in index_cols]
    result = indexed[kept_names]
    for name in index_cols:
        result = result.withColumnRenamed(name + suffix, name)

    return result


from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier,DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

### ADD STRING OR CATEGORICAL FEATURES HERE TO STRING INDEX ###
match_df2 = add_string_index(
    df=match_df2,
    index_cols=['product_num_L', 'product_num_R'],
)

# Every column except the target and the two record ids is a model feature.
features = [
    name for name in match_df2.columns
    if name not in ('label_target', 'id_L', 'id_R')
]

# Pack the feature columns into the single vector column the classifier reads.
assembler = VectorAssembler(inputCols=features, outputCol="features")
output = assembler.transform(match_df2)

# NOTE: keeping id_L, id_R to later join on to get similarity score result
final_data = output.select('features', 'label_target', 'id_L', 'id_R')

# Gradient-boosted trees classifier; it is only configured here, not fitted.
sim_score = GBTClassifier(
    labelCol='label_target',
    featuresCol='features',
    maxIter=10,
    maxBins=3000,
    maxDepth=6,
)






In [9]:
# fit model

# A fixed seed keeps training consistent between runs; the setter hands back
# the classifier, so seeding and fitting are chained.
sim_score_model = sim_score.setSeed(123).fit(final_data)


# save trained model for future use (overwrites a model already at that path)
sim_score_model.write().overwrite().save('./'+test+'_model/')