In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local[*]").setAppName("My App")
sc = SparkContext.getOrCreate(conf = conf)
spark=SparkSession.builder.appName('myApp').getOrCreate()

In [2]:
from pyspark.sql.types import StructType,FloatType,IntegerType,BooleanType

schema = StructType() \
      .add("id_1",IntegerType(),True) \
      .add("id_2",IntegerType(),True) \
      .add("cmp_fname_c1",FloatType(),True) \
      .add("cmp_fname_c2",FloatType(),True) \
      .add("cmp_lname_c1",FloatType(),True) \
      .add("cmp_lname_c2",FloatType(),True) \
      .add("cmp_sex",IntegerType(),True) \
      .add("cmp_bd",IntegerType(),True) \
      .add("cmp_bm",IntegerType(),True) \
      .add("cmp_by",IntegerType(),True) \
      .add("cmp_plz",IntegerType(),True) \
      .add("is_match",BooleanType(),False)
     

In [3]:
files=[f'./data/block_{id}.csv' for id in range(1,11)]

In [4]:
df=spark.read.csv(files,header=True
                  ,schema=schema)

In [5]:
df.groupBy('cmp_plz').count().show()

+-------+-------+
|cmp_plz|  count|
+-------+-------+
|   null|  12843|
|      1|  31714|
|      0|5704575|
+-------+-------+



In [6]:
df.count()

5749132

In [7]:
df.printSchema()

root
 |-- id_1: integer (nullable = true)
 |-- id_2: integer (nullable = true)
 |-- cmp_fname_c1: float (nullable = true)
 |-- cmp_fname_c2: float (nullable = true)
 |-- cmp_lname_c1: float (nullable = true)
 |-- cmp_lname_c2: float (nullable = true)
 |-- cmp_sex: integer (nullable = true)
 |-- cmp_bd: integer (nullable = true)
 |-- cmp_bm: integer (nullable = true)
 |-- cmp_by: integer (nullable = true)
 |-- cmp_plz: integer (nullable = true)
 |-- is_match: boolean (nullable = true)



# Missing Values

### Drop Missing Values


In [8]:
miss_df=df.drop('id_1','id_2')

اگر همه داده های گم شده را حذف کنیم، کلا 20 رکورد باقی می‌ ماند !


In [9]:
miss_df.replace('?',None).na.drop().count()

20

In [10]:
miss_df=miss_df.replace('?',None)

In [11]:
miss_df.head(3)

[Row(cmp_fname_c1=0.8333333134651184, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=0, is_match=True),
 Row(cmp_fname_c1=1.0, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=True),
 Row(cmp_fname_c1=1.0, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=True)]

هیچ رکوردی که همه یا حداقل 2 تا از متغیرهای آن گم شده باشد، وجود ندارد

In [12]:
miss_df.na.drop(how='all').count()

5749132

In [13]:
miss_df.na.drop(how='any',thresh=2).count()

5749132

### Fill the Missing Values

In [14]:
from pyspark.ml.feature import Imputer
# for float variables
float_cols=[
    'cmp_fname_c1', 
    'cmp_fname_c2', 
    'cmp_lname_c1', 
    'cmp_lname_c2', 
]
float_imputer = Imputer(
    inputCols=float_cols,
    outputCols=[f"{col}_imputed" for col in float_cols]
).setStrategy('mean')

# for binary variables
binary_cols=[
    'cmp_sex', 
    'cmp_bd', 
    'cmp_bm', 
    'cmp_by',
    'cmp_plz',
]
binary_imputer = Imputer(
    inputCols=binary_cols,
    outputCols=[f"{col}_imputed" for col in binary_cols]
).setStrategy('mode')



In [15]:
imputed_df=float_imputer.fit(miss_df).transform(miss_df)


In [16]:
final_imputed_df=binary_imputer.fit(imputed_df).transform(imputed_df)

In [17]:
final_imputed_df.show()

+------------+------------+------------+------------+-------+------+------+------+-------+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+
|cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|cmp_fname_c1_imputed|cmp_fname_c2_imputed|cmp_lname_c1_imputed|cmp_lname_c2_imputed|cmp_sex_imputed|cmp_bd_imputed|cmp_bm_imputed|cmp_by_imputed|cmp_plz_imputed|
+------------+------------+------------+------------+-------+------+------+------+-------+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+
|   0.8333333|        null|         1.0|        null|      1|     1|     1|     1|      0|    true|           0.8333333|           0.9000177|                 1.0|          0.31841284|              1|             1|   

In [18]:
final_imputed_df.columns

['cmp_fname_c1',
 'cmp_fname_c2',
 'cmp_lname_c1',
 'cmp_lname_c2',
 'cmp_sex',
 'cmp_bd',
 'cmp_bm',
 'cmp_by',
 'cmp_plz',
 'is_match',
 'cmp_fname_c1_imputed',
 'cmp_fname_c2_imputed',
 'cmp_lname_c1_imputed',
 'cmp_lname_c2_imputed',
 'cmp_sex_imputed',
 'cmp_bd_imputed',
 'cmp_bm_imputed',
 'cmp_by_imputed',
 'cmp_plz_imputed']

In [19]:
prep_df=final_imputed_df.select([x for x in final_imputed_df.columns if '_imputed' in x or x=='is_match'])

In [20]:
prep_df.describe().show()

+-------+--------------------+--------------------+--------------------+--------------------+------------------+-------------------+-------------------+-------------------+-------------------+
|summary|cmp_fname_c1_imputed|cmp_fname_c2_imputed|cmp_lname_c1_imputed|cmp_lname_c2_imputed|   cmp_sex_imputed|     cmp_bd_imputed|     cmp_bm_imputed|     cmp_by_imputed|    cmp_plz_imputed|
+-------+--------------------+--------------------+--------------------+--------------------+------------------+-------------------+-------------------+-------------------+-------------------+
|  count|             5749132|             5749132|             5749132|             5749132|           5749132|            5749132|            5749132|            5749132|            5749132|
|   mean|  0.7129024717725752|  0.9000176786244445|  0.3156278224158526| 0.31841284036351114| 0.955001381078048|0.22443422763645016| 0.4887876987343481|0.22271779461664823|0.00551631098398854|
| stddev| 0.38872431001387875|  0.0

In [21]:
prep_df.columns

['is_match',
 'cmp_fname_c1_imputed',
 'cmp_fname_c2_imputed',
 'cmp_lname_c1_imputed',
 'cmp_lname_c2_imputed',
 'cmp_sex_imputed',
 'cmp_bd_imputed',
 'cmp_bm_imputed',
 'cmp_by_imputed',
 'cmp_plz_imputed']

In [22]:
from pyspark.ml.feature import FeatureHasher,OneHotEncoder,VectorAssembler,StringIndexer

In [23]:
prep_df.withColumn('label',prep_df['is_match'].cast('integer')).show(3,truncate=False)


+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
|is_match|cmp_fname_c1_imputed|cmp_fname_c2_imputed|cmp_lname_c1_imputed|cmp_lname_c2_imputed|cmp_sex_imputed|cmp_bd_imputed|cmp_bm_imputed|cmp_by_imputed|cmp_plz_imputed|label|
+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
|true    |0.8333333           |0.9000177           |1.0                 |0.31841284          |1              |1             |1             |1             |0              |1    |
|true    |1.0                 |0.9000177           |1.0                 |0.31841284          |1              |1             |1             |1             |1              |1    |
|true    |1.0                 |0.9000177           |1.0                 |0.31841284          |1              |

In [24]:
from pyspark.sql.functions import when, lit

In [25]:
rep_df = prep_df.withColumn('label', \
                   when(df['is_match']==True, 
    lit(1)).otherwise(0))

In [26]:
rep_df.show(1,truncate=False)

+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
|is_match|cmp_fname_c1_imputed|cmp_fname_c2_imputed|cmp_lname_c1_imputed|cmp_lname_c2_imputed|cmp_sex_imputed|cmp_bd_imputed|cmp_bm_imputed|cmp_by_imputed|cmp_plz_imputed|label|
+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
|true    |0.8333333           |0.9000177           |1.0                 |0.31841284          |1              |1             |1             |1             |0              |1    |
+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
only showing top 1 row



In [30]:
rep_df[rep_df['label']>0].show(3,truncate=False)

+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
|is_match|cmp_fname_c1_imputed|cmp_fname_c2_imputed|cmp_lname_c1_imputed|cmp_lname_c2_imputed|cmp_sex_imputed|cmp_bd_imputed|cmp_bm_imputed|cmp_by_imputed|cmp_plz_imputed|label|
+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
|true    |0.8333333           |0.9000177           |1.0                 |0.31841284          |1              |1             |1             |1             |0              |1    |
|true    |1.0                 |0.9000177           |1.0                 |0.31841284          |1              |1             |1             |1             |1              |1    |
|true    |1.0                 |0.9000177           |1.0                 |0.31841284          |1              |

In [31]:
features=set(rep_df.columns) - set(['label','is_match'])
assembler = VectorAssembler(inputCols=list(features),
                             outputCol='features')

# Consolidate predictor columns
assembled_df = assembler.transform(rep_df)
assembled_df.select('features', 'label').show(5, truncate=False)

+----------------------------------------------------------------------------------+-----+
|features                                                                          |label|
+----------------------------------------------------------------------------------+-----+
|[0.9000176787376404,1.0,1.0,0.0,0.3184128403663635,1.0,1.0,0.8333333134651184,1.0]|1    |
|[0.9000176787376404,1.0,1.0,1.0,0.3184128403663635,1.0,1.0,1.0,1.0]               |1    |
|[0.9000176787376404,1.0,1.0,1.0,0.3184128403663635,1.0,1.0,1.0,1.0]               |1    |
|[0.9000176787376404,1.0,1.0,1.0,0.3184128403663635,1.0,1.0,1.0,1.0]               |1    |
|[0.9000176787376404,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]                              |1    |
+----------------------------------------------------------------------------------+-----+
only showing top 5 rows



In [34]:
train, test = assembled_df.randomSplit([0.7, 0.3], seed=42)
train.count()

4025517

### ML Models

In [35]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics


In [36]:
from pyspark.ml.classification import DecisionTreeClassifier

# Create a classifier object and fit to the training data
tree = DecisionTreeClassifier()
tree_model = tree.fit(train)

# Create predictions for the testing data and take a look at the predictions
prediction = tree_model.transform(test)
prediction.select('label', 'prediction', 'probability').show(5, False)

+-----+----------+-----------------------------------------+
|label|prediction|probability                              |
+-----+----------+-----------------------------------------+
|0    |0.0       |[0.9999965769719209,3.423028079099333E-6]|
|0    |0.0       |[0.9999965769719209,3.423028079099333E-6]|
|0    |0.0       |[0.9999965769719209,3.423028079099333E-6]|
|0    |0.0       |[0.9999965769719209,3.423028079099333E-6]|
|0    |0.0       |[0.9999965769719209,3.423028079099333E-6]|
+-----+----------+-----------------------------------------+
only showing top 5 rows



In [37]:

def evaluate(pred):
    pred.groupBy('label', 'prediction').count().show()

    # Calculate the elements of the confusion matrix
    TN = pred.filter('prediction = 0 AND label = prediction').count()
    TP = pred.filter('prediction = 1 AND label = prediction').count()
    FN = pred.filter('prediction = 0 AND label = 1').count()
    FP = pred.filter('prediction = 1 AND label = 0').count()

    # Accuracy measures the proportion of correct predictions
    accuracy = (TN + TP) / (TN + TP + FN + FP)
    recall = (TP) / (TP+FN)
    precision= (TP) / (TP+FP)
    f1=2*(precision*recall)/(precision+recall)
    print('EVALUATION SUMMARY:')
    print(f'accuracy:{accuracy} \nprecision:{precision} \nrecall:{recall} \nf1-score:{f1}')


In [45]:
evaluate(prediction)

+-----+----------+-------+
|label|prediction|  count|
+-----+----------+-------+
|    1|       0.0|     33|
|    0|       0.0|1717323|
|    1|       1.0|   6166|
|    0|       1.0|     93|
+-----+----------+-------+

EVALUATION SUMMARY:
accuracy:0.9999268978281113 
precision:0.9851413963891995 
recall:0.9946765607356025 
f1-score:0.9898860170171776


In [40]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
logistic = LogisticRegression()
lr_pipeline = Pipeline(stages=[ logistic])
lr_model=lr_pipeline.fit(train)
lr_prediction=lr_model.transform(test)
lr_prediction.select('label', 'prediction', 'probability').show(5, False)

+-----+----------+-------------------------------------------+
|label|prediction|probability                                |
+-----+----------+-------------------------------------------+
|0    |0.0       |[0.9999999999783138,2.1686208384608108E-11]|
|0    |0.0       |[0.9999999999262756,7.372435995023352E-11] |
|0    |0.0       |[0.9999999997835478,2.1645218950538947E-10]|
|0    |0.0       |[1.0,0.0]                                  |
|0    |0.0       |[0.9999999999999998,2.220446049250313E-16] |
+-----+----------+-------------------------------------------+
only showing top 5 rows



In [41]:
evaluate(lr_prediction)

+-----+----------+-------+
|label|prediction|  count|
+-----+----------+-------+
|    0|       0.0|1717405|
|    1|       1.0|   6181|
|    0|       1.0|     11|
|    1|       0.0|     18|
+-----+----------+-------+

EVALUATION SUMMARY:
accuracy:0.9999831748969462 
precision:0.9982235142118863 
recall:0.9970963058557832 
f1-score:0.9976595916390929


In [42]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[ rf])
rf_model=pipeline.fit(train)
rf_predictions=rf_model.transform(test)
rf_predictions.select('label', 'prediction', 'probability').show(5, False)

+-----+----------+------------------------------------------+
|label|prediction|probability                               |
+-----+----------+------------------------------------------+
|0    |0.0       |[0.9999644889923797,3.5511007620311594E-5]|
|0    |0.0       |[0.9999720909893208,2.790901067927771E-5] |
|0    |0.0       |[0.9999644889923797,3.5511007620311594E-5]|
|0    |0.0       |[0.9999838082776951,1.6191722304915428E-5]|
|0    |0.0       |[0.9999838082776951,1.6191722304915428E-5]|
+-----+----------+------------------------------------------+
only showing top 5 rows



In [43]:
rf_predictions.columns

['is_match',
 'cmp_fname_c1_imputed',
 'cmp_fname_c2_imputed',
 'cmp_lname_c1_imputed',
 'cmp_lname_c2_imputed',
 'cmp_sex_imputed',
 'cmp_bd_imputed',
 'cmp_bm_imputed',
 'cmp_by_imputed',
 'cmp_plz_imputed',
 'label',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [44]:
evaluate(rf_predictions)

+-----+----------+-------+
|label|prediction|  count|
+-----+----------+-------+
|    1|       0.0|     73|
|    0|       0.0|1717414|
|    1|       1.0|   6126|
|    0|       1.0|      2|
+-----+----------+-------+

EVALUATION SUMMARY:
accuracy:0.9999564868024472 
precision:0.9996736292428199 
recall:0.9882239070817874 
f1-score:0.9939157945972257


In [None]:
# todo: implement area under ROC 

# predictionAndLabels = prediction.select('prediction','label').rdd \
#                             .map(lambda x: (x[0],x[1]))


# # Instantiate metrics object
# metrics = BinaryClassificationMetrics(predictionAndLabels)
# # Area under ROC curve
# print("Area under ROC = %s" % metrics.areaUnderROC)

### Cross Validation 

In [46]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [None]:
lr_cs = LogisticRegression()

In [None]:
from pyspark.ml.classification import LogisticRegression
lr_cs = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr_cs.regParam, [0.1, 0.01]) \
    .addGrid(lr_cs.fitIntercept, [False, True])\
    .addGrid(lr_cs.elasticNetParam, [0.0, 1.0])\
    .build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=lr_cs, estimatorParamMaps=grid, evaluator=evaluator)
cvModel = cv.fit(train)
lrprediction=cvModel.transform(test)



print('Accuracy:', evaluator.evaluate(lrprediction))
print('AUC:', BinaryClassificationMetrics(lrprediction['label','prediction'].rdd).areaUnderROC)

In [None]:
evaluate(lrprediction)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier()
grid = ParamGridBuilder() \
        .addGrid(dt.maxDepth,  [2, 5, 10, 20, 30]) \
        .addGrid(dt.maxBins,  [10, 20, 40, 80, 100]) \
        .build()
dtevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
cv = CrossValidator(estimator=dt, 
                    estimatorParamMaps=grid, 
                    evaluator=dtevaluator,
                    numFolds = 3)
dtModel = cv.fit(train)
dtpredictions = dtModel.transform(test)

print('Accuracy:', dtevaluator.evaluate(dtpredictions))
print('AUC:', BinaryClassificationMetrics(dtpredictions['label','prediction'].rdd).areaUnderROC)

In [None]:
evaluate(dtpredictions)