In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import StructType,IntegerType,FloatType,BooleanType,StringType
from pyspark.sql.functions import rand
# Spark configuration.
# All settings are placed on the SparkConf *before* the context is created:
# a running SparkContext works from its own copy of the configuration, so
# mutating `sc._conf` afterwards does not change the live context.
# Note: the key is `spark.driver.maxResultSize` (singular "Result"); a value of
# '0' removes the limit on the total size of results collected to the driver.
# NOTE(review): `getOrCreate` returns an already-running context if the kernel
# has one, in which case this configuration is ignored — restart the kernel
# after changing these values.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("My App")
    .set('spark.executor.memory', '15g')
    .set('spark.driver.memory', '15g')
    .set('spark.driver.maxResultSize', '0')
)
sc = SparkContext.getOrCreate(conf=conf)

# The session is built on top of the context created above.
spark = (
    SparkSession.builder
    .appName('myApp')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [2]:
def load_data(files,schema):
    """Read CSV file(s) into a Spark DataFrame.

    Uses the notebook-level ``spark`` session.

    Args:
        files: Path, or list of paths, handed to ``spark.read.csv``.
        schema: Schema applied to the files (passed through as ``schema=``).

    Returns:
        The DataFrame produced by ``spark.read.csv`` with ``header=True``.
    """
    reader_options = {"header": True, "schema": schema}
    return spark.read.csv(files, **reader_options)

def load_record_linkage_data():
    """Load the record-linkage block files into a single Spark DataFrame.

    Builds an explicit schema (two integer ids, four float name-comparison
    columns, five integer comparison columns and a boolean ``is_match``
    column) and reads ``./data/block_1.csv`` through ``./data/block_10.csv``
    via ``load_data``.

    Returns:
        Whatever ``load_data`` returns for those files and that schema.
    """
    # (column name, Spark type, nullable) in file column order.
    column_specs = [
        ("id_1", IntegerType(), True),
        ("id_2", IntegerType(), True),
        ("cmp_fname_c1", FloatType(), True),
        ("cmp_fname_c2", FloatType(), True),
        ("cmp_lname_c1", FloatType(), True),
        ("cmp_lname_c2", FloatType(), True),
        ("cmp_sex", IntegerType(), True),
        ("cmp_bd", IntegerType(), True),
        ("cmp_bm", IntegerType(), True),
        ("cmp_by", IntegerType(), True),
        ("cmp_plz", IntegerType(), True),
        # Declared non-nullable here, unlike every other column.
        ("is_match", BooleanType(), False),
    ]

    schema = StructType()
    for col_name, col_type, is_nullable in column_specs:
        schema.add(col_name, col_type, is_nullable)

    # Block files are numbered 1..10 inclusive.
    files = [f'./data/block_{block_no}.csv' for block_no in range(1, 11)]
    return load_data(files, schema=schema)

In [3]:
# Build the record-linkage DataFrame from the ten block CSV files.
df=load_record_linkage_data()

In [4]:
# Row count per distinct `cmp_plz` value, printed as a text table.
df.groupBy('cmp_plz').count().show()

+-------+-------+
|cmp_plz|  count|
+-------+-------+
|   null|  12843|
|      1|  31714|
|      0|5704575|
+-------+-------+



In [5]:
# Total number of rows in the loaded DataFrame.
df.count()

5749132

In [6]:
# Print each column's name, type and nullability as Spark reports them.
df.printSchema()

root
 |-- id_1: integer (nullable = true)
 |-- id_2: integer (nullable = true)
 |-- cmp_fname_c1: float (nullable = true)
 |-- cmp_fname_c2: float (nullable = true)
 |-- cmp_lname_c1: float (nullable = true)
 |-- cmp_lname_c2: float (nullable = true)
 |-- cmp_sex: integer (nullable = true)
 |-- cmp_bd: integer (nullable = true)
 |-- cmp_bm: integer (nullable = true)
 |-- cmp_by: integer (nullable = true)
 |-- cmp_plz: integer (nullable = true)
 |-- is_match: boolean (nullable = true)



In [7]:
# Draw a random sample (without replacement) and bring it to the driver as a
# pandas DataFrame. The seed is fixed so that re-running the notebook on the
# same data does not reshuffle the sample and silently change the statistics
# computed from it below.
# NOTE(review): `fraction` is a per-row inclusion probability, so the sample
# size is approximately, not exactly, 60% of the rows; `toPandas()` collects
# the whole sample into driver memory.
SAMPLE_FRACTION = 0.6
SAMPLE_SEED = 42
sampled_data = df.sample(
    withReplacement=False,
    fraction=SAMPLE_FRACTION,
    seed=SAMPLE_SEED,
).toPandas()

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
# of the sample; the `count` row differs between columns where values are missing.
sampled_data.describe()

Unnamed: 0,id_1,id_2,cmp_fname_c1,cmp_fname_c2,cmp_lname_c1,cmp_lname_c2,cmp_sex,cmp_bd,cmp_bm,cmp_by,cmp_plz
count,3448832.0,3448832.0,3448238.0,62390.0,3448832.0,1467.0,3448832.0,3448371.0,3448371.0,3448371.0,3441151.0
mean,33317.63,66579.35,0.7128941,0.899316,0.3157722,0.333533,0.9549384,0.2243123,0.4889961,0.2227777,0.005514725
std,23657.4,23624.56,0.388721,0.27212,0.3343134,0.373342,0.2074394,0.4171287,0.499879,0.4161103,0.07405615
min,1.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13293.0,50049.0,0.2857143,1.0,0.1,0.090909,1.0,0.0,0.0,0.0,0.0
50%,29123.0,70669.0,1.0,1.0,0.1818182,0.166667,1.0,0.0,0.0,0.0,0.0
75%,50260.0,86471.0,1.0,1.0,0.4285714,0.428571,1.0,0.0,1.0,0.0,0.0
max,99980.0,100000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
