# Calculate the entropy

## The Credit Failure Example

In [7]:
from scipy.stats import entropy
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import VectorAssembler
from helpers.helper_functions import translate_to_file_string

# Location of the credit-failure CSV, passed through the project helper
# (presumably turns the relative path into a form Spark can load - TODO confirm
# against helpers.helper_functions).
inputFile = translate_to_file_string("../data/credit_failure.csv")

In [8]:
def calc_bin_entropy(dataframe, label="Kreditausfall"):
    """Return the base-2 entropy of a two-valued label column.

    Args:
        dataframe: Spark DataFrame holding the rows to evaluate. Only the
            ``groupBy(label).count().collect()`` chain is used.
        label: Name of the column whose value distribution is measured.

    Returns:
        The entropy in bits, as computed by ``scipy.stats.entropy``.

    Raises:
        ValueError: If ``label`` does not take exactly two distinct values in
            ``dataframe`` (this includes an empty dataframe). ``ValueError``
            is a subclass of ``Exception``, so callers that catch
            ``Exception`` keep working.
    """
    # Count the rows per label value of the dataframe that was passed in, not
    # of some notebook-level global, and fetch the result with a single Spark
    # job. Rows are indexed with ["count"] because attribute access on a Row
    # would resolve to the tuple method of the same name.
    rows_per_value = [row["count"] for row in dataframe.groupBy(label).count().collect()]

    if len(rows_per_value) != 2:
        raise ValueError(
            f"invalid dataframe or label: expected exactly 2 distinct values "
            f"in column '{label}', found {len(rows_per_value)}"
        )

    # The group sizes add up to the number of rows, so no extra count() job
    # is needed to turn them into relative frequencies.
    total_rows = sum(rows_per_value)
    return entropy([n / total_rows for n in rows_per_value], base=2)

In [9]:
# Start (or reuse) the Spark session used by this notebook.
spark = SparkSession.builder.appName("Entropy").getOrCreate()

# Load the semicolon-separated credit data: the first row holds the column
# names and Spark infers the column types.
csv = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
)
csv.show()

# Entropy of the label over the complete data set, before any split.
baseEntropy = calc_bin_entropy(csv)
print(baseEntropy)

+---+-----------+-----------+------------+-------------+
| ID|   Kopfform|Koerperform|Koerperfarbe|Kreditausfall|
+---+-----------+-----------+------------+-------------+
|  0|Quadratisch|       Oval|       weiss|           No|
|  1|       Rund|       Oval|     schwarz|          Yes|
|  2|Quadratisch|   Rechteck|       weiss|          Yes|
|  3|Quadratisch|   Rechteck|       weiss|          Yes|
|  4|Quadratisch|   Rechteck|       weiss|          Yes|
|  5|       Rund|   Rechteck|     schwarz|           No|
|  6|Quadratisch|   Rechteck|       weiss|          Yes|
|  7|Quadratisch|       Oval|       weiss|           No|
|  8|Quadratisch|       Oval|       weiss|           No|
|  9|Quadratisch|   Rechteck|       weiss|          Yes|
| 10|Quadratisch|       Oval|       weiss|           No|
| 11|       Rund|       Oval|       weiss|          Yes|
+---+-----------+-----------+------------+-------------+

0.9798687566511527


In [10]:
# Label entropy within each "Kopfform" subset.
kopfform_rund_rows = csv.filter("Kopfform == 'Rund'")
kopfformRundEntropy = calc_bin_entropy(kopfform_rund_rows)
print(kopfformRundEntropy)

kopfform_quadratisch_rows = csv.filter("Kopfform == 'Quadratisch'")
kopfformQuadratischEntropy = calc_bin_entropy(kopfform_quadratisch_rows)
print(kopfformQuadratischEntropy)

0.9182958340544894
0.9910760598382222


In [11]:
# Label entropy within each "Koerperform" subset.
koerperform_rechteck_rows = csv.filter("Koerperform == 'Rechteck'")
koerperFormRechteck = calc_bin_entropy(koerperform_rechteck_rows)
print(koerperFormRechteck)

koerperform_oval_rows = csv.filter("Koerperform == 'Oval'")
koerperFormOval = calc_bin_entropy(koerperform_oval_rows)
print(koerperFormOval)

0.6500224216483541
0.9182958340544894


In [12]:
# Label entropy within each "Koerperfarbe" subset.
koerperfarbe_weiss_rows = csv.filter("Koerperfarbe == 'weiss'")
koerperfarbeWeiss = calc_bin_entropy(koerperfarbe_weiss_rows)
print(koerperfarbeWeiss)

koerperfarbe_schwarz_rows = csv.filter("Koerperfarbe == 'schwarz'")
koerperfarbeSchwarz = calc_bin_entropy(koerperfarbe_schwarz_rows)
print(koerperfarbeSchwarz)

0.9709505944546688
1.0


In [13]:
# Stop the Spark session now that all entropy values have been computed.
spark.stop()