### Chancellor occupations
This notebook illustrates basic feature extraction techniques, as presented by:<br>
[1] P. Ristoski and H. Paulheim, “A comparison of propositionalization strategies for creating features from linked open data,” in Proceedings of the 1st International Conference on Linked Data for Knowledge Discovery - Volume 1232, Aachen, DEU, Sep. 2014, pp. 1–11.

In [70]:
import pandas as pd
import math

df = pd.read_csv("chancellor_occupations.tsv", sep="\t")

In [71]:
df.head(3)

Unnamed: 0,chancellor_name,occupation_label
0,Angela Merkel,politician
1,Angela Merkel,physicist
2,Gerhard Schröder,lawyer


In [72]:
df = df.pivot(index="chancellor_name", columns="occupation_label", values="occupation_label").fillna(0)
df[df != 0] = 1

In [73]:
# Binary feature extraction
df

occupation_label,assessor,autobiographer,civil servant,consultant,economist,historian,journalist,judge,lawyer,lobbyist,military personnel,non-fiction writer,physicist,political scientist,politician,resistance fighter,university teacher,writer
chancellor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Angela Merkel,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
Gerhard Schröder,0,0,0,1,0,0,0,0,1,1,0,1,0,0,1,0,0,0
Helmut Kohl,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0
Helmut Schmidt,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1
Konrad Adenauer,1,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0
Kurt Georg Kiesinger,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0
Ludwig Erhard,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
Olaf Scholz,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
Walter Scheel,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
Willy Brandt,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0


In [74]:
for index, row in df.iterrows():
    df.loc[index] = row.div(row.sum())

# Relative Couunt feature extraction
df 

occupation_label,assessor,autobiographer,civil servant,consultant,economist,historian,journalist,judge,lawyer,lobbyist,military personnel,non-fiction writer,physicist,political scientist,politician,resistance fighter,university teacher,writer
chancellor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Angela Merkel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0
Gerhard Schröder,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.0
Helmut Kohl,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0,0.0
Helmut Schmidt,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.2
Konrad Adenauer,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0
Kurt Georg Kiesinger,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
Ludwig Erhard,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0
Olaf Scholz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
Walter Scheel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0
Willy Brandt,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0


TF-IDF, according to [1]:
$$\frac{1}{n} \cdot{} log(\frac{N}{|{r|C(r)}|})$$

$N$ is the total number of resources (chancellors).<br>
$|{r|C(r)}|$ is the number of resources that share the relation (occupation) r.

In [75]:
N = len(df)

In [76]:
for occupation in df:
    multiplicity = df[occupation].gt(0).sum()
    df[occupation] = df[occupation].apply(lambda v: v * math.log(N/multiplicity))

In [77]:
df

occupation_label,assessor,autobiographer,civil servant,consultant,economist,historian,journalist,judge,lawyer,lobbyist,military personnel,non-fiction writer,physicist,political scientist,politician,resistance fighter,university teacher,writer
chancellor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Angela Merkel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.151293,0.0,0.0,0.0,0.0,0.0
Gerhard Schröder,0.0,0.0,0.0,0.460517,0.0,0.0,0.0,0.0,0.183258,0.460517,0.0,0.240795,0.0,0.0,0.0,0.0,0.0,0.0
Helmut Kohl,0.0,0.0,0.0,0.0,0.0,0.767528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.767528,0.0,0.0,0.0,0.0
Helmut Schmidt,0.0,0.0,0.460517,0.0,0.321888,0.0,0.0,0.0,0.0,0.0,0.0,0.240795,0.0,0.0,0.0,0.0,0.0,0.460517
Konrad Adenauer,0.383764,0.26824,0.0,0.0,0.0,0.0,0.0,0.26824,0.152715,0.0,0.0,0.0,0.0,0.0,0.0,0.383764,0.0,0.0
Kurt Georg Kiesinger,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.536479,0.30543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ludwig Erhard,0.0,0.0,0.0,0.0,0.536479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.767528,0.0
Olaf Scholz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.458145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Walter Scheel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.151293,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Willy Brandt,0.0,0.402359,0.0,0.0,0.0,0.0,0.575646,0.0,0.0,0.0,0.0,0.300993,0.0,0.0,0.0,0.0,0.0,0.0
