# 1. Was ist KI?

### 1.1. ChatGPT

ChatGPT: “KI steht für künstliche Intelligenz, ein Bereich der Informatik, der sich darauf konzentriert, Maschinen oder Computersysteme zu erschaffen, die Aufgaben ausführen können, die normalerweise menschliche Intelligenz erfordern. Dies umfasst eine breite Palette von Fähigkeiten wie Problemlösung, Spracherkennung, Lernen, Planung und Wahrnehmung.”

### 1.2. Einordnung

<img src="ki.svg">

# 2. Regel-basierte und statistische KI

### 2.1. Prinzip EVA

<img src="eva.svg">

In [None]:
def f(x):
    """Return the square of ``x``."""
    squared = x**2
    return squared


f(2)

In [None]:
%load_ext autoreload
%autoreload 1
%aimport lib
import lib

In [None]:
lib.plot_f(f)

In [None]:
def f(x):
    """Return ``x`` unchanged when it is positive, otherwise its square."""
    return x if x > 0 else x**2


lib.plot_f(f)

### 2.2. Regel-basierte "KI"

In [None]:
def bewertung(text):
    """Rate a review text with hand-written substring rules.

    The rules are tried in order and the first match wins, so the negated
    phrases are listed before the plain words they contain.

    Returns "gut", "schlecht", or "unsicher" when no rule matches.
    """
    rules = (
        (("nicht super",), "schlecht"),
        (("super",), "gut"),
        (("nicht schlecht", "nicht doof"), "gut"),
        (("schlecht", "doof"), "schlecht"),
    )
    for phrases, verdict in rules:
        if any(phrase in text for phrase in phrases):
            return verdict
    return "unsicher"


# Run the rule set on a few sample reviews. The last sentence contains
# "super" without being preceded by "nicht", so it hits the plain "super" rule.
tuple(
    bewertung(satz)
    for satz in (
        "Der Film war super.",
        "Der Film war schlecht.",
        "Der Film war nicht super.",
        "Der Film war nicht schlecht.",
        "Verrückt ist, wer sagt, der war super.",
    )
)

### 2.3. Tensoren – Einzelwerte, Listen, Tabellen, ..., hochdimensionale Arrays

In [None]:
import torch

# A single number becomes a 0-dimensional tensor, so its shape is empty.
a = torch.tensor(9)
a, a.shape

In [None]:
# A one-element list gives a 1-dimensional tensor of length 1.
a = torch.tensor([9])
a, a.shape

In [None]:
# A flat list of four values gives a 1-dimensional tensor of length 4.
a = torch.tensor([9, 10, 11, 12])
a, a.shape

In [None]:
# Three rows of four values each give a 2-dimensional tensor (a 3 x 4 table).
a = torch.tensor([[9, 10, 11, 12], [12, 13, 14, 15], [13, 14, 15, 16]])
a, a.shape

In [None]:
# torch.stack puts the two equally shaped tensors along a new leading
# dimension of size 2, so b has one dimension more than a.
b = torch.stack([a, a * 2])
b, b.shape

In [None]:
# Stacking again adds yet another leading dimension of size 2 on top of b.
c = torch.stack([b, b + 5])
c, c.shape

### 2.4. Statistische KI zur Bilderkennung – MNIST, das "Hallo Welt" des ML

In [None]:
print("Hallo Welt!")

In [None]:
from keras.datasets import mnist

# Load MNIST as (images, labels) pairs for the training and the test split.
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# Convert everything to torch tensors. The images are divided by 255
# (assumes 8-bit pixel values, i.e. a result in [0, 1] — TODO confirm).
train_X = torch.tensor(train_X) / 255
test_X = torch.tensor(test_X) / 255
train_y = torch.tensor(train_y)
test_y = torch.tensor(test_y)
train_X.shape, train_y.shape, test_X.shape, test_y.shape

In [None]:
test_X[1]

In [None]:
import matplotlib.pyplot as plt


def show(nr):
    """Draw ``nr`` as a grayscale image on the current axes with the axis hidden."""
    plt.axis("off")
    plt.imshow(nr, cmap="gray")


# Show the first twelve training images in a 3 x 4 grid, titled with their labels.
for i, (image, label) in enumerate(zip(train_X[:12], train_y[:12])):
    plt.subplot(3, 4, i + 1)
    show(image)
    plt.title(str(label.item()))

In [None]:
# Average all training images labelled 3, pixel by pixel, and display the result.
is_three = train_y == 3
mean_3 = train_X[is_three].mean(dim=0)
show(mean_3)

In [None]:
# Left: the first test image. Right: the same image multiplied element-wise by mean_3.
first_test_image = test_X[0]
panels = (first_test_image, first_test_image * mean_3)
for column, panel in enumerate(panels, start=1):
    plt.subplot(1, 2, column)
    show(panel)

In [None]:
# One mean image per digit 0-9, stacked along a new leading dimension.
per_digit_means = [train_X[train_y == digit].mean(dim=0) for digit in range(10)]
mean = torch.stack(per_digit_means)
# Rescale every mean image so that its pixel values sum to 1.
mean = mean / mean.sum(dim=(1, 2), keepdim=True)

for i, digit_image in enumerate(mean):
    plt.subplot(3, 4, i + 1)
    show(digit_image)
    plt.title(str(i))

In [None]:
sample = 3
# Multiply the chosen test image with every digit's mean image; the sum of
# each product is printed in the title as that digit's score.
for i in range(10):
    overlap = mean[i] * test_X[sample]
    plt.subplot(3, 4, i + 1)
    show(overlap)
    plt.title("%d: %.3f" % (i, torch.sum(overlap)))
# Last grid cell: the test image itself, titled with its label.
plt.subplot(3, 4, 12)
show(test_X[sample])
plt.title(str(test_y[sample].item()))

In [None]:
def classify_digit(sample):
    """Score ``sample`` against every image in the global ``mean`` stack.

    The sample is multiplied element-wise with each mean image and the
    product is summed over the two trailing dimensions, giving one score per
    mean image.
    """
    weighted = mean * sample
    return weighted.sum(dim=(1, 2))


classify_digit(test_X[3])

In [None]:
# For every test image, take the index of the highest score as the prediction.
predictions = [classify_digit(image).argmax() for image in test_X]
test_classification = torch.stack(predictions)
test_classification

In [None]:
test_y

In [None]:
test_classification == test_y

In [None]:
# Fraction of test images whose prediction equals the label.
hits = test_classification == test_y
accuracy = hits.float().mean()
accuracy

# 3. Maschinelles Lernen

### 3.1. Funktionen mit Parametern

In [None]:
def f(x, a, b, c):
    """Evaluate the quadratic ``a * x**2 + b * x + c`` at ``x``."""
    quadratic_term = a * x**2
    linear_term = b * x
    return quadratic_term + linear_term + c


def f1(x):
    """Evaluate ``f`` with the fixed parameters a=3, b=2, c=2."""
    return f(x, a=3, b=2, c=2)


def f2(x):
    """Evaluate ``f`` with the fixed parameters a=2, b=5, c=1."""
    return f(x, a=2, b=5, c=1)


lib.plot_f(f1)
lib.plot_f(f2)

### 3.2. Gradientenverfahren

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Fixed seed so the toy data set is the same on every run.
torch.manual_seed(1)
n = 20
# Random 0/1 labels and one normally distributed feature value per point.
y = torch.randint(low=0, high=2, size=(n,))
X = torch.randn(n)
# Shift the feature of all class-1 points by 2.
is_class_one = y == 1
X[is_class_one] += 2
graph = sns.swarmplot(x=X.numpy(), hue=y.numpy())
# Vertical marker at the threshold 0.5.
graph.axvline(x=0.5, color="red")
print("Anzahl orangene Punkte: %d" % y.sum())

In [None]:
def classify(X, threshold):
    """Return 1 where ``X`` is greater than ``threshold`` and 0 elsewhere, as ints."""
    above_threshold = X > threshold
    return above_threshold.int()


classify(X, 0.5)

In [None]:
# Count the misclassified points for 1000 candidate thresholds between -2 and 4.
thresholds = torch.linspace(-2, 4, steps=1000)
errors = [
    (classify(X, candidate) != y).sum().item()
    for candidate in thresholds
]
sns.lineplot(x=thresholds.numpy(), y=errors)

In [None]:
def classify(X, threshold):
    """Return the signed offset of ``X`` from ``threshold`` as a continuous prediction."""
    offset = X - threshold
    return offset


def loss(y_pred, y):
    """Return the squared difference between the target ``y`` and the prediction ``y_pred``."""
    residual = y - y_pred
    return residual**2


loss(classify(X, 0.5), y)

In [None]:
# Mean loss over all points for every candidate threshold.
losses = [
    loss(classify(X, candidate), y).mean().item()
    for candidate in thresholds
]
sns.lineplot(x=thresholds.numpy(), y=losses)

In [None]:
# Derivation of the slope used below (per data point, with z = y - X):
#   y_pred = X - threshold
#   loss   = (y - y_pred) ** 2
#          = (y - X + threshold) ** 2
#          = (z + threshold) ** 2
#          = z**2 + 2 * z * threshold + threshold**2
#   loss'  = dloss/dthreshold = 2 * z + 2 * threshold
#          = 2 * (threshold + (y - X))
# Averaged over all points: mean(loss)' = 2 * (threshold + mean(y - X)).


def draw_with_slope(t):
    """Plot the loss curve with its tangent at threshold ``t`` and return the slope.

    Uses the notebook globals ``X``, ``y``, ``thresholds`` and ``losses`` as
    well as the current ``classify`` and ``loss`` functions.

    Args:
        t: Threshold at which the tangent is drawn (number or 0-dim tensor).

    Returns:
        The derivative of the mean loss with respect to the threshold at ``t``.
    """
    loss_t = loss(classify(X, t), y).mean()
    # Exact derivative of the mean squared loss. The former
    # ``t - abs(mean(y - X))`` only agreed with it while mean(y - X) <= 0.
    slope = 2 * (t + (y - X).mean())
    sns.lineplot(x=thresholds.numpy(), y=losses)
    plt.plot(t, loss_t, "ro")

    def x_for_y(target):
        # Threshold at which the tangent line reaches the loss value `target`.
        return (target - loss_t) / slope + t

    # Draw the tangent between loss values 0 and 12, clipped to the plotted
    # threshold range [-2, 4]. Ordering the two end points first keeps the
    # clipping correct when the slope is negative.
    x_low, x_high = x_for_y(0), x_for_y(12)
    left = max(min(x_low, x_high), -2)
    right = min(max(x_low, x_high), 4)
    # Plain floats avoid mixing ints and 0-dim tensors in torch.linspace.
    ts = torch.linspace(float(left), float(right), 100)
    tangent = slope * (ts - t) + loss_t
    plt.plot(ts, tangent, "r--")
    return slope


draw_with_slope(3)

In [None]:
t = 3

In [None]:
# One gradient-descent step: move t against the slope with step size 0.1.
# Running this cell again repeats the step starting from the updated t.
slope = draw_with_slope(t)
t = t - 0.1 * slope
slope, t

### 3.3. Ein einzelnes Neuron

In [None]:
# A single neuron written out by hand: multiply every input value by its
# weight, add the products, then add the bias b.
# NOTE: the name `input` shadows Python's built-in input() in this notebook.
input = torch.tensor([0.1, 0.2, 0.3, 0.2])
W = torch.tensor([1.0, -2.0, -1.0, 2.0])
b = torch.tensor([0.5])
input[0] * W[0] + input[1] * W[1] + input[2] * W[2] + input[3] * W[3] + b

In [None]:
W @ input + b

In [None]:
torch.relu(W @ input + b)

In [None]:
lib.plot_f(torch.relu)

In [None]:
# One neuron again, but W is now a 1 x 4 matrix (a single row of weights).
# lib.plot_network is a project helper; presumably it draws the input and the
# layers given as [weights, bias] pairs — verify in lib.
input = torch.tensor([0.1, 0.2, 0.3, 0.2])
W = torch.tensor([[1.0, -2.0, -1.0, 2.0]])
b = torch.tensor([0.5])
lib.plot_network(input, [[W, b]])

### 3.4. Mehrere Neuronen

In [None]:
# Two neurons on the same four inputs: W has two rows of four weights and b
# holds two bias values.
input = torch.tensor([0.1, 0.2, 0.3, 0.2])
W = torch.tensor([[1.0, -2.0, -1.0, 2.0], [2.0, -1.0, 1.0, -2.0]])
b = torch.tensor([0.5, -0.1])
lib.plot_network(input, [[W, b]])

### 3.5. Mehrere Schichten = Neuronales Netzwerk

In [None]:
# Randomly initialised parameters for three layers: 784 -> 16 -> 16 -> 10.
# Each weight matrix has shape (outputs, inputs); each bias vector has one
# entry per output.
W1 = torch.randn(16, 784)
b1 = torch.randn(16)
W2 = torch.randn(16, 16)
b2 = torch.randn(16)
W3 = torch.randn(10, 16)
b3 = torch.randn(10)

# Presumably draws the (still untrained) network for one test image — verify in lib.
lib.plot_network(test_X[3], [[W1, b1], [W2, b2], [W3, b3]])

In [None]:
def forward(X):
    """Run ``X`` through the three layers defined by the global weights and biases.

    The first two layers are followed by a ReLU; the third layer returns its
    linear output unchanged.
    """
    activation = X
    for weights, bias in ((W1, b1), (W2, b2)):
        activation = torch.relu(activation @ weights.T + bias)
    return activation @ W3.T + b3


output = forward(train_X.reshape([-1, 784]))
output

In [None]:
import torch.nn.functional as F


def loss(y_pred, y):
    """Return the cross-entropy between the network outputs ``y_pred`` and the targets ``y``."""
    return F.cross_entropy(input=y_pred, target=y)


loss(output, train_y)

In [None]:
# Empty histories for the training and test metrics.
train_losses, test_losses, test_accuracies, test_xs = [], [], [], []
# Let autograd track gradients for every weight matrix and bias vector.
for parameter in (W1, b1, W2, b2, W3, b3):
    parameter.requires_grad = True

In [None]:
# Mini-batch gradient descent over the training set, followed by one
# evaluation on the test set and a plot of the recorded histories.
j = 0  # index of the current batch within one pass over the data
k = 0  # sample offset added to every batch; bumped each time a pass wraps around
batch_size = 256
epochs = 1
lr = 0.1
parameters = (W1, b1, W2, b2, W3, b3)
for step in range(epochs * len(train_X) // batch_size):
    # Drop the gradients of the previous step.
    for parameter in parameters:
        parameter.grad = None

    start = j * batch_size + k
    end = (j + 1) * batch_size + k
    j = j + 1
    # Wrap around as soon as the *next* batch would run past the end of the data.
    if ((j + 1) * batch_size + k) > len(train_X):
        j = 0
        k += 1
        if k >= batch_size:
            k = 0
    loss_value = loss(
        forward(train_X[start:end].reshape([-1, 784])), train_y[start:end]
    )
    loss_value.backward()

    # Plain gradient-descent update, done in place and outside of autograd.
    with torch.no_grad():
        for parameter in parameters:
            parameter -= lr * parameter.grad  # type: ignore

    train_losses.append(loss_value.item())

test_xs.append(len(train_losses) - 1)
# Evaluate once on the whole test set. A single forward pass serves both
# metrics, and no_grad avoids building an autograd graph that is never used.
with torch.no_grad():
    test_output = forward(test_X.reshape([-1, 784]))
    test_losses.append(loss(test_output, test_y).item())
    test_accuracies.append(
        (test_output.argmax(dim=1) == test_y).float().mean().item()
    )

sns.lineplot(x=range(len(train_losses)), y=train_losses).set(yscale="log")
sns.lineplot(x=test_xs, y=test_losses)
sns.lineplot(x=test_xs, y=test_accuracies, ax=plt.twinx(), color="red")  # type: ignore
train_losses[-1], test_losses[-1], test_accuracies[-1]

In [None]:
show(test_X[3])

In [None]:
show(W1[0].reshape(28, 28).detach().numpy())

In [None]:
# Start from an all-zero input of 784 values for which gradients are tracked.
input = torch.zeros(784, requires_grad=True)
# Target vector: zeros everywhere except a one at index 3.
output = torch.zeros(10)
output[3] = 1
show(input.reshape(28, 28).detach())
output

In [None]:
losses = []
loss(forward(input), output)

In [None]:
# 100 gradient-descent steps in which only the input itself is updated.
lr = 0.0001
for i in range(100):
    # Drop the gradient of the previous step.
    input.grad = None

    loss_value = loss(forward(input), output)
    loss_value.backward()
    losses.append(loss_value.item())

    with torch.no_grad():
        input -= lr * input.grad  # type: ignore

optimised_image = input.detach().reshape(28, 28).numpy()
show(optimised_image)
final_output = forward(input)
loss(final_output, output), final_output, input.grad

In [None]:
# Add the optimised pattern, scaled by 1.1, to a test image and run the sum
# through the network. The pattern stays a torch tensor so the addition does
# not mix Tensor and NumPy operands.
perturbation = input.detach().reshape(28, 28)
test_input = test_X[1] + 1.1 * perturbation
show(test_input)
output = forward(test_input.reshape(784))
output, output.argmax()

# 4. GPT = Generative Pretrained Transformer

- Generative = Es wird Text erzeugt.
- Pretrained = Das Netzwerk wird mit _viel_ Text trainiert, der nicht von Menschen vorverarbeitet wurde.
- Transformer = Eine spezielle Netzwerkarchitektur.

In [None]:
# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is
# the auto class intended for GPT-style (causal) language models.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dbmdz/german-gpt2")
model = AutoModelForCausalLM.from_pretrained("dbmdz/german-gpt2")

In [None]:
tokenizer.vocab_size

In [None]:
tokenizer.vocab  # type: ignore

In [None]:
input = tokenizer("Die Katze springt", return_tensors="pt")
input

In [None]:
lib.lookup_tokens(tokenizer, input.input_ids[0])

In [None]:
lib.lookup_tokens(tokenizer, tokenizer("dx1p2f1", return_tensors="pt").input_ids[0])

In [None]:
input = tokenizer("Die Katze springt", return_tensors="pt")
output = model(**input)
output.logits.shape

In [None]:
probs = output.logits[0, 2].softmax(-1)
probs

In [None]:
top_indices = probs.topk(5).indices
top_indices

In [None]:
# Print id, probability in percent and text of each of the top tokens.
for i in top_indices:
    token_id = i.item()
    percent = probs[i].item() * 100
    token_text = lib.lookup_tokens(tokenizer, token_id)
    print("%5d – %.1f%% – %s" % (token_id, percent, token_text))

In [None]:
def print_next_words(text, top_k=5):
    """Print the most probable next tokens after ``text``.

    Uses the notebook globals ``tokenizer`` and ``model``.

    Args:
        text: Prompt whose continuation is predicted.
        top_k: Number of candidate tokens to print. Defaults to 5, the value
            that used to be hard-coded.
    """
    encoded = tokenizer(text, return_tensors="pt")
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(**encoded)
    # Probabilities for the token that follows the last prompt position.
    probs = output.logits[0, -1].softmax(-1)
    top = probs.topk(top_k)
    for token_id, prob in zip(top.indices.tolist(), top.values.tolist()):
        print(
            "%5d – %5.2f%% – %s"
            % (
                token_id,
                prob * 100,
                lib.lookup_tokens(tokenizer, token_id),
            )
        )


In [None]:
print_next_words("Die Katze springt")

In [None]:
print_next_words("Das Auto springt")

In [None]:
print_next_words("Der Mann ist")

In [None]:
print_next_words("Der Frau ist es")

# 5. Embeddings: Worte im Raum

<img src="embedding.svg" />

In [None]:
embeddings = model.get_input_embeddings().weight
embeddings.shape

In [None]:
def embedOf(word):
    """Return the row of the global ``embeddings`` matrix for ``word``.

    A space is prepended before tokenizing (presumably because the vocabulary
    stores word-initial tokens with a leading space — verify against the
    tokenizer). The word must map to exactly one token.

    Raises:
        AssertionError: If the tokenizer does not return exactly one token id.
    """
    ids = tokenizer(" " + word).input_ids
    # Name the offending word so a failing lookup is easy to diagnose.
    assert len(ids) == 1, "%r maps to %d tokens, expected exactly one" % (
        word,
        len(ids),
    )
    return embeddings[ids[0]]


embedOf("König")

In [None]:
def print_most_similar(embedding):
    """Print the five tokens whose embeddings have the largest dot product with ``embedding``.

    The raw tensor of the five indices is printed first, then one line per
    token with its id, score and text.
    """
    scores = embeddings @ embedding
    top_indices = scores.topk(5).indices
    print(top_indices)
    for i in top_indices:
        token_id = i.item()
        token_text = lib.lookup_tokens(tokenizer, token_id)
        print("%5d – %5.2f – %s" % (token_id, scores[i].item(), token_text))


print_most_similar(embedOf("König"))

In [None]:
print_most_similar(embedOf("König") - embedOf("Mann") + embedOf("Frau"))

In [None]:
print_most_similar(embedOf("Berlin") - embedOf("Deutschland") + embedOf("Frankreich"))

# 6. Meilensteine

- **2012**: AlexNet gewinnt ImageNet competition
- **2013**: Reinforcement Learning spielt Atari Spiele
- **2016**: AlphaGo gewinnt gegen Lee Sedol
- **Spracherkennung**: Übergang von mehrschrittiger Verarbeitung (Phonemerkennung zu Worterkennung) zu Ende-zu-Ende NN ML
- **NLP**: Übergang von linguistischen Regeln zu NN ML
- **2017**: Transformers für Übersetzungen
- **2018**: BERT und GPT-1
- **2018 + 2020**: AlphaFold
- **2021**: DALL-E
- **2022**: ChatGPT launch
- **2023**: GPT-4


# 7. Ausblick

<img src="benchmarks.png" />

- KI wird bei immer mehr Aufgaben
  - so gut wie ein durchschnittlicher Mensch
  - besser als ein durchschnittlicher Mensch
  - besser als der beste Mensch
- Daniel Kahneman: System 1 und System 2 Denken
- KI arbeitet im Moment hauptsächlich wie System 1
- System 2 als Heuristik on-top, meist Baumsuche
- Menschliches Gehirn: 100 Mrd. Neuronen verbunden mit 100 Billionen Synapsen
- GPT-4: Vermutet ~1 Billion Gewichte
- AGI (Artificial General Intelligence) in den nächsten 2 - 10 Jahren

# Graph code

In [None]:
import graphviz
from IPython.display import SVG, display

# Build the taxonomy graph and render it to ki.svg.
dot = graphviz.Digraph()
dot.node_attr.update(shape="box")

# Edges, listed parent -> child.
edges = (
    ("KI", "Regel-basiert"),
    ("KI", "Statistisch"),
    ("KI", "Maschinelles Lernen"),
    ("Maschinelles Lernen", "Verstärkungslernen"),
    ("Maschinelles Lernen", "Unüberwachtes Lernen"),
    ("Maschinelles Lernen", "Überwachtes Lernen"),
    ("Unüberwachtes Lernen", "Clustering"),
    ("Unüberwachtes Lernen", "Dimensionsreduktion"),
    ("Überwachtes Lernen", "Entscheidungsbäume"),
    ("Überwachtes Lernen", "Support Vector Machines"),
    ("Überwachtes Lernen", "(Tiefe) Neuronale Netze"),
)
for parent, child in edges:
    dot.edge(parent, child)

# Fill colours for the highlighted nodes.
fill_colors = (
    ("KI", "lightgreen"),
    ("Regel-basiert", "#fcc"),
    ("Statistisch", "#fcc"),
    ("Maschinelles Lernen", "lightgreen"),
    ("Verstärkungslernen", "lightgray"),
    ("Überwachtes Lernen", "lightgreen"),
    ("(Tiefe) Neuronale Netze", "#f77"),
)
for node_name, fill in fill_colors:
    dot.node(node_name, fillcolor=fill, style="filled")

dot.render("ki", format="svg", cleanup=True)
display(SVG(filename="ki.svg"))

In [None]:
import graphviz
from IPython.display import SVG, display

# Build the input -> processing -> output chain and render it to eva.svg.
dot = graphviz.Digraph()

for source, target in (("Eingabe", "Verarbeitung"), ("Verarbeitung", "Ausgabe")):
    dot.edge(source, target)

dot.node("Verarbeitung", fillcolor="#f77", style="filled", shape="box")

# Lay the graph out from left to right.
dot.graph_attr["rankdir"] = "LR"

dot.render("eva", format="svg", cleanup=True)
display(SVG(filename="eva.svg"))

In [None]:
# Hand-picked 2-D coordinates for the illustration. Both country points lie
# at the same offset (+0.1, +0.1) from their capital.
berlin = [0.1, 0.2]
deutschland = [0.2, 0.3]
paris = [0.2, 0.1]
frankreich = [0.3, 0.2]


def plot_dot(coord, label, color):
    """Draw a round marker in ``color`` at ``coord`` and put the text ``label`` at the same spot."""
    px, py = coord[0], coord[1]
    plt.plot(px, py, color + "o")
    plt.text(px, py, label)


def plot_arrow(start, end, head_width=0.005):
    """Draw an arrow from ``start`` to ``end`` whose tip ends exactly at ``end``.

    Args:
        start: (x, y) of the arrow tail.
        end: (x, y) of the arrow tip.
        head_width: Width of the arrow head in data units.
    """
    # length_includes_head makes the head part of the given length, so the tip
    # lands on `end` for any direction. Shortening dx and dy by a fixed amount
    # only worked for arrows pointing up and to the right at 45 degrees.
    plt.arrow(
        start[0],
        start[1],
        end[0] - start[0],
        end[1] - start[1],
        head_width=head_width,
        length_includes_head=True,
    )


# Capitals are drawn in red, countries in blue.
for coord, label, color in (
    (berlin, "Berlin", "r"),
    (deutschland, "Deutschland", "b"),
    (paris, "Paris", "r"),
    (frankreich, "Frankreich", "b"),
):
    plot_dot(coord, label, color)
# One arrow from each capital to its country.
for capital, country in ((berlin, deutschland), (paris, frankreich)):
    plot_arrow(capital, country)

plt.savefig("embedding.svg")
