# Einführung KI
## 02.12.2022
## Mathias Anneken

# Agenda
Was ist Data Science?

Einführung in Python

Clustering

Klassifikation

Regression

# Was ist Data Science?

Extraktion von Wissen aus Daten

Verwendung von
* Visualisierung
* Statistischem Lernen
* Wahrscheinlichkeitsmodellen
* Maschinenlernen
* Mustererkennung
* Prognosen
* ...

<img style="float: right" src="./img/CRISP-DM.png"/>
<div style="float:left; width: 50%">
    
# Cross Industry Standard Process for Data Mining
* Business Understanding
 * Projektziele aus Geschäftsperspektive analysieren
* Data Understanding
 * Qualitätsprobleme identifizieren
 * Subsets finden
 * Hypothesen zu versteckten Informationen aufstellen
</div>


* Data Preparation
 * Attribute und Transformationen finden um Merkmale für die Modelle zu extrahieren
* Modeling
 * Modelle aussuchen
 * Modelle optimieren
* Evaluation
 * Qualität der Modelle überprüfen
 * Mit Projektzielen abgleichen
* Deployment


<img style="float: right" src="./img/python_logo.png">

# Python
* Skriptsprache
* Verschiedene Programmierparadigmen
 * objektorientiert
 * funktional
* Dynamische Typisierung


# Hello World!

In [171]:
# Smallest possible program: write a fixed greeting to standard output.
print('Hello DHBW!')

Hello DHBW!


# Formatierung von Strings

In [172]:
# Four ways to build the printed text; note how the spacing differs.
print('foo' + 'bar')  # '+' joins the two strings directly, without a separator
print('foo', 'bar')  # separate arguments are printed with a space between them
print('foo=' + str(1))  # a number must be converted with str() before '+' can join it
print('foo=', 1)  # print() converts the int itself, but inserts a space

foobar
foo bar
foo=1
foo= 1


## besser

In [173]:
# str.format() substitutes its argument for the '{}' placeholder, so neither a
# manual str() conversion nor '+' concatenation is needed.
print('foo={}'.format(1))

foo=1


# Kontrollstrukturen
## if, else und elif

In [174]:
# Basic two-way branch: a is False, so the else block is the one that runs.
a = False
if a:
    print('a ist True')
else:
    print('a is False')

a is False


In [175]:
# if / elif / else chain: the conditions are tested from top to bottom and only
# the first matching branch runs. Here a is a list, so neither integer
# comparison matches and the else branch prints the list via str().
a = ['sadf', 'sadasf']
if a == 0:  # previously compared against 10, which contradicted the message
    print('a gleich 0')
elif a == 1:
    print('a gleich 1')
else:
    print('a ist gleich ' + str(a))

a ist gleich ['sadf', 'sadasf']


# Kontrollstrukturen
## while und for Schleifen

In [176]:
# range(1, 2) excludes its end value, so the body runs exactly once with i = 1.
for i in range(1,2):
    print(i)

1


In [177]:
# Endless loop that is left explicitly with break once the counter reaches 5.
i = 0
while True:
    if i == 5:
        print('Das war es mit der Schleife')
        break  # the only way out of the loop
    i += 1
    

Das war es mit der Schleife


# Listen

In [178]:
# List comprehension over range(3, 5); the end value 5 is not included.
l = [i for i in range(3, 5)]
print(l)

[3, 4]


In [179]:
# append() adds its argument as ONE element (here a nested list), whereas
# extend() adds every element of its argument individually.
l.append([1, 2, 3])
print(l)
l.extend([1, 2, 3])
print(l)

[3, 4, [1, 2, 3]]
[3, 4, [1, 2, 3], 1, 2, 3]


In [180]:
print(l)
# Chained indexing: l[2] is the nested list, the second [2] picks its third element.
print(l[2][2])

[3, 4, [1, 2, 3], 1, 2, 3]
3


# numpy Arrays

In [181]:
import numpy as np
# Array with 2 rows and 3 columns filled with zeros; the shape is passed as a tuple.
arr = np.zeros((2, 3))
print(arr)

[[0. 0. 0.]
 [0. 0. 0.]]


In [182]:
# Element access uses [row, column]; this writes to row 0, column 1.
arr[0, 1] = 5
print(arr)
# np.amax returns the largest value found in the whole array.
print('größter Wert: {}'.format(np.amax(arr)))

[[0. 5. 0.]
 [0. 0. 0.]]
größter Wert: 5.0


# Coole Features

## enumerate

In [183]:
# enumerate() yields (index, value) pairs, so no manual counter is needed.
l = ['a', 'b', 'c']
for i, v in enumerate(l):
    print('#', i, ' = ' + v)

# 0  = a
# 1  = b
# 2  = c


## zip

In [184]:
# zip() walks both sequences in lockstep and yields one pair per position.
values = np.arange(3)
colors = ['#ffffff', '#ff0000', '#0000ff']
for v, c in zip(values, colors):
    print(str(v) + ' = ' + str(c)) 

0 = #ffffff
1 = #ff0000
2 = #0000ff


# itertools

In [185]:
from itertools import combinations, product, permutations
# product() yields every pairing of one element from each iterable
# (the Cartesian product).
for val in product(range(2), range(10, 12)):
    print(val)

(0, 10)
(0, 11)
(1, 10)
(1, 11)


In [186]:
# combinations(..., 2): every 2-element selection, order ignored, no repeats.
for val in combinations(range(3), 2):
    print(val)

(0, 1)
(0, 2)
(1, 2)


In [187]:
# permutations(..., 2): every ordered 2-element arrangement, so both (0, 1)
# and (1, 0) are produced.
for val in permutations(range(3), 2):
    print(val)

(0, 1)
(0, 2)
(1, 0)
(1, 2)
(2, 0)
(2, 1)


# Matplotlib

In [188]:
# IPython magic (not plain Python): selects matplotlib's notebook backend.
%matplotlib notebook
import matplotlib.pyplot as plt
# Sample points from 0 up to (not including) 10 in steps of 0.25.
x = np.arange(0, 10, .25)
[y1, y2] = [np.sin(x), np.sin(x)**2]
plt.figure(figsize=(12,4.5))
# Format strings: 'y-o' = yellow solid line with circle markers,
# 'b--.' = blue dashed line with point markers.
plt.plot(x, y1, 'y-o', linewidth=5)
plt.plot(x, y2, 'b--.')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x2b8181ddd48>]

# scikit-learn
Verfahren zur
* Regressionen
* Klassifikationen
* Clustering
* Vorverarbeitung
* Datenreduktion

In [189]:
import sklearn

http://scikit-learn.org/stable/

# Wann verwendet man welches Verfahren?

<a href="http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html"><img src="img/ml_map.png"></a>

# Clustering

In [190]:
from sklearn.cluster import KMeans, DBSCAN

KMeans

DBSCAN

# KMeans
Einteilung in $k$ Partitionen/Cluster

Minimierung des quadratischen Abstands zu den Cluster-Schwerpunkten

Optimierung von

$$J = \sum^{k}_{i=1} \sum_{x_j\in S_i} \|x_j-\mu_i\|^2$$

mit $\mu_i,\ i=1,\dots,k$ als Schwerpunkte der Cluster

In [191]:
from sklearn.metrics.pairwise import pairwise_distances_argmin
# make_blobs is imported from the public sklearn.datasets package; the former
# sklearn.datasets.samples_generator module was removed in scikit-learn 0.24.
from sklearn.datasets import make_blobs

# Generate sample data: 3000 points scattered around three centers.
np.random.seed(0)  # fixed seed so the generated blobs are reproducible

# NOTE(review): batch_size is not used by any cell visible here.
batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# Compute clustering with KMeans, using one cluster per generated center.
k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
k_means.fit(X)
k_means_labels = k_means.labels_  # cluster index assigned to each sample
k_means_cluster_centers = k_means.cluster_centers_  # fitted centroid coordinates
k_means_labels_unique = np.unique(k_means_labels)


In [196]:
plt.figure(figsize=(12,4.5))
ax = plt.axes()
# Draw each cluster in its own colour: first its member points, then its centroid.
for k, col in zip(range(n_clusters), colors):
    mask = k_means_labels == k  # boolean selector for the samples assigned to cluster k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[mask, 0], X[mask, 1], '.w', markerfacecolor=col)
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, 
            markeredgecolor='k', markersize=6)

<IPython.core.display.Javascript object>

In [200]:
plt.figure(figsize=(12,4.5))
ax = plt.axes()
# Draw each cluster in its own colour: first its member points, then its centroid.
for k, col in zip(range(n_clusters), colors):
    mask = k_means_labels == k  # boolean selector for the samples assigned to cluster k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[mask, 0], X[mask, 1], '.w', markerfacecolor=col)
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, 
            markeredgecolor='k', markersize=6)

# Evaluate the fitted model on a regular grid to visualise the cluster regions.
h=.02  # grid spacing
# Plot range: the data range padded by 0.1 on every side.
x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Predict a cluster for every grid point, then restore the grid shape for contourf.
Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.8)

<IPython.core.display.Javascript object>

<matplotlib.contour.QuadContourSet at 0x2b81c534b08>

<img src="img/dbscan.png" style="float: right">

# DBSCAN
Dichtebasierend

Rauschen bzw. Anomalien werden erkannt

Es wird zwischen
* Kernobjekten,
* dichte-erreichbaren Objekten (Randpunkten) und
* Rauschpunkten

unterschieden


In [None]:
# Fresh, tighter blobs (750 samples, std 0.4) for the density-based clustering.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(
    n_samples=750,
    centers=centers,
    cluster_std=0.4,
    random_state=0,
)
db = DBSCAN(eps=0.3, min_samples=10)
db.fit(X)

plt.figure(figsize=(12, 4.5))
# One plot call per label so every group gets its own legend entry;
# the label -1 is shown as 'anomaly'.
for l in np.unique(db.labels_):
    members = db.labels_ == l
    if l == -1:
        legend_text = 'anomaly'
    else:
        legend_text = l
    plt.plot(X[members, 0], X[members, 1], 'o', label=legend_text)
plt.legend()

# Klassifizierung

<img src="img/flower.png" style="float:right">
<img src="img/iris.png" style="float:right; clear: right">

# Beispielproblem - Iris Datensatz
Datensatz zur Charakterisierung von Schwertlilien (Iris)
* Iris setosa
* Iris versicolor
* Iris virginica

Im Datensatz vorhanden:
* Kelchblatt Länge und Breite in cm
* Kronblatt Länge und Breite in cm

In [None]:
from sklearn import datasets
iris = datasets.load_iris()
# 3 x 3 grid of axes; columns share the x axis, rows share the y axis.
f, axes = plt.subplots(3, 3, sharex='col', sharey='row', figsize=(12,4.5))
names = ['Sepal length', 'Sepal width', 'Petal length', 'Petal width']
# Each of the 6 feature pairs is drawn into one cell of the lower triangle of
# the grid. val is ((index_a, name_a), (index_b, name_b)); ax is the
# (row, column) position of the target cell. Feature a goes on the x axis,
# feature b on the y axis.
for val, ax in zip(combinations(zip(range(4), names), 2), [(0,0), (1,0), (2,0), (1, 1), (2,1), (2,2)]):
    # One scatter call per class so every species gets its own colour
    # (colors is defined in an earlier cell).
    for t, c in zip(np.unique(iris.target), colors):
        axes[ax].scatter(iris.data[iris.target==t, val[0][0]], iris.data[iris.target==t, val[1][0]], c=c)
    # Only the first column gets y labels and only the last row gets x labels.
    if val[0][1] == 'Sepal length': axes[ax].set_ylabel(val[1][1])
    if val[1][1] == 'Petal width': axes[ax].set_xlabel(val[0][1])

In [None]:
# Cluster the iris feature matrix into three groups; the true labels are not
# passed to the model.
k_means = KMeans(n_clusters=3, n_init=10)
k_means.fit(iris.data)
f, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(12, 4.5))
ax[0].set_title('Echte Label')
ax[1].set_title('Geschaetzte Label')
# Left panel: points coloured by true class; right panel: coloured by assigned
# cluster. Only the first two feature columns are plotted.
# NOTE(review): n_clusters and colors come from an earlier cell. KMeans cluster
# numbers are arbitrary, so a colour on the right need not denote the same
# species as that colour on the left.
for k, col in zip(range(n_clusters), colors):
    ax[0].plot(iris.data[iris.target == k, 0], iris.data[iris.target == k, 1], 'ow', markerfacecolor=col, markersize=4)
    ax[1].plot(iris.data[k_means.labels_ == k, 0], iris.data[k_means.labels_ == k, 1], 'ow', markerfacecolor=col, markersize=4)

<img src="img/svm.png" style="float: right">

# Support Vector Machine
Hyperebene zwischen Datensätzen

Ebene so gewählt, dass möglichst großer Bereich frei von Objekten

## Problem
Annotierte Daten werden benötigt

Daten müssen linear separierbar sein $\rightarrow$ Einführung von Kernelfunktionen

In [None]:
from sklearn import svm

In [None]:
# Train an SVM on two of the iris feature columns and draw its decision
# regions together with the samples.
clf = svm.SVC()
id1 = 0
id2 = 1
h = 0.01
clf.fit(iris.data[:, [id1, id2]], iris.target)

# Plot range: the range of each selected feature padded by 1 on both sides.
x_min = iris.data[:, id1].min() - 1
x_max = iris.data[:, id1].max() + 1
y_min = iris.data[:, id2].min() - 1
y_max = iris.data[:, id2].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Classify every grid point, then restore the grid shape for contourf.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = clf.predict(grid_points).reshape(xx.shape)
y = clf.predict(iris.data[:, [id1, id2]])

plt.figure(figsize=(8, 4.5))
plt.contourf(xx, yy, Z, alpha=0.8)
plt.scatter(iris.data[:, id1], iris.data[:, id2], c=iris.target)

In [None]:
# Refit the SVM, this time on all feature columns (C=1 is passed explicitly).
clf = svm.SVC(C=1)
clf.fit(iris.data, iris.target)  
x_min, x_max = iris.data[:, id1].min() - 1, iris.data[:, id1].max() + 1
y_min, y_max = iris.data[:, id2].min() - 1, iris.data[:, id2].max() + 1
# NOTE(review): the grid is rebuilt here but not used by the cells that follow.
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Predictions for the very same samples the model was trained on.
y = clf.predict(iris.data)

In [None]:
plt.figure(figsize=(8, 4.5))
id1 = 0
id2 = 1
# Split the samples by whether the prediction in y matches the true label:
# matches are drawn in green, mismatches in red.
correct = iris.target == y
wrong = iris.target != y
plt.scatter(iris.data[correct, id1], iris.data[correct, id2], c='g')
plt.scatter(iris.data[wrong, id1], iris.data[wrong, id2], c='r')

In [None]:
# Confusion matrix of the fitted classifier on the iris data.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement. The fallback keeps
# the cell working on older installations that do not have from_estimator yet.
from sklearn import metrics
species = ['Iris setosa', 'Iris versicolor', 'Iris virginica']
if hasattr(metrics.ConfusionMatrixDisplay, 'from_estimator'):
    metrics.ConfusionMatrixDisplay.from_estimator(clf, iris.data, iris.target, display_labels=species)
else:
    metrics.plot_confusion_matrix(clf, iris.data, iris.target, display_labels=species)

In [None]:
from sklearn.metrics import classification_report
# Text report comparing the true labels with the predictions stored in y,
# with one row per named species.
print(classification_report(iris.target, y, target_names=['Iris setosa', 'Iris versicolor', 'Iris virginica']))

# Pandas

In [None]:
import pandas as pd
# Wrap the iris feature matrix in a DataFrame with named columns.
df = pd.DataFrame(iris.data, columns=names)
print(df.head(10))  # first ten rows
# Summary statistics per column; shown as the cell result because it is the
# last expression.
df.describe()

In [None]:
# Grid of pairwise scatter plots over all DataFrame columns.
pd.plotting.scatter_matrix(df, alpha=0.2)

# Was lief hier bisher falsch?

Es wurden alle Daten zum Trainieren und Validieren verwendet!

In [None]:
from sklearn.model_selection import train_test_split
# Hold out half of the samples for testing; random_state=0 makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.5, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Regression
z.B. RANSAC (RANdom SAmple Consensus)

In [None]:
from sklearn import linear_model, datasets

# Noisy one-dimensional regression problem.
n_samples = 1000
n_outliers = 50
X, y, coef = datasets.make_regression(
    n_samples=n_samples,
    n_features=1,
    n_informative=1,
    noise=10,
    coef=True,
    random_state=0,
)

# Overwrite the first n_outliers samples with points drawn around (3, -3) so
# that they act as outliers.
X[:n_outliers] = 3 + 0.5 * np.random.normal(size=(n_outliers, 1))
y[:n_outliers] = -3 + 10 * np.random.normal(size=n_outliers)

plt.figure(figsize=(8, 4.5))
plt.plot(X, y, '.b')

In [None]:
# Fit a plain linear regression and a RANSAC-wrapped one to the same data.
model = linear_model.LinearRegression()
model.fit(X, y)
model_ransac = linear_model.RANSACRegressor(linear_model.LinearRegression())
model_ransac.fit(X, y)

# Evaluate both models at the same x positions; predict() expects a column.
line_X = np.arange(-5, 5)
line_X_column = line_X[:, np.newaxis]
line_y = model.predict(line_X_column)
line_y_ransac = model_ransac.predict(line_X_column)

# Samples RANSAC kept as inliers versus the ones it rejected.
inlier_mask = model_ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

plt.figure(figsize=(8, 4.5))
plt.plot(X[inlier_mask], y[inlier_mask], '.g', label='Inliers')
plt.plot(X[outlier_mask], y[outlier_mask], '.r', label='Outliers')
plt.plot(line_X, line_y, '-k', label='Linear regressor')
plt.plot(line_X, line_y_ransac, '-b', label='RANSAC regressor')
plt.legend(loc='lower right')

# Dimensionsreduktion

Hauptkomponentenanalyse (engl. Principal Component Analysis, PCA)



In [None]:
# Load the glass data as a structured array so columns can be addressed by name.
# The builtin int / float are used as field types: the np.int and np.float
# aliases were deprecated in NumPy 1.20 and removed in NumPy 1.24.
glass_dtype = (
    [('id', int)]
    + [(element, float) for element in ('RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe')]
    + [('type', int)]
)
glass = np.loadtxt('data/glass.data', delimiter=',', dtype=glass_dtype)
plt.figure(figsize=(12, 4.5))
# Scatter of the 'Si' column against the 'Al' column.
plt.plot(glass['Si'], glass['Al'], 'ob')

In [None]:
from itertools import accumulate
from sklearn.decomposition import PCA

# Reload the file as a plain 2-D float array (no named fields) for PCA.
glass = np.loadtxt('data/glass.data', delimiter=',')
pca = PCA()
# Fit on the measurement columns only: column 0 is the sample id and the last
# column is the glass type. The earlier slice 1:-2 additionally dropped the
# last measurement column ('Fe').
pca.fit(glass[:, 1:-1])
print(pca.explained_variance_ratio_)
# Running total of the explained variance ratio, component by component.
print(list(accumulate(pca.explained_variance_ratio_)))
# Loadings of the first four principal components.
print(pca.components_[:4, :])