# Klassifikations-Modell

In [None]:
!pip install ucimlrepo

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.inspection import DecisionBoundaryDisplay
# Activate seaborn's default theme for all plots created below.
sns.set_theme()

## Datengrundlage

In [None]:
# Toy dataset: 20 observations with two numeric columns (temperature,
# humidity) and a 0/1 pollen flag.
data_raw = pd.DataFrame(
    dict(
        temperature=[
            2, 3, 5, 5, 8, 9, 10, 11, 11, 14,
            16, 18, 20, 20, 21, 21, 22, 26, 28, 29,
        ],
        humidity=[
            0.01, 0.92, 0.74, 0.96, 0.01, 0.22, 0.48, 0.98, 0.79, 0.81,
            0.13, 0.85, 0.94, 0.41, 0.23, 0.39, 0.15, 0.57, 0.87, 0.36,
        ],
        pollen=[
            0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
            1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
        ],
    )
)

In [None]:
## Titanic dataset (binary classification)
## https://www.kaggle.com/c/titanic/data
## Uncomment the next line to use it instead of the toy data; the feature and
## target column names selected in the preprocessing section must then be adapted.
#data_raw = sns.load_dataset('titanic')

In [None]:
## Mushroom dataset (binary classification)
## https://archive.ics.uci.edu/dataset/73/mushroom
## Uncomment both lines to fetch the data from the UCI repository and join
## features and targets into a single frame.
#mushroom = fetch_ucirepo(id=73)
#data_raw = mushroom.data.features.join(mushroom.data.targets)

In [None]:
## Income (Adult) dataset (multiclass classification)
## https://archive.ics.uci.edu/dataset/2/adult
## Uncomment both lines to fetch the data and draw a reproducible random
## sample of 5000 rows (random_state=42).
## NOTE(review): the Adult income target is usually treated as binary — check
## the distinct label values before relying on the "multiclass" description.
#adult = fetch_ucirepo(id=2)
#data_raw = adult.data.features.join(adult.data.targets).sample(5000, random_state=42)

In [None]:
## Wine dataset (multiclass classification)
## https://archive.ics.uci.edu/dataset/109/wine
## Uncomment both lines to fetch the data from the UCI repository and join
## features and targets into a single frame.
#wine = fetch_ucirepo(id=109)
#data_raw = wine.data.features.join(wine.data.targets)

## Daten-Visualisierung

In [None]:
# Display the raw data table.
data_raw

In [None]:
# Display the data type of every column.
data_raw.dtypes

In [None]:
# Scatter plot of both features; the point color encodes the pollen label.
sns.scatterplot(
    data=data_raw,
    x='humidity',
    y='temperature',
    hue='pollen',
)

In [None]:
# Bar chart of the temperature aggregated per pollen class.
sns.catplot(
    data=data_raw,
    x='pollen',
    y='temperature',
    kind='bar',
)

## Preprocessing

In [None]:
# Optional one-hot encoding of categorical columns (disabled for the toy data).
# To use it, uncomment the three lines below and replace the placeholder
# column names with the categorical columns of the chosen dataset.
#encoder = OneHotEncoder(sparse_output=False, drop="if_binary")
#encoded_values = encoder.fit_transform(data_raw[['categorisches_feature_1', 'categorisches_feature_2']])
#encoded_values_df = pd.DataFrame(encoded_values, columns=encoder.get_feature_names_out())

In [None]:
# Uncomment to inspect the one-hot encoded columns.
#encoded_values_df

In [None]:
# Without encoding, the raw frame is used as is (same object, not a copy).
# Remove the `#` before `.join(...)` to append the one-hot encoded columns.
data_preprocessed = data_raw#.join(encoded_values_df)

In [None]:
# Names of the model inputs and of the label column; adapt both when another
# dataset is loaded.
feature_columns = ['temperature', 'humidity']
target_column = 'pollen'

# Split the preprocessed frame into the feature matrix and the target vector.
feature_matrix = data_preprocessed[feature_columns]
target = data_preprocessed[target_column]

In [None]:
# Replace missing values (null, NA, ...) with the column mean, if there are any.
# NOTE: the imputer is fitted on the full data before the train/test split, so
# the column means also include rows that later end up in the test set.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(feature_matrix)
# Keep the original row index. `target` still carries it, and index-aligned
# operations (e.g. seaborn's `hue=train_y`) would otherwise pair up the wrong
# rows whenever the index is not 0..n-1 (for example after `.sample()`).
feature_matrix = pd.DataFrame(
    imputed_values,
    columns=imputer.get_feature_names_out(),
    index=feature_matrix.index,
)

In [None]:
# Display the feature matrix after imputation.
feature_matrix

In [None]:
# Display the target vector.
target

## Train-Test-Split

In [None]:
# Hold out 40 % of the rows as test set; the fixed seed keeps the split reproducible.
train_feature_matrix, test_feature_matrix, train_y, test_y = train_test_split(
    feature_matrix,
    target,
    test_size=0.4,
    random_state=3,
)

## Modell-Training

In [None]:
# Train an unregularized logistic regression (penalty=None) on the training split.
# To train a decision tree instead, comment out the first assignment and
# uncomment the second one.
model = LogisticRegression(penalty=None)
#model = DecisionTreeClassifier()
model.fit(train_feature_matrix, train_y)

## Modell-Evaluation auf Testset

In [None]:
# Predicted class probabilities for every row of the test set.
model.predict_proba(test_feature_matrix)

In [None]:
# Predicted class label for every row of the test set.
predictions = model.predict(test_feature_matrix)

In [None]:
# Share of test rows whose predicted label matches the true label.
accuracy_score(y_true=test_y, y_pred=predictions)

In [None]:
# Confusion matrix on the test set.
# `labels=model.classes_` fixes the row/column order to the classes the model
# was trained on (even if one of them is missing from the test split) and makes
# the axes show the real class names instead of positional indices 0..n-1.
disp = ConfusionMatrixDisplay.from_predictions(
    test_y,
    predictions,
    labels=model.classes_,
)
# The seaborn theme enables grid lines, which would be drawn across the cells.
disp.ax_.grid(False)

## Modell-Analyse

### Logistische Regression

In [None]:
# How important is each feature?
# Importance = absolute value of the coefficient scaled by the feature's
# standard deviation. Only the first coefficient row (`coef_[0]`) is used.
feature_importance = pd.DataFrame(
    np.abs(model.coef_[0] * feature_matrix.std().to_numpy()),
    index=model.feature_names_in_,
    columns=['importance'],
)
feature_importance.sort_values('importance').plot.barh()

In [None]:
# Was für einen Einfluss haben einzelne Features?
# Signed coefficients scaled by each feature's standard deviation, sorted
# ascending and drawn as horizontal bars.
pd.DataFrame(
    model.coef_[0] * feature_matrix.std().to_numpy(),
    index=model.feature_names_in_,
    columns=['Koeffizient (Standardisiert)'],
).sort_values('Koeffizient (Standardisiert)').plot.barh()

In [None]:
# Decision line (only for the two-feature example dataset).
# The boundary is where coef[0]*temperature + coef[1]*humidity + intercept = 0;
# solved for temperature this gives temperature = a * humidity + b.
a = -(model.coef_[0, 1] / model.coef_[0, 0])
b = -(model.intercept_[0] / model.coef_[0, 0])

# Humidity values at which the line is evaluated.
x_vals = np.linspace(0, 1, 10)

sns.scatterplot(data=train_feature_matrix, x='humidity', y='temperature', hue=train_y)
#sns.scatterplot(data=test_feature_matrix, x='humidity', y='temperature', hue=test_y)
plt.plot(x_vals, a * x_vals + b)

### Decision-Tree

In [None]:
# How important is each feature?
# Reads `feature_importances_`, so `model` has to be the DecisionTreeClassifier
# variant from the training cell.
feature_importance = pd.DataFrame(
    model.feature_importances_,
    index=model.feature_names_in_,
    columns=['importance'],
)
feature_importance.sort_values('importance').plot.barh()

In [None]:
# What does the decision tree look like?
# Draw the fitted tree with feature names, class names, and filled nodes.
fig = plt.figure(figsize=(10, 6))
_ = plot_tree(
    model,
    feature_names=model.feature_names_in_,
    class_names=model.classes_.astype(str),
    filled=True,
)

In [None]:
# Decision regions of the model over the two training features, with the
# training points drawn on top (temperature on x, humidity on y).
tree_disp = DecisionBoundaryDisplay.from_estimator(model, train_feature_matrix)
sns.scatterplot(
    data=train_feature_matrix,
    x='temperature',
    y='humidity',
    hue=train_y,
)
#sns.scatterplot(data=test_feature_matrix, x='temperature', y='humidity', hue=test_y)