### Setup

In [1]:
import os
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance, plot_partial_dependence
from sklearn.svm import SVC

### Titanic

In [None]:
# Configuration for the Titanic section: where the data lives and which raw
# columns are used as model inputs.
DATA_PATH = Path.home().joinpath("ml_data", "titanic")
features = [
    "Pclass",
    "Sex",
    "SibSp",
    "Parch",
]

In [None]:
# Load the raw Titanic training file and split it by row position (no
# shuffling): the first 80% of the rows become `train`, the rest `test`.
df = pd.read_csv(DATA_PATH / "train.csv")
n = int(len(df) * 0.8)
train, test = df.iloc[:n], df.iloc[n:]

In [None]:
# One-hot encode the selected feature columns of the training split, append
# the "Survived" label column and persist the result next to the raw data.
train_prepared = pd.get_dummies(train[features]).assign(Survived=train["Survived"])
train_prepared.to_csv(DATA_PATH / "train_prepared.csv", index=False)

In [None]:
# One-hot encode the test split and append the "Survived" label column.
#
# get_dummies() only creates columns for the categories it actually sees, so
# encoding the test split on its own can yield a different column set than the
# training split (e.g. if a category is absent from one of them). Reindexing to
# the training feature columns guarantees both prepared files share the same
# schema: categories missing from the test split become all-zero columns and
# categories never seen in training are dropped.
feature_columns = train_prepared.columns.drop("Survived")
test_features = pd.get_dummies(test[features]).reindex(columns=feature_columns, fill_value=0)
test_prepared = pd.concat([test_features, test["Survived"]], axis=1)
test_prepared.to_csv(DATA_PATH / "test_prepared.csv", index=False)

In [None]:
# Separate the label column from the encoded feature matrix.
y = train_prepared["Survived"]
X = train_prepared.drop(columns=["Survived"])

In [None]:
# Fit the Titanic classifier on the prepared training data.
#
# probability=True makes SVC fit an additional internal cross-validated
# calibration step so that predict_proba() is available; that step shuffles the
# data, so random_state is fixed to keep the fitted (and later persisted) model
# reproducible across runs.
#
# Alternative that was tried previously:
#   RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model = SVC(probability=True, random_state=1)
model = model.fit(X, y)

In [None]:
# Persist the fitted Titanic model next to the data files.
# NOTE: DATA_PATH and `model` are both rebound in the "Content Automation"
# section below, so this cell must run before those cells.
model_path = DATA_PATH / "titanic_model.joblib"
joblib.dump(model, model_path)

### Content Automation

In [4]:
# Configuration for the Content Automation section.
#
# The data directory defaults to the original author's local path but can be
# overridden through the CONTENT_AUTOMATION_DATA_DIR environment variable so
# the notebook can run on other machines without editing this cell.
# NOTE: this rebinds DATA_PATH, which previously pointed at the Titanic data.
DATA_PATH = Path(
    os.environ.get(
        "CONTENT_AUTOMATION_DATA_DIR",
        "/mnt/c/Users/steffen.bunzel/Desktop/why/content-automation/",
    )
)
# Name of the label column in train.csv / test.csv.
TARGET = "Strukturzuweisungen (eClass 11.0)"

In [5]:
# Load the pre-split content-automation data sets.
# NOTE: this rebinds `train` and `test` from the Titanic section above.
train, test = (pd.read_csv(DATA_PATH / file_name) for file_name in ("train.csv", "test.csv"))

In [6]:
# Split each data set into its feature matrix and the TARGET label column.
y_train = train[TARGET]
X_train = train.drop(columns=[TARGET])

y_test = test[TARGET]
X_test = test.drop(columns=[TARGET])

In [7]:
# Baseline classifier on all features: a small, seeded random forest that uses
# every available core.
forest_params = {"n_estimators": 20, "n_jobs": -1, "random_state": 23}
model = RandomForestClassifier(**forest_params)

In [8]:
# Train the baseline forest in place; fit() returns the estimator itself, so
# the trailing semicolon only suppresses the repr in the cell output.
model.fit(X_train, y_train);

In [9]:
# Mean accuracy of the baseline forest on the held-out test set.
test_accuracy = model.score(X_test, y_test)
test_accuracy

0.9792817679558011

In [10]:
# Predicted class labels for the held-out test set.
# NOTE: despite the name, `train_preds` holds predictions for X_test, not for
# the training data; the name is kept because later cells reference it.
train_preds = model.predict(X_test)

In [12]:
# Per-class probability estimates for the held-out test set (one row per test
# sample, one column per entry of model.classes_).
# NOTE: despite the name, these are computed on X_test, not the training data.
train_preds_proba = model.predict_proba(X_test)

In [19]:
# Map the most probable column of each probability row back to its class
# label, for comparison with the labels returned by model.predict() (the
# visible part of both outputs is identical).
model.classes_[train_preds_proba.argmax(axis=1)]

array(['21-04-04-11', '21-04-42-02', '21-04-04-19', '21-04-02-08',
       '21-04-04-07', '21-04-04-07', '21-18-03-92', '21-04-42-01',
       '21-04-04-11', '21-04-04-10', '21-04-02-08', '21-18-03-92',
       '21-04-42-02', '21-02-90-90', '21-04-42-01', '21-04-04-19',
       '21-04-04-11', '21-04-02-22', '21-04-04-07', '21-04-04-90',
       '21-04-04-07', '21-04-42-02', '21-04-42-02', '21-04-04-22',
       '21-04-04-90', '21-04-04-19', '21-04-04-07', '21-04-02-08',
       '21-04-04-17', '21-04-04-07', '21-04-04-17', '21-04-42-02',
       '21-04-04-11', '21-04-02-08', '21-04-04-17', '21-04-04-17',
       '21-04-04-07', '21-04-42-02', '21-04-04-19', '21-04-04-07',
       '21-04-04-90', '21-04-04-07', '21-04-42-02', '21-04-04-90',
       '21-04-04-10', '21-04-04-17', '21-04-04-25', '21-04-02-08',
       '21-04-04-90', '21-04-42-02', '21-04-04-07', '21-04-42-01',
       '21-04-02-08', '21-04-04-11', '21-04-04-90', '21-04-42-02',
       '21-04-42-02', '21-04-04-07', '21-04-04-07', '21-04-04-

In [11]:
# Display the predicted labels for the test set (see the naming note where
# `train_preds` is assigned: these are test-set predictions).
train_preds

array(['21-04-04-11', '21-04-42-02', '21-04-04-19', '21-04-02-08',
       '21-04-04-07', '21-04-04-07', '21-18-03-92', '21-04-42-01',
       '21-04-04-11', '21-04-04-10', '21-04-02-08', '21-18-03-92',
       '21-04-42-02', '21-02-90-90', '21-04-42-01', '21-04-04-19',
       '21-04-04-11', '21-04-02-22', '21-04-04-07', '21-04-04-90',
       '21-04-04-07', '21-04-42-02', '21-04-42-02', '21-04-04-22',
       '21-04-04-90', '21-04-04-19', '21-04-04-07', '21-04-02-08',
       '21-04-04-17', '21-04-04-07', '21-04-04-17', '21-04-42-02',
       '21-04-04-11', '21-04-02-08', '21-04-04-17', '21-04-04-17',
       '21-04-04-07', '21-04-42-02', '21-04-04-19', '21-04-04-07',
       '21-04-04-90', '21-04-04-07', '21-04-42-02', '21-04-04-90',
       '21-04-04-10', '21-04-04-17', '21-04-04-25', '21-04-02-08',
       '21-04-04-90', '21-04-42-02', '21-04-04-07', '21-04-42-01',
       '21-04-02-08', '21-04-04-11', '21-04-04-90', '21-04-42-02',
       '21-04-42-02', '21-04-04-07', '21-04-04-07', '21-04-04-

In [None]:
# Impurity-based feature importances of the baseline forest, plus the same
# values rescaled so that they sum to one.
imp = model.feature_importances_
total_importance = imp.sum()
imp_relative = imp / total_importance

In [None]:
# Column positions of the most important features. argsort() orders ascending,
# so the last N_TOP_FEATURES positions are the largest importances (least
# important of them first).
N_TOP_FEATURES = 20
top_feats = imp.argsort()[-N_TOP_FEATURES:]

In [None]:
# Restrict both feature matrices to the selected top-feature columns
# (positional selection, same column order in train and test).
X_train_filtered, X_test_filtered = (
    frame.iloc[:, top_feats] for frame in (X_train, X_test)
)

In [None]:
# Re-attach the label column to each reduced feature matrix and write the
# result to DATA_PATH without the index.
for file_name, feature_frame, labels in (
    ("train_filtered.csv", X_train_filtered, y_train),
    ("test_filtered.csv", X_test_filtered, y_test),
):
    feature_frame.assign(**{TARGET: labels}).to_csv(DATA_PATH / file_name, index=False)

In [None]:
# Same forest configuration as the baseline model, trained only on the
# selected top-feature columns.
model_filtered = RandomForestClassifier(
    n_estimators=20,
    n_jobs=-1,
    random_state=23,
).fit(X_train_filtered, y_train)

In [None]:
# Mean accuracy of the reduced-feature forest on the held-out test set.
filtered_test_accuracy = model_filtered.score(X_test_filtered, y_test)
filtered_test_accuracy

In [None]:
# Permutation importance of each selected feature, measured as the drop in
# test-set accuracy when that feature's values are shuffled.
# The shuffles are random, so random_state is fixed to make the reported
# importances reproducible across runs (same seed as the forests above).
perm_imp = permutation_importance(
    model_filtered,
    X_test_filtered,
    y_test,
    scoring="accuracy",
    n_jobs=-1,
    random_state=23,
)

In [None]:
# Tabulate the mean permutation importance per feature, most important first.
perm_imp_table = pd.DataFrame(
    {
        "feature": X_train_filtered.columns,
        "perm_imp": perm_imp["importances_mean"],
    }
)
perm_imp_table.sort_values("perm_imp", ascending=False)

In [None]:
# Partial dependence of the class "21-02-90-90" on the feature "bits" for the
# reduced-feature forest, evaluated on the test set.
# Assumes "bits" is one of the selected columns in X_test_filtered — TODO confirm.
# NOTE(review): plot_partial_dependence is deprecated in newer scikit-learn
# releases in favour of PartialDependenceDisplay.from_estimator — confirm
# against the installed version.
plot_partial_dependence(
    model_filtered,
    X_test_filtered,
    features=["bits"],
    feature_names=X_test_filtered.columns,
    target="21-02-90-90",
)