In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

from tqdm.notebook import tqdm

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.linear_model import PassiveAggressiveClassifier
import catboost

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_pickle('crowd_train_all_data_embedded.pkl')

In [3]:
def get_dataframe(df, min_count, max_count):
    df_temp = df.copy()

    freq = df_temp.source_id.value_counts()
    freq = freq[min_count <= freq]
    df_temp = df_temp[df.source_id.isin(freq.index)]

    df_temp = df_temp.groupby('source_id').apply(
        lambda x: x.nlargest(max_count, 'duration')
    ).reset_index(drop=True).drop(columns=['duration', 'hash_id', 'annotator_emo', 'golden_emo', 'annotator_id', 'speaker_text', 'speaker_emo'])

    df_temp.dropna(inplace=True)
    
    print(df_temp.source_id.nunique())
    
    return df_temp

In [4]:
df_t = get_dataframe(df, 100, 10)

354


In [4]:
df_t = get_dataframe(df, 200, 10)

102


In [5]:
X = np.vstack(df_t.audio_feature.to_numpy()).reshape(df_t.shape[0] , -1)
le = LabelEncoder()
y = le.fit_transform(df_t.source_id)
X.shape, y.shape

((1020, 498), (1020,))

In [30]:
X = X[:, 36 + 384: 36 + 384 + 60] #mfcc only

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)
X_train.shape, X_test.shape

((510, 60), (510, 60))

In [32]:
len(y_train[y_train==4]), len(y_test[y_test==4])

(5, 5)

## Метод опорных векторов

In [33]:
from sklearn.svm import SVC

In [34]:
model_params = {
    'kernel': 'linear',
    "decision_function_shape": 'ovr',
    'C': 1
    }
model_svc = SVC(**model_params)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model_svc.fit(X_train_scaled, y_train)
predicts = model_svc.predict(X_test_scaled)

print(metrics.f1_score(y_train, model_svc.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

1.0

0.8530271471447942
0.8549019607843138
0.8549019607843137


In [10]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"{len(set(y))} classes, 5 per class")

run_name = "svc normilized linear kernel"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
    'kernel': 'linear',
    "decision_function_shape": 'ovr',
    'C': 1
    }
    model_svc = SVC(**model_params)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model_svc.fit(X_train_scaled, y_train)
    predicts = model_svc.predict(X_test_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_svc.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_svc, 
        input_example=X_test[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )



## KNN

In [35]:
from sklearn.neighbors import KNeighborsClassifier

In [36]:
model_params = {
    'n_neighbors': 1,
    'algorithm': 'auto',
    'weights': 'uniform',
    'p': 1,
    }
model_knn = KNeighborsClassifier(**model_params)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# X_train_scaled = X_train
# X_test_scaled = X_test

model_knn.fit(X_train_scaled, y_train)
predicts = model_knn.predict(X_test_scaled)

print(metrics.f1_score(y_train, model_knn.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

1.0

0.8208253892077423
0.8254901960784313
0.8254901960784313


In [13]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"{len(set(y))} classes, 5 per class")

run_name = "knn normilized manhattan"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
    'n_neighbors': 1,
    'algorithm': 'auto',
    'weights': 'uniform',
    'p': 1,
    }
    model_knn = KNeighborsClassifier(**model_params)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model_knn.fit(X_train_scaled, y_train)
    predicts = model_knn.predict(X_test_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_knn.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_knn, 
        input_example=X_test[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )

## Наивный баес

In [37]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

In [38]:
model_params = {
    "var_smoothing": 0.068
    }
model_nb = GaussianNB(**model_params)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# X_train_scaled = X_train
# X_test_scaled = X_test

model_nb.fit(X_train_scaled, y_train)
predicts = model_nb.predict(X_test_scaled)

print(metrics.f1_score(y_train, model_nb.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

0.9980194097841157

0.8136265471040558
0.8196078431372549
0.8196078431372549


In [160]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"{len(set(y))} classes, 5 per class")

run_name = "naive bayes manhattan"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
        "priors": [1/102]*102,
        "var_smoothing": 0.068
        }
    model_nb = GaussianNB(**model_params)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model_nb.fit(X_train_scaled, y_train)
    predicts = model_nb.predict(X_test_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_nb.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_nb, 
        input_example=X_test[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )

## Случайный лес

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
model_params = {
    # 'max_depth': None,
    "min_samples_split": 3,
    "min_samples_leaf": 1,
    # "max_samples": None,
    "random_state": 42,
    "warm_start": True
    }
model_dt = RandomForestClassifier(**model_params)
scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
X_train_scaled = X_train
X_test_scaled = X_test

model_dt.fit(X_train_scaled, y_train)
predicts = model_dt.predict(X_test_scaled)

print(metrics.f1_score(y_train, model_dt.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

1.0

0.773562766209825
0.7921568627450981
0.792156862745098


In [223]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"{len(set(y))} classes, 5 per class")

run_name = "random forest"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
    # 'max_depth': None,
    "min_samples_split": 3,
    "min_samples_leaf": 1,
    # "max_samples": None,
    "random_state": 42,
    "warm_start": True
    }
    model_rand_f = RandomForestClassifier(**model_params)

    scaler = StandardScaler()
    # X_train_scaled = scaler.fit_transform(X_train)
    # X_test_scaled = scaler.transform(X_test)
    X_train_scaled = X_train
    X_test_scaled = X_test

    model_rand_f.fit(X_train_scaled, y_train)
    predicts = model_rand_f.predict(X_test_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_rand_f.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_rand_f, 
        input_example=X_test[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )

## Логистическая регрессия

In [8]:
import mlflow
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression

In [None]:
[:, :36], [:, 36: 36 + 384], [:, 36 + 384: 36 + 384 + 60], [:, 36 + 384 + 60:]

In [26]:
X_train_scaled[:, 36 + 384 + 60: 36 + 384 + 60 + 18].shape

(510, 18)

In [41]:
model_params = {
        "max_iter": 10_000,
        "solver": "newton-cg"
    }
model_lr = LogisticRegression(**model_params)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model_lr.fit(X_train_scaled, y_train)
predicts = model_lr.predict(X_test_scaled)

print(metrics.f1_score(y_train, model_lr.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

1.0

0.842693961811609
0.8470588235294118
0.8470588235294118


In [57]:
model_params = {
        "max_iter": 10_000,
        "solver": "newton-cg",
        'penalty': 'l2',
    }
model_lr = LogisticRegression(**model_params)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

param_grid = {
    'C': np.arange(50, 121, 10)
}

grid_search = GridSearchCV(model_lr, param_grid, cv=5, scoring='f1_weighted', verbose=2)

grid_search.fit(X_scaled, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ...............................................C=50; total time=   0.7s
[CV] END ...............................................C=50; total time=   0.3s
[CV] END ...............................................C=50; total time=   0.3s
[CV] END ...............................................C=50; total time=   0.4s
[CV] END ...............................................C=50; total time=   0.3s
[CV] END ...............................................C=60; total time=   0.3s
[CV] END ...............................................C=60; total time=   0.2s
[CV] END ...............................................C=60; total time=   0.3s
[CV] END ...............................................C=60; total time=   0.3s
[CV] END ...............................................C=60; total time=   0.3s
[CV] END ...............................................C=70; total time=   0.3s
[CV] END ........................................

In [58]:
grid_search.best_params_, grid_search.best_score_

({'C': 60}, 0.8922595704948646)

In [59]:
test_accuracy = grid_search.score(X_test_scaled, y_test)
print("Точность на тестовой выборке:", test_accuracy)

Точность на тестовой выборке: 1.0


In [51]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment("354 classes, 5 per class")

run_name = "logreg normilized base"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
        "max_iter": 1_000,
    }
    model_lr = LogisticRegression(**model_params)
    model_lr.fit(X_train, y_train)
    predicts = model_lr.predict(X_test)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_lr.predict(X_train), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_lr, 
        input_example=X_test[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )

2024/05/04 21:46:26 INFO mlflow.tracking.fluent: Experiment with name '354 classes, 5 per class' does not exist. Creating a new experiment.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


нормализация

In [32]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"{len(set(y))} classes, 5 per class")

run_name = "logreg normilized l2 newton-cg"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
        "max_iter": 1_000,
        "solver": "newton-cg",
        'penalty': 'l2',
        'C': 74,
    }
    model_lr = LogisticRegression(**model_params)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model_lr.fit(X_train_scaled, y_train)
    predicts = model_lr.predict(X_test_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_lr.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_lr, 
        input_example=X_test[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )



In [52]:
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

0.5076827693635165
0.5203389830508475
0.5203389830508475


## Бэггинг

In [43]:
from sklearn.ensemble import BaggingClassifier

In [44]:
estimator_params = {
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    }
model_dt = DecisionTreeClassifier(**estimator_params)
bagging_params = {
        "estimator": model_dt,
        "random_state": 42,
        "verbose": 10,
        "n_jobs": -1,
        "n_estimators": 95,
        "max_samples": 1.0,
        "max_features": 0.6,
    }

bagging = BaggingClassifier(**bagging_params)
scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
X_train_scaled = X_train
X_test_scaled = X_test

bagging.fit(X_train_scaled, y_train)
predicts = bagging.predict(X_test_scaled)

print(metrics.f1_score(y_train, bagging.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:    3.7s remaining:    7.4s
[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:    3.7s remaining:    3.7s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    3.7s remaining:    1.8s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    4.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Batch computation too fast (0.05103659629821777s.) Setting batch_size=2.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Batch computation too fast (0.04702973

1.0

0.7351108042284512
0.7529411764705881
0.7529411764705882


[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    0.0s finished


In [281]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"{len(set(y))} classes, 5 per class")

run_name = "bagging base decision tree"

with mlflow.start_run(run_name=run_name) as run:
    estimator_params = {
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    }
    model_dt = DecisionTreeClassifier(**estimator_params)
    bagging_params = {
            "estimator": model_dt,
            "random_state": 42,
            "verbose": 10,
            "n_jobs": -1,
            "n_estimators": 95,
            "max_samples": 1.0,
            "max_features": 0.6,
        }

    bagging = BaggingClassifier(**bagging_params)
    X_train_scaled = X_train
    X_test_scaled = X_test

    bagging.fit(X_train_scaled, y_train)
    predicts = bagging.predict(X_test_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, bagging.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(estimator_params)
    mlflow.log_params(bagging_params)

    # mlflow.sklearn.log_model(
    #     sk_model=bagging, 
    #     input_example=X_test[:10], 
    #     artifact_path=f"mlflow/{run_name}/model"
    # )

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:   25.5s remaining:   51.2s
[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:   28.0s remaining:   28.0s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:   28.1s remaining:   14.0s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:   30.4s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Batch computation too fast (0.09124541282653809s.) Setting batch_size=2.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Batch computation too fast (0.09725856

## Категориальный бустинг

In [46]:
model_params = {
        "task_type": 'GPU',
        "devices": '0',
        "iterations": 1_000,
    }
model_cb = catboost.CatBoostClassifier(verbose=10, **model_params)
scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
X_train_scaled = X_train
X_test_scaled = X_test

model_cb.fit(X_train_scaled, y_train)
predicts = model_cb.predict(X_test_scaled)

print(metrics.f1_score(y_train, model_cb.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

Learning rate set to 0.056575
0:	learn: 4.5906121	total: 39.1ms	remaining: 39.1s
10:	learn: 4.2427916	total: 427ms	remaining: 38.4s
20:	learn: 3.9355773	total: 813ms	remaining: 37.9s
30:	learn: 3.6497056	total: 1.18s	remaining: 37s
40:	learn: 3.3659098	total: 1.55s	remaining: 36.2s
50:	learn: 3.1430743	total: 1.9s	remaining: 35.4s
60:	learn: 2.8895484	total: 2.3s	remaining: 35.5s
70:	learn: 2.6507566	total: 2.69s	remaining: 35.2s
80:	learn: 2.4154144	total: 3.1s	remaining: 35.2s
90:	learn: 2.2270211	total: 3.48s	remaining: 34.8s
100:	learn: 2.0283282	total: 3.89s	remaining: 34.7s
110:	learn: 1.8484008	total: 4.31s	remaining: 34.6s
120:	learn: 1.6879849	total: 4.72s	remaining: 34.3s
130:	learn: 1.5397555	total: 5.14s	remaining: 34.1s
140:	learn: 1.4058073	total: 5.56s	remaining: 33.9s
150:	learn: 1.2786311	total: 5.96s	remaining: 33.5s
160:	learn: 1.1653281	total: 6.39s	remaining: 33.3s
170:	learn: 1.0672422	total: 6.8s	remaining: 33s
180:	learn: 0.9813127	total: 7.23s	remaining: 32.7s


In [54]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment("354 classes, 5 per class")

run_name = "catboost base"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
        "task_type": 'GPU',
        "devices": '0',
        "iterations": 1000,
    }
    model_cb = catboost.CatBoostClassifier(verbose=10, **model_params)
    model_cb.fit(X_train, y_train)
    predicts = model_cb.predict(X_test)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_cb.predict(X_train), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_cb, 
        input_example=X_test[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )

Learning rate set to 0.072924
0:	learn: 5.8082980	total: 829ms	remaining: 13m 48s
10:	learn: 5.2715373	total: 7.99s	remaining: 11m 58s
20:	learn: 4.7758899	total: 14.9s	remaining: 11m 34s
30:	learn: 4.3018111	total: 22s	remaining: 11m 26s
40:	learn: 3.8817046	total: 28.9s	remaining: 11m 17s
50:	learn: 3.5263277	total: 35.9s	remaining: 11m 8s
60:	learn: 3.1635171	total: 43.2s	remaining: 11m 5s
70:	learn: 2.8722896	total: 50.6s	remaining: 11m 1s
80:	learn: 2.5884051	total: 57.9s	remaining: 10m 56s
90:	learn: 2.3457448	total: 1m 5s	remaining: 10m 53s
100:	learn: 2.1352517	total: 1m 12s	remaining: 10m 43s
110:	learn: 1.9470644	total: 1m 19s	remaining: 10m 33s
120:	learn: 1.7526676	total: 1m 26s	remaining: 10m 26s
130:	learn: 1.5830097	total: 1m 33s	remaining: 10m 18s
140:	learn: 1.4335409	total: 1m 40s	remaining: 10m 12s
150:	learn: 1.3047540	total: 1m 47s	remaining: 10m 6s
160:	learn: 1.1937150	total: 1m 55s	remaining: 9m 59s
170:	learn: 1.0962493	total: 2m 2s	remaining: 9m 55s
180:	learn

In [37]:
preds = model_cb.predict(X_train)
    
print(f'F1: {metrics.f1_score(y_train, preds, average="weighted")}')
print()

F1: 1.0



In [38]:
preds = model_cb.predict(X_test)
    
print(f'F1: {metrics.f1_score(y_test, preds, average="weighted")}')
print()

F1: 0.8558466859937449



In [89]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.50      0.60      0.55         5
           1       0.50      0.40      0.44         5
           2       0.60      0.60      0.60         5
           3       1.00      0.60      0.75         5
           4       0.80      0.80      0.80         5
           5       1.00      1.00      1.00         5
           6       0.75      0.60      0.67         5
           7       1.00      0.40      0.57         5
           8       0.71      1.00      0.83         5
           9       1.00      1.00      1.00         5
          10       0.83      1.00      0.91         5
          11       1.00      0.80      0.89         5
          12       1.00      1.00      1.00         5
          13       0.67      0.80      0.73         5
          14       1.00      0.80      0.89         5
          15       0.75      0.60      0.67         5
          16       1.00      1.00      1.00         5
          17       0.60    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [100]:
preds = model.predict_proba(X_test)
confidences = np.max(preds, axis=1)
pred_label = np.argmax(preds, axis=1)

In [103]:
true_conf = []
false_conf = []
for i, (conf, pred_) in enumerate(zip(confidences, pred_label)):
    if y_test[i] != pred_:
        false_conf.append(conf)
    else:
        true_conf.append(conf)

In [104]:
sum(true_conf)/len(true_conf), sum(false_conf)/len(false_conf)

(0.493037544467759, 0.17304315771004875)

In [99]:
metrics.f1_score(y_test, pred_label, average="weighted")

0.7986898353299151

In [90]:
model.save_model("catboost_5_score0_8")

In [31]:
model = catboost.CatBoostClassifier()

model.load_model("catboost_20_score0_933")

<catboost.core.CatBoostClassifier at 0x1d3c198c4c0>