In [42]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

from tqdm.notebook import tqdm

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.linear_model import PassiveAggressiveClassifier
import catboost

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

In [43]:
df = pd.read_pickle('crowd_train_all_data_embedded.pkl')

In [44]:
def get_dataframe(df, min_count, max_count):
    df_temp = df.copy()

    freq = df_temp.source_id.value_counts()
    freq = freq[min_count <= freq]
    df_temp = df_temp[df.source_id.isin(freq.index)]

    df_temp = df_temp.groupby('source_id').apply(
        lambda x: x.nlargest(max_count, 'duration')
    ).reset_index(drop=True).drop(columns=['duration', 'hash_id', 'annotator_emo', 'golden_emo', 'annotator_id', 'speaker_text', 'speaker_emo'])

    df_temp.dropna(inplace=True)
    
    print(df_temp.source_id.nunique())
    
    return df_temp

In [172]:
df_t = get_dataframe(df, 100, 10)

354


In [46]:
df_t = get_dataframe(df, 200, 10)

102


In [173]:
X = np.vstack(df_t.audio_feature.to_numpy()).reshape(df_t.shape[0] , -1)
le = LabelEncoder()
y = le.fit_transform(df_t.source_id)
X.shape, y.shape

((3540, 498), (3540,))

In [174]:
X = X[:, 36 + 384: 36 + 384 + 60] #mfcc only

In [175]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)
X_train.shape, y_val.shape

((1770, 60), (1770,))

In [176]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [177]:
len(y_train[y_train==4]), len(y_val[y_val==4])

(5, 5)

## Метод опорных векторов

In [178]:
from sklearn.svm import SVC

In [98]:
model_params = {
    'kernel': 'linear',
    "decision_function_shape": 'ovr',
    "kernel": 'rbf',
    }
model_svc = SVC(**model_params)

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    "decision_function_shape": ['ovr', 'ovo'],
    "kernel": ['linear', 'rbf', 'sigmoid'],
}

param_grid = {
    'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
}

grid_search = GridSearchCV(model_svc, param_grid, cv=5, scoring='f1_weighted', verbose=2)

grid_search.fit(X_train_scaled, y_train)
grid_search.best_params_, grid_search.best_score_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ................................................C=1; total time=   0.0s
[CV] END ................................................C=1; total time=   0.0s
[CV] END ................................................C=1; total time=   0.0s
[CV] END ................................................C=1; total time=   0.0s
[CV] END ................................................C=1; total time=   0.0s
[CV] END ................................................C=2; total time=   0.0s
[CV] END ................................................C=2; total time=   0.0s
[CV] END ................................................C=2; total time=   0.0s
[CV] END ................................................C=2; total time=   0.0s
[CV] END ................................................C=2; total time=   0.0s
[CV] END ................................................C=3; total time=   0.0s
[CV] END .......................................

({'C': 2}, 0.8027450980392157)

In [100]:
model_params

{'kernel': 'rbf', 'decision_function_shape': 'ovr', 'C': 2}

In [179]:
model_params = {'kernel': 'rbf', 'decision_function_shape': 'ovr', 'C': 5}
# model_params.update(**grid_search.best_params_)
model_svc = SVC(**model_params)

model_svc.fit(X_train_scaled, y_train)
predicts = model_svc.predict(X_val_scaled)

print(metrics.f1_score(y_train, model_svc.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_val, predicts, average="weighted"))
print(metrics.f1_score(y_val, predicts, average="micro"))
print(metrics.accuracy_score(y_val, predicts))

0.9994293214632197

0.7385117628761696
0.7446327683615818
0.7446327683615819


In [181]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"only mfcc {len(set(y))} classes")

run_name = "svc gsc"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {'kernel': 'rbf', 'decision_function_shape': 'ovr', 'C': 5}
    model_svc = SVC(**model_params)

    model_svc.fit(X_train_scaled, y_train)
    predicts = model_svc.predict(X_val_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_svc.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_val, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_val, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_val, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_svc, 
        input_example=X_val[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )

## KNN

In [182]:
from sklearn.neighbors import KNeighborsClassifier

In [109]:
model_params = {
    }
model_knn = KNeighborsClassifier(**model_params)

param_grid = {
    'n_neighbors': np.arange(1, 6),
    "weights": ['uniform', 'distance'],
    "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
    "p": [1, 2, 3],
}

grid_search = GridSearchCV(model_knn, param_grid, cv=5, scoring='f1_weighted', verbose=2)

grid_search.fit(X_train_scaled, y_train)
grid_search.best_params_, grid_search.best_score_

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END algorithm=auto, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=1, p=2, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=1, p

({'algorithm': 'auto', 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'},
 0.7384967320261439)

In [183]:
model_params = {
    'n_neighbors': 1,
    'algorithm': 'auto',
    'weights': 'uniform',
    'p': 1,
    }
model_knn = KNeighborsClassifier(**model_params)

model_knn.fit(X_train_scaled, y_train)
predicts = model_knn.predict(X_val_scaled)

print(metrics.f1_score(y_train, model_knn.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_val, predicts, average="weighted"))
print(metrics.f1_score(y_val, predicts, average="micro"))
print(metrics.accuracy_score(y_val, predicts))

1.0

0.67957924702756
0.6875706214689266
0.6875706214689266


In [184]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"only mfcc {len(set(y))} classes")

run_name = "knn normilized manhattan"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
    'n_neighbors': 1,
    'algorithm': 'auto',
    'weights': 'uniform',
    'p': 1,
    }
    model_knn = KNeighborsClassifier(**model_params)

    model_knn.fit(X_train_scaled, y_train)
    predicts = model_knn.predict(X_val_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_knn.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_val, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_val, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_val, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_knn, 
        input_example=X_val[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )

## Наивный баес

In [118]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

In [122]:
model_params = {
    }
model_nb = GaussianNB(**model_params)

param_grid = {
    "var_smoothing": np.linspace(0.00001, 0.2, 1_000)
}

grid_search = GridSearchCV(model_nb, param_grid, cv=5, scoring='f1_weighted', verbose=2)

grid_search.fit(X_train_scaled, y_train)
grid_search.best_params_, grid_search.best_score_

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
[CV] END ................................var_smoothing=1e-05; total time=   0.0s
[CV] END ................................var_smoothing=1e-05; total time=   0.0s
[CV] END ................................var_smoothing=1e-05; total time=   0.0s
[CV] END ................................var_smoothing=1e-05; total time=   0.0s
[CV] END ................................var_smoothing=1e-05; total time=   0.0s
[CV] END ................var_smoothing=0.0002101901901901902; total time=   0.0s
[CV] END ................var_smoothing=0.0002101901901901902; total time=   0.0s
[CV] END ................var_smoothing=0.0002101901901901902; total time=   0.0s
[CV] END ................var_smoothing=0.0002101901901901902; total time=   0.0s
[CV] END ................var_smoothing=0.0002101901901901902; total time=   0.0s
[CV] END ................var_smoothing=0.0004103803803803804; total time=   0.0s
[CV] END ................var_smoothing=0.000

({'var_smoothing': 0.12472848848848848}, 0.7905228758169935)

In [124]:
model_params = {
    "var_smoothing": 0.12472848848848848
    }
model_nb = GaussianNB(**model_params)
scaler = StandardScaler()

model_nb.fit(X_train_scaled, y_train)
predicts = model_nb.predict(X_val_scaled)

print(metrics.f1_score(y_train, model_nb.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_val, predicts, average="weighted"))
print(metrics.f1_score(y_val, predicts, average="micro"))
print(metrics.accuracy_score(y_val, predicts))

0.9980194097841157

0.8266765369706547
0.8333333333333334
0.8333333333333334


In [125]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"only mfcc {len(set(y))} classes")

run_name = "naive bayes gscv"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
    "var_smoothing": 0.12472848848848848
    }
    model_nb = GaussianNB(**model_params)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X)

    model_nb.fit(X_train_scaled, y_train)
    predicts = model_nb.predict(X_val_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_knn.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_val, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_val, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_val, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_nb, 
        input_example=X_val[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )

## Случайный лес

In [126]:
from sklearn.ensemble import RandomForestClassifier

In [139]:
model_params = {
    "random_state": 42,
    }
model_nb = RandomForestClassifier(**model_params)

param_grid = {
    "min_samples_split": np.arange(1, 6),
    "min_samples_leaf": np.arange(1, 6),
}

grid_search = GridSearchCV(model_nb, param_grid, cv=5, scoring='f1_weighted', verbose=10)

grid_search.fit(X_train, y_train)
grid_search.best_params_, grid_search.best_score_

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5; 1/25] START min_samples_leaf=1, min_samples_split=1....................
[CV 1/5; 1/25] END min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 2/5; 1/25] START min_samples_leaf=1, min_samples_split=1....................
[CV 2/5; 1/25] END min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 3/5; 1/25] START min_samples_leaf=1, min_samples_split=1....................
[CV 3/5; 1/25] END min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 4/5; 1/25] START min_samples_leaf=1, min_samples_split=1....................
[CV 4/5; 1/25] END min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 5/5; 1/25] START min_samples_leaf=1, min_samples_split=1....................
[CV 5/5; 1/25] END min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 1/5; 2/25] START min_samples_leaf=1, min_samples_split=2....................
[C

25 fits failed out of a total of 125.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Никита\AppData\Roaming\Python\Python39\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Никита\AppData\Roaming\Python\Python39\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Никита\AppData\Roaming\Python\Python39\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Никита\AppData\Roaming\Python\Python39\site-packages\sklearn\utils\_param_validati

({'min_samples_leaf': 1, 'min_samples_split': 3}, 0.7190849673202615)

In [140]:
model_params = {
    # 'max_depth': None,
    "min_samples_split": 3,
    "min_samples_leaf": 1,
    # "max_samples": None,
    "random_state": 42,
    "warm_start": True,
    "n_estimators": 50,
    }
model_dt = RandomForestClassifier(**model_params)
scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
X_train_scaled = X_train
X_test_scaled = X_val

model_dt.fit(X_train_scaled, y_train)
predicts = model_dt.predict(X_test_scaled)

print(metrics.f1_score(y_train, model_dt.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_val, predicts, average="weighted"))
print(metrics.f1_score(y_val, predicts, average="micro"))
print(metrics.accuracy_score(y_val, predicts))

1.0

0.7362009450244746
0.7549019607843137
0.7549019607843137


In [223]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"{len(set(y))} classes, 5 per class")

run_name = "random forest"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
    # 'max_depth': None,
    "min_samples_split": 3,
    "min_samples_leaf": 1,
    # "max_samples": None,
    "random_state": 42,
    "warm_start": True
    }
    model_rand_f = RandomForestClassifier(**model_params)

    scaler = StandardScaler()
    # X_train_scaled = scaler.fit_transform(X_train)
    # X_test_scaled = scaler.transform(X_test)
    X_train_scaled = X_train
    X_test_scaled = X_test

    model_rand_f.fit(X_train_scaled, y_train)
    predicts = model_rand_f.predict(X_test_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_rand_f.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_rand_f, 
        input_example=X_test[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )

## Логистическая регрессия

In [142]:
import mlflow
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression

In [None]:
[:, :36], [:, 36: 36 + 384], [:, 36 + 384: 36 + 384 + 60], [:, 36 + 384 + 60:]

In [166]:
model_params = {
        "max_iter": 10_000,
        "solver": "lbfgs",
        "penalty": "l2",
        "C": 1, 
    }
model_lr = LogisticRegression(**model_params)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model_lr.fit(X_train_scaled, y_train)
predicts = model_lr.predict(X_test_scaled)

print(metrics.f1_score(y_train, model_lr.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_val, predicts, average="weighted"))
print(metrics.f1_score(y_val, predicts, average="micro"))
print(metrics.accuracy_score(y_val, predicts))

1.0

0.842693961811609
0.8470588235294118
0.8470588235294118


In [146]:
model_params = {
        "max_iter": 10_000,
        "solver": "newton-cg",
        'penalty': 'l2',
    }
model_lr = LogisticRegression(**model_params)

param_grid = {
    'C': np.arange(50, 121, 10)
}

grid_search = GridSearchCV(model_lr, param_grid, cv=5, scoring='f1_weighted', verbose=2)

grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ...............................................C=50; total time=   0.4s
[CV] END ...............................................C=50; total time=   0.1s
[CV] END ...............................................C=50; total time=   0.1s
[CV] END ...............................................C=50; total time=   0.1s
[CV] END ...............................................C=50; total time=   0.1s
[CV] END ...............................................C=60; total time=   0.1s
[CV] END ...............................................C=60; total time=   0.1s
[CV] END ...............................................C=60; total time=   0.1s
[CV] END ...............................................C=60; total time=   0.1s
[CV] END ...............................................C=60; total time=   0.1s
[CV] END ...............................................C=70; total time=   0.1s
[CV] END ........................................

In [147]:
grid_search.best_params_, grid_search.best_score_

({'C': 50}, 0.7928104575163398)

In [149]:
model_params = {
        "max_iter": 10_000,
        "solver": "newton-cg"
    }
#model_params.update(**grid_search.best_params_)
model_lr = LogisticRegression(**model_params)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model_lr.fit(X_train_scaled, y_train)
predicts = model_lr.predict(X_test_scaled)

print(metrics.f1_score(y_train, model_lr.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_val, predicts, average="weighted"))
print(metrics.f1_score(y_val, predicts, average="micro"))
print(metrics.accuracy_score(y_val, predicts))

1.0

0.842693961811609
0.8470588235294118
0.8470588235294118


In [51]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment("354 classes, 5 per class")

run_name = "logreg normilized"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
        "max_iter": 1_000,
    }
    model_lr = LogisticRegression(**model_params)
    model_lr.fit(X_train, y_train)
    predicts = model_lr.predict(X_test)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_lr.predict(X_train), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_lr, 
        input_example=X_test[:10], 
        artifact_path=f"mlflow/{run_name}/model"
    )

2024/05/04 21:46:26 INFO mlflow.tracking.fluent: Experiment with name '354 classes, 5 per class' does not exist. Creating a new experiment.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


нормализация

In [168]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"only mfcc {len(set(y))} classes")

run_name = "logreg normilized l2 newton-cg"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
        "max_iter": 10_000,
        "solver": "newton-cg",
    }
    model_lr = LogisticRegression(**model_params)

    model_lr.fit(X_train_scaled, y_train)
    predicts = model_lr.predict(X_val_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_knn.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_val, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_val, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_val, predicts))
    
    mlflow.log_params(model_params)

    mlflow.sklearn.log_model(
        sk_model=model_lr, 
        input_example=X_val[:10], 
        artifact_path=f"mlflow/{run_name}/model")

In [52]:
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

0.5076827693635165
0.5203389830508475
0.5203389830508475


## Бэггинг

In [43]:
from sklearn.ensemble import BaggingClassifier

In [44]:
estimator_params = {
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    }
model_dt = DecisionTreeClassifier(**estimator_params)
bagging_params = {
        "estimator": model_dt,
        "random_state": 42,
        "verbose": 10,
        "n_jobs": -1,
        "n_estimators": 95,
        "max_samples": 1.0,
        "max_features": 0.6,
    }

bagging = BaggingClassifier(**bagging_params)
scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
X_train_scaled = X_train
X_test_scaled = X_test

bagging.fit(X_train_scaled, y_train)
predicts = bagging.predict(X_test_scaled)

print(metrics.f1_score(y_train, bagging.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_test, predicts, average="weighted"))
print(metrics.f1_score(y_test, predicts, average="micro"))
print(metrics.accuracy_score(y_test, predicts))

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:    3.7s remaining:    7.4s
[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:    3.7s remaining:    3.7s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    3.7s remaining:    1.8s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    4.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Batch computation too fast (0.05103659629821777s.) Setting batch_size=2.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Batch computation too fast (0.04702973

1.0

0.7351108042284512
0.7529411764705881
0.7529411764705882


[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    0.0s finished


In [281]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"{len(set(y))} classes, 5 per class")

run_name = "bagging base decision tree"

with mlflow.start_run(run_name=run_name) as run:
    estimator_params = {
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    }
    model_dt = DecisionTreeClassifier(**estimator_params)
    bagging_params = {
            "estimator": model_dt,
            "random_state": 42,
            "verbose": 10,
            "n_jobs": -1,
            "n_estimators": 95,
            "max_samples": 1.0,
            "max_features": 0.6,
        }

    bagging = BaggingClassifier(**bagging_params)
    X_train_scaled = X_train
    X_test_scaled = X_test

    bagging.fit(X_train_scaled, y_train)
    predicts = bagging.predict(X_test_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, bagging.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_test, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_test, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_test, predicts))
    
    mlflow.log_params(estimator_params)
    mlflow.log_params(bagging_params)

    # mlflow.sklearn.log_model(
    #     sk_model=bagging, 
    #     input_example=X_test[:10], 
    #     artifact_path=f"mlflow/{run_name}/model"
    # )

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:   25.5s remaining:   51.2s
[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:   28.0s remaining:   28.0s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:   28.1s remaining:   14.0s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:   30.4s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Batch computation too fast (0.09124541282653809s.) Setting batch_size=2.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Batch computation too fast (0.09725856

## Категориальный бустинг

In [169]:
model_params = {
        "task_type": 'GPU',
        "devices": '0',
        "iterations": 1_000,
    }
model_cb = catboost.CatBoostClassifier(verbose=10, **model_params)

model_cb.fit(X_train_scaled, y_train)
predicts = model_cb.predict(X_val_scaled)

print(metrics.f1_score(y_train, model_cb.predict(X_train_scaled), average="weighted"))
print()
print(metrics.f1_score(y_val, predicts, average="weighted"))
print(metrics.f1_score(y_val, predicts, average="micro"))
print(metrics.accuracy_score(y_val, predicts))

Learning rate set to 0.056575
0:	learn: 4.5906121	total: 76.8ms	remaining: 1m 16s
10:	learn: 4.2427916	total: 488ms	remaining: 43.9s
20:	learn: 3.9355773	total: 887ms	remaining: 41.4s
30:	learn: 3.6497056	total: 1.25s	remaining: 39.2s
40:	learn: 3.3659098	total: 1.62s	remaining: 37.9s
50:	learn: 3.1430743	total: 1.96s	remaining: 36.5s
60:	learn: 2.8895484	total: 2.37s	remaining: 36.4s
70:	learn: 2.6507566	total: 2.75s	remaining: 36s
80:	learn: 2.4154144	total: 3.16s	remaining: 35.9s
90:	learn: 2.2270211	total: 3.54s	remaining: 35.4s
100:	learn: 2.0283282	total: 3.95s	remaining: 35.1s
110:	learn: 1.8484008	total: 4.36s	remaining: 34.9s
120:	learn: 1.6879849	total: 4.76s	remaining: 34.6s
130:	learn: 1.5397555	total: 5.18s	remaining: 34.4s
140:	learn: 1.4058073	total: 5.6s	remaining: 34.1s
150:	learn: 1.2786311	total: 6s	remaining: 33.7s
160:	learn: 1.1653281	total: 6.41s	remaining: 33.4s
170:	learn: 1.0672422	total: 6.82s	remaining: 33.1s
180:	learn: 0.9813127	total: 7.23s	remaining: 32.

In [171]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.set_experiment(f"only mfcc {len(set(y))} classes")

run_name = "catboost normalized base"

with mlflow.start_run(run_name=run_name) as run:
    model_params = {
        "task_type": 'GPU',
        "devices": '0',
        "iterations": 1000,
    }
    model_cb = catboost.CatBoostClassifier(verbose=10, **model_params)
    model_cb.fit(X_train_scaled, y_train)
    predicts = model_cb.predict(X_val_scaled)

    mlflow.log_metric("train f1_weighted", metrics.f1_score(y_train, model_cb.predict(X_train_scaled), average="weighted"))
    mlflow.log_metric("f1_weighted", metrics.f1_score(y_val, predicts, average="weighted"))
    mlflow.log_metric("f1_micro", metrics.f1_score(y_val, predicts, average="micro"))
    mlflow.log_metric("accuracy", metrics.accuracy_score(y_val, predicts))
    
    mlflow.log_params(model_params)

    # mlflow.sklearn.log_model(
    #     sk_model=model_cb, 
    #     input_example=X_val[:10], 
    #     artifact_path=f"mlflow/{run_name}/model"
    # )

Learning rate set to 0.056575
0:	learn: 4.5906121	total: 36.4ms	remaining: 36.3s
10:	learn: 4.2427916	total: 416ms	remaining: 37.4s
20:	learn: 3.9355773	total: 803ms	remaining: 37.4s
30:	learn: 3.6497056	total: 1.2s	remaining: 37.6s
40:	learn: 3.3659098	total: 1.57s	remaining: 36.8s
50:	learn: 3.1430743	total: 1.92s	remaining: 35.8s
60:	learn: 2.8895484	total: 2.32s	remaining: 35.8s
70:	learn: 2.6507566	total: 2.78s	remaining: 36.3s
80:	learn: 2.4154144	total: 3.31s	remaining: 37.5s
90:	learn: 2.2270211	total: 3.69s	remaining: 36.8s
100:	learn: 2.0283282	total: 4.1s	remaining: 36.5s
110:	learn: 1.8484008	total: 4.51s	remaining: 36.1s
120:	learn: 1.6879849	total: 4.91s	remaining: 35.7s
130:	learn: 1.5397555	total: 5.33s	remaining: 35.4s
140:	learn: 1.4058073	total: 5.76s	remaining: 35.1s
150:	learn: 1.2786311	total: 6.16s	remaining: 34.6s
160:	learn: 1.1653281	total: 6.57s	remaining: 34.2s
170:	learn: 1.0672422	total: 6.99s	remaining: 33.9s
180:	learn: 0.9813127	total: 7.41s	remaining: 

In [37]:
preds = model_cb.predict(X_train)
    
print(f'F1: {metrics.f1_score(y_train, preds, average="weighted")}')
print()

F1: 1.0



In [38]:
preds = model_cb.predict(X_test)
    
print(f'F1: {metrics.f1_score(y_test, preds, average="weighted")}')
print()

F1: 0.8558466859937449



In [89]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.50      0.60      0.55         5
           1       0.50      0.40      0.44         5
           2       0.60      0.60      0.60         5
           3       1.00      0.60      0.75         5
           4       0.80      0.80      0.80         5
           5       1.00      1.00      1.00         5
           6       0.75      0.60      0.67         5
           7       1.00      0.40      0.57         5
           8       0.71      1.00      0.83         5
           9       1.00      1.00      1.00         5
          10       0.83      1.00      0.91         5
          11       1.00      0.80      0.89         5
          12       1.00      1.00      1.00         5
          13       0.67      0.80      0.73         5
          14       1.00      0.80      0.89         5
          15       0.75      0.60      0.67         5
          16       1.00      1.00      1.00         5
          17       0.60    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [100]:
preds = model.predict_proba(X_test)
confidences = np.max(preds, axis=1)
pred_label = np.argmax(preds, axis=1)

In [103]:
true_conf = []
false_conf = []
for i, (conf, pred_) in enumerate(zip(confidences, pred_label)):
    if y_test[i] != pred_:
        false_conf.append(conf)
    else:
        true_conf.append(conf)

In [104]:
sum(true_conf)/len(true_conf), sum(false_conf)/len(false_conf)

(0.493037544467759, 0.17304315771004875)

In [99]:
metrics.f1_score(y_test, pred_label, average="weighted")

0.7986898353299151

In [90]:
model.save_model("catboost_5_score0_8")

In [31]:
model = catboost.CatBoostClassifier()

model.load_model("catboost_20_score0_933")

<catboost.core.CatBoostClassifier at 0x1d3c198c4c0>