In [4]:
!python3 -m pip install catboost scikit-learn ipywidgets ipython transformers
!python3 -m pip install transformers
!python3 -m pip install torch 

Defaulting to user installation because normal site-packages is not writeable
Collecting catboost
  Downloading catboost-1.2.5-cp39-cp39-macosx_11_0_universal2.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Using cached plotly-5.22.0-py3-none-any.whl.metadata (7.1 kB)
Downloading catboost-1.2.5-cp39-cp39-macosx_11_0_universal2.whl (26.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.3/26.3 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hUsing cached graphviz-0.20.3-py3-none-any.whl (47 kB)
Using cached plotly-5.22.0-py3-none-any.whl (16.4 MB)
Installing collected packages: plotly, graphviz, catboost
Successfully installed catboost-1.2.5 graphviz-0.20.3 plotly-5.22.0
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packag

In [15]:
import sys
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from catboost import Pool, CatBoostClassifier, cv

sys.path.append(str(Path().resolve().parent / 'data'))
import honeypots
import database
import kinopoisk


In [2]:
db = database.make_database()

def load_table(table_name):
  df = pd.DataFrame(columns=[x[0] for x in db.list_query(f'SHOW COLUMNS FROM {table_name};')])
  for i, item in enumerate(db.list_query(f'SELECT * FROM {table_name};')):
    df.loc[i] = [x for x in item]
  return df

In [3]:
df_answers = load_table('answers')
df_aspects = load_table('aspects')
df_reviews = load_table('reviews')

print(df_answers.head())
print(df_answers.shape[0])

                                 review                                aspect  \
0  81ca6e9c-9247-4c5a-9461-56222f1d2943  c9c33581-7574-4b5c-9fea-184977d30231   
1  1d2c9802-a593-49e5-8981-64849a70e9a6  bb6625fd-7d5e-43a8-8709-ff55d3dce54e   
2  1f36dc51-7ce2-489c-9da6-521be692f1f6  7544eeab-22fa-4c9e-bb73-106b67e75be2   
3  65c6e3aa-35c2-479a-9406-69624d6c796d  b6c1ca7e-40a8-4322-b83e-d0da892dd1df   
4  e5c56a65-2cdc-4ec6-ab05-6b99f4ba8afc  b6c1ca7e-40a8-4322-b83e-d0da892dd1df   

   answer            ip session  
0       0  3.78.232.236    None  
1       0  3.78.232.236    None  
2       2  46.39.53.204    None  
3       2  46.39.53.204    None  
4       2  46.39.53.204    None  
8294


In [8]:
aspects_map = {
  df_aspects['id'][x]: df_aspects['aspect'][x]
  for x in range(df_aspects.shape[0])
}
reviews_map = {
  df_reviews['id'][x]: df_reviews['review_body'][x]
  for x in range(df_reviews.shape[0])
}
aspect_state = {1: 'хорошо', 0: 'плохо', 2: 'отсутствует'}

In [9]:
answers = pd.DataFrame(columns=['review', 'aspect', 'answer', 'session'])
user_answers = honeypots.check_users()
user_threshold = 0.17
bad_users = []

for k, v in user_answers.items():
  (incorrect, all) = v
  print(k, incorrect / all, all)
  if incorrect / all < user_threshold: continue
  bad_users.append(k)

for ind, row in df_answers.iterrows():
  review = reviews_map.get(row.review)
  aspect = aspects_map.get(row.aspect)
  answer = aspect_state[row.answer]
  answers.loc[ind] = [review, aspect, answer, row.session]


print('DB length:', answers.shape)
answers = answers.loc[~answers['session'].isin(bad_users)]
print('Resulting:', answers.shape[0])

text_features = ['review']
cat_features = ['aspect']

df3e3dd306f652614e0fc711d2537dccf2e4851a644ee8f916acf4037377e113 0.0 5
975b64d3b54833adc31c16eb8bddcf8bc86e040819216e569ad346790f73dba3 0.3333333333333333 9
3101bc266e078cd1a600efb3223fc36dbe24c7a933c8b259da11bafebd66cebc 0.2777777777777778 18
aaad60c5bed3af59151f4c9f327d1f000d81ada39adcc4e4aa1ae74c1f3fa8c0 0.4166666666666667 36
05ae6af128ada248193f6f144da134b0630d064ab2aae5aadecd93adcc036eeb 0.0 9
2fb96f42c7d8788d63aea6af5ed08e7b324938a25ddba557b3771631c74b4f23 0.1111111111111111 9
d0889bcb8898dd823051db6a80ab0bcd6f3a94dd5e6e2e879378323589f2a0ae 0.2222222222222222 9
5a97d30de73898a9a6d82de14f9b0db1fef4d115bc7c9cea7f5e3211815ce5eb 0.3333333333333333 6
93cc6560a6dbf336f1318c1fc9bbacc80c0a51245c5e4d94211f37ed29f437e4 0.25 36
aa63ec02223a4d02b0f3cd2509c4d09d5f699d035510827a8672fb1ab3297a52 0.37142857142857144 35
04d2b139aa71156cd245948e99607840a22b0e514a4bc27fd6f3b8a23865baa3 0.3888888888888889 18
44e2c69788be21627a36e0b7df8b3962b5e802a685983f1737d522b720b3f06a 0.3611111111111111 36
65ad9

In [10]:
def process(data, aspect):
  df = data.copy()
  df = df[df['aspect'] == aspect]
  df = df.drop(['aspect', 'session'], axis=1)
  return df.drop(['answer'], axis=1), df['answer']


In [11]:
catboost_default_params = {
  'iterations': 1000,
  'learning_rate': 0.05,
  'custom_loss': ['Accuracy', 'AUC', 'TotalF1'],
  'loss_function': 'MultiClass',
}

In [12]:
best_model = {}

for aspect in df_aspects['aspect']:
  X, Y = process(answers, aspect)
  print('-' * 10, aspect, X.shape[0], '-' * 10)
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.22, random_state=42, shuffle=True)
  pool_params = { 'text_features': list(X), 'feature_names': list(X) }
  learn_pool = Pool(X_train, Y_train, **pool_params)
  test_pool = Pool(X_test, Y_test, **pool_params)
  all_pool = Pool(X, Y, **pool_params)

  model = CatBoostClassifier(**catboost_default_params)
  model.fit(learn_pool, eval_set=test_pool, early_stopping_rounds=100, plot=True, verbose=100)
  best_model[aspect] = model

  comp = pd.DataFrame({'Y_true': Y.iloc[:].values, 'Y_pred': [y for x in model.predict(X) for y in x]})
  print(comp.head())
  print(classification_report(comp['Y_true'], comp['Y_pred'], target_names=list(set(Y))))



---------- Режиссерская работа 726 ----------


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0852041	test: 1.0865695	best: 1.0865695 (0)	total: 231ms	remaining: 3m 50s
100:	learn: 0.7412422	test: 0.8252035	best: 0.8252035 (100)	total: 15.8s	remaining: 2m 20s
200:	learn: 0.5714427	test: 0.7965928	best: 0.7962195 (197)	total: 30.5s	remaining: 2m 1s
300:	learn: 0.4362410	test: 0.7851338	best: 0.7845528 (294)	total: 45s	remaining: 1m 44s
400:	learn: 0.3531009	test: 0.7940310	best: 0.7843677 (304)	total: 1m	remaining: 1m 30s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7843676827
bestIteration = 304

Shrink model to first 305 iterations.
        Y_true       Y_pred
0  отсутствует       хорошо
1  отсутствует  отсутствует
2        плохо       хорошо
3       хорошо       хорошо
4       хорошо       хорошо
set()
              precision    recall  f1-score   support

 отсутствует       0.96      0.72      0.82       202
      хорошо       0.95      0.88      0.92       117
       плохо       0.86      0.99      0.92       407

    accuracy            

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0731284	test: 1.0746968	best: 1.0746968 (0)	total: 130ms	remaining: 2m 9s
100:	learn: 0.5713220	test: 0.7163342	best: 0.7158083 (99)	total: 13.8s	remaining: 2m 3s
200:	learn: 0.4051091	test: 0.6789604	best: 0.6789604 (200)	total: 26.6s	remaining: 1m 45s
300:	learn: 0.2932491	test: 0.6744273	best: 0.6714849 (274)	total: 41.3s	remaining: 1m 35s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6714849002
bestIteration = 274

Shrink model to first 275 iterations.
        Y_true       Y_pred
0  отсутствует  отсутствует
1  отсутствует  отсутствует
2       хорошо       хорошо
3  отсутствует  отсутствует
4       хорошо       хорошо
set()
              precision    recall  f1-score   support

 отсутствует       0.88      1.00      0.93       423
      хорошо       1.00      0.24      0.38        34
       плохо       0.99      0.85      0.91       221

    accuracy                           0.91       678
   macro avg       0.95      0.69      0.74       678
weig

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0747781	test: 1.0683035	best: 1.0683035 (0)	total: 130ms	remaining: 2m 9s
100:	learn: 0.6573019	test: 0.6646437	best: 0.6646437 (100)	total: 14.8s	remaining: 2m 11s
200:	learn: 0.5013615	test: 0.6172713	best: 0.6172713 (200)	total: 29.1s	remaining: 1m 55s
300:	learn: 0.3808008	test: 0.5944421	best: 0.5944421 (300)	total: 43.9s	remaining: 1m 41s
400:	learn: 0.3078505	test: 0.5841215	best: 0.5831614 (392)	total: 58.5s	remaining: 1m 27s
500:	learn: 0.2540059	test: 0.5853640	best: 0.5828662 (456)	total: 1m 14s	remaining: 1m 13s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.5828662474
bestIteration = 456

Shrink model to first 457 iterations.
        Y_true       Y_pred
0  отсутствует  отсутствует
1       хорошо       хорошо
2       хорошо       хорошо
3       хорошо       хорошо
4       хорошо       хорошо
set()
              precision    recall  f1-score   support

 отсутствует       0.97      0.91      0.94       190
      хорошо       0.97      0.77   

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0588248	test: 1.0606591	best: 1.0606591 (0)	total: 164ms	remaining: 2m 43s
100:	learn: 0.5489015	test: 0.6953202	best: 0.6952552 (99)	total: 14.6s	remaining: 2m 9s
200:	learn: 0.4011669	test: 0.6698621	best: 0.6689399 (181)	total: 28.8s	remaining: 1m 54s
300:	learn: 0.3003270	test: 0.6592656	best: 0.6573972 (291)	total: 43.1s	remaining: 1m 40s
400:	learn: 0.2362477	test: 0.6605609	best: 0.6555936 (330)	total: 57.3s	remaining: 1m 25s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6555935774
bestIteration = 330

Shrink model to first 331 iterations.
        Y_true       Y_pred
0  отсутствует  отсутствует
1       хорошо       хорошо
2  отсутствует  отсутствует
3  отсутствует  отсутствует
4  отсутствует  отсутствует
set()
              precision    recall  f1-score   support

 отсутствует       0.91      0.99      0.95       544
      хорошо       0.92      0.62      0.74        53
       плохо       0.99      0.79      0.88       160

    accuracy        

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0563140	test: 1.0435647	best: 1.0435647 (0)	total: 194ms	remaining: 3m 14s
100:	learn: 0.5012244	test: 0.4645550	best: 0.4645550 (100)	total: 13.8s	remaining: 2m 3s
200:	learn: 0.3677990	test: 0.4323617	best: 0.4320584 (197)	total: 27.8s	remaining: 1m 50s
300:	learn: 0.2762804	test: 0.4162781	best: 0.4143420 (284)	total: 42.1s	remaining: 1m 37s
400:	learn: 0.2191435	test: 0.4109161	best: 0.4109135 (399)	total: 56.1s	remaining: 1m 23s
500:	learn: 0.1764547	test: 0.4096541	best: 0.4095503 (499)	total: 1m 9s	remaining: 1m 9s
600:	learn: 0.1454613	test: 0.4099839	best: 0.4085952 (536)	total: 1m 23s	remaining: 55.3s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.4085952134
bestIteration = 536

Shrink model to first 537 iterations.
        Y_true  Y_pred
0        плохо   плохо
1  отсутствует  хорошо
2       хорошо  хорошо
3       хорошо  хорошо
4  отсутствует  хорошо
set()
              precision    recall  f1-score   support

 отсутствует       1.00      0.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0716396	test: 1.0660113	best: 1.0660113 (0)	total: 177ms	remaining: 2m 56s
100:	learn: 0.6428077	test: 0.6682896	best: 0.6682896 (100)	total: 14.9s	remaining: 2m 12s
200:	learn: 0.4835013	test: 0.6222052	best: 0.6222052 (200)	total: 28.2s	remaining: 1m 52s
300:	learn: 0.3686747	test: 0.6071139	best: 0.6071139 (300)	total: 41.9s	remaining: 1m 37s
400:	learn: 0.2940659	test: 0.5904994	best: 0.5903602 (399)	total: 55.6s	remaining: 1m 23s
500:	learn: 0.2398164	test: 0.5789709	best: 0.5784691 (498)	total: 1m 9s	remaining: 1m 8s
600:	learn: 0.2012897	test: 0.5745296	best: 0.5736441 (555)	total: 1m 22s	remaining: 55s
700:	learn: 0.1707335	test: 0.5720579	best: 0.5710045 (666)	total: 1m 38s	remaining: 41.8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.5710045387
bestIteration = 666

Shrink model to first 667 iterations.
        Y_true  Y_pred
0       хорошо  хорошо
1       хорошо  хорошо
2  отсутствует  хорошо
3       хорошо  хорошо
4       хорошо  хорошо
se

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0837939	test: 1.0856477	best: 1.0856477 (0)	total: 158ms	remaining: 2m 38s
100:	learn: 0.7535434	test: 0.8904306	best: 0.8895285 (92)	total: 14.1s	remaining: 2m 5s
200:	learn: 0.5675592	test: 0.8669780	best: 0.8666161 (198)	total: 28.1s	remaining: 1m 51s
300:	learn: 0.4417281	test: 0.8606130	best: 0.8606130 (300)	total: 41.6s	remaining: 1m 36s
400:	learn: 0.3542454	test: 0.8636773	best: 0.8561680 (334)	total: 55.3s	remaining: 1m 22s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8561679759
bestIteration = 334

Shrink model to first 335 iterations.
   Y_true       Y_pred
0  хорошо       хорошо
1  хорошо  отсутствует
2   плохо  отсутствует
3   плохо        плохо
4  хорошо       хорошо
set()
              precision    recall  f1-score   support

 отсутствует       0.83      0.98      0.90       410
      хорошо       0.89      0.62      0.73        82
       плохо       0.98      0.79      0.87       262

    accuracy                           0.87       

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0723462	test: 1.0755260	best: 1.0755260 (0)	total: 147ms	remaining: 2m 27s
100:	learn: 0.6405079	test: 0.8035583	best: 0.8035583 (100)	total: 13.4s	remaining: 1m 59s
200:	learn: 0.4720670	test: 0.7725157	best: 0.7715759 (198)	total: 27.5s	remaining: 1m 49s
300:	learn: 0.3539388	test: 0.7610053	best: 0.7608835 (251)	total: 41s	remaining: 1m 35s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7608835289
bestIteration = 251

Shrink model to first 252 iterations.
        Y_true       Y_pred
0        плохо  отсутствует
1  отсутствует  отсутствует
2  отсутствует  отсутствует
3  отсутствует  отсутствует
4  отсутствует  отсутствует
set()
              precision    recall  f1-score   support

 отсутствует       0.87      1.00      0.93       448
      хорошо       1.00      0.77      0.87        70
       плохо       0.99      0.75      0.85       210

    accuracy                           0.90       728
   macro avg       0.95      0.84      0.88       728
wei

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0454024	test: 1.0435741	best: 1.0435741 (0)	total: 156ms	remaining: 2m 36s
100:	learn: 0.4086141	test: 0.4519894	best: 0.4519894 (100)	total: 13.2s	remaining: 1m 57s
200:	learn: 0.2979268	test: 0.4387562	best: 0.4387562 (200)	total: 26.9s	remaining: 1m 46s
300:	learn: 0.2205389	test: 0.4343241	best: 0.4317872 (261)	total: 40.8s	remaining: 1m 34s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.4317871503
bestIteration = 261

Shrink model to first 262 iterations.
        Y_true       Y_pred
0  отсутствует  отсутствует
1  отсутствует  отсутствует
2  отсутствует  отсутствует
3  отсутствует  отсутствует
4  отсутствует  отсутствует
set()
              precision    recall  f1-score   support

 отсутствует       0.90      1.00      0.95       592
      хорошо       1.00      0.35      0.51        26
       плохо       1.00      0.48      0.65        96

    accuracy                           0.91       714
   macro avg       0.97      0.61      0.70       714
w

In [13]:
result = {}

for aspect in df_aspects['aspect']:
  X, Y = process(answers, aspect)
  print('-' * 10, aspect, X.shape[0], '-' * 10)
  X_train, _, Y_train, _ = train_test_split(X, Y, test_size=0.00001, random_state=42, shuffle=True)
  pool_params = { 'text_features': list(X), 'feature_names': list(X) }
  all_pool = Pool(X_train, Y_train, **pool_params)
  catboost_default_params.update({ 'iterations': int(best_model[aspect].tree_count_ * 1.2) })

  model = CatBoostClassifier(**catboost_default_params)
  model.fit(all_pool, early_stopping_rounds=100, plot=True, verbose=100)
  result[aspect] = model

  comp = pd.DataFrame({'Y_true': Y.iloc[:].values, 'Y_pred': [y for x in model.predict(X) for y in x]})
  print(comp.head())
  print(classification_report(comp['Y_true'], comp['Y_pred'], target_names=list(set(Y))))


---------- Режиссерская работа 726 ----------


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0788298	total: 120ms	remaining: 43.8s
100:	learn: 0.7448086	total: 15.6s	remaining: 40.8s
200:	learn: 0.5863572	total: 29.5s	remaining: 24.2s
300:	learn: 0.4672512	total: 44.5s	remaining: 9.61s
365:	learn: 0.4110612	total: 55.1s	remaining: 0us
        Y_true       Y_pred
0  отсутствует  отсутствует
1  отсутствует  отсутствует
2        плохо        плохо
3       хорошо       хорошо
4       хорошо       хорошо
              precision    recall  f1-score   support

 отсутствует       0.99      0.98      0.99       202
      хорошо       1.00      1.00      1.00       117
       плохо       0.99      1.00      0.99       407

    accuracy                           0.99       726
   macro avg       1.00      0.99      0.99       726
weighted avg       0.99      0.99      0.99       726

---------- Саундтрек/музыка 678 ----------


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0665834	total: 157ms	remaining: 51.8s
100:	learn: 0.5860306	total: 13.9s	remaining: 31.5s
200:	learn: 0.4324729	total: 27.4s	remaining: 17.6s
300:	learn: 0.3294525	total: 42.9s	remaining: 4.13s
329:	learn: 0.3071889	total: 46.8s	remaining: 0us
        Y_true       Y_pred
0  отсутствует  отсутствует
1  отсутствует  отсутствует
2       хорошо       хорошо
3  отсутствует  отсутствует
4       хорошо       хорошо
              precision    recall  f1-score   support

 отсутствует       0.93      1.00      0.96       423
      хорошо       1.00      0.47      0.64        34
       плохо       1.00      0.92      0.96       221

    accuracy                           0.95       678
   macro avg       0.97      0.80      0.85       678
weighted avg       0.95      0.95      0.94       678

---------- Актерская игра 766 ----------


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0703591	total: 144ms	remaining: 1m 18s
100:	learn: 0.6422072	total: 13.7s	remaining: 1m
200:	learn: 0.5060295	total: 29.1s	remaining: 50.3s
300:	learn: 0.4052536	total: 45.4s	remaining: 37.3s
400:	learn: 0.3365444	total: 1m	remaining: 22.2s
500:	learn: 0.2845116	total: 1m 16s	remaining: 7.14s
547:	learn: 0.2635235	total: 1m 22s	remaining: 0us
        Y_true       Y_pred
0  отсутствует  отсутствует
1       хорошо       хорошо
2       хорошо       хорошо
3       хорошо       хорошо
4       хорошо       хорошо
              precision    recall  f1-score   support

 отсутствует       0.98      1.00      0.99       190
      хорошо       1.00      0.80      0.89       111
       плохо       0.95      0.99      0.97       465

    accuracy                           0.97       766
   macro avg       0.98      0.93      0.95       766
weighted avg       0.97      0.97      0.97       766

---------- Визуальные эффекты 757 ----------


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0599282	total: 142ms	remaining: 56s
100:	learn: 0.5545348	total: 14.6s	remaining: 42.7s
200:	learn: 0.4266200	total: 28.5s	remaining: 27.8s
300:	learn: 0.3278555	total: 43s	remaining: 13.7s
396:	learn: 0.2649755	total: 57.2s	remaining: 0us
        Y_true       Y_pred
0  отсутствует  отсутствует
1       хорошо       хорошо
2  отсутствует  отсутствует
3  отсутствует  отсутствует
4  отсутствует  отсутствует
              precision    recall  f1-score   support

 отсутствует       0.97      0.99      0.98       544
      хорошо       0.93      0.75      0.83        53
       плохо       1.00      0.99      0.99       160

    accuracy                           0.98       757
   macro avg       0.97      0.91      0.94       757
weighted avg       0.98      0.98      0.98       757

---------- Эмоциональное воздействие 738 ----------


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0550680	total: 137ms	remaining: 1m 27s
100:	learn: 0.4812214	total: 15.9s	remaining: 1m 25s
200:	learn: 0.3583236	total: 30.9s	remaining: 1m 8s
300:	learn: 0.2745247	total: 45.6s	remaining: 52s
400:	learn: 0.2197657	total: 1m	remaining: 36.9s
500:	learn: 0.1818698	total: 1m 15s	remaining: 21.6s
600:	learn: 0.1520930	total: 1m 30s	remaining: 6.49s
643:	learn: 0.1408528	total: 1m 37s	remaining: 0us
        Y_true       Y_pred
0        плохо        плохо
1  отсутствует  отсутствует
2       хорошо       хорошо
3       хорошо       хорошо
4  отсутствует       хорошо
              precision    recall  f1-score   support

 отсутствует       1.00      0.42      0.59        43
      хорошо       1.00      0.99      0.99       160
       плохо       0.95      1.00      0.98       535

    accuracy                           0.96       738
   macro avg       0.98      0.80      0.85       738
weighted avg       0.97      0.96      0.96       738

---------- Раскрытие темы 634 ---------

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0689500	total: 216ms	remaining: 2m 52s
100:	learn: 0.6315742	total: 15.3s	remaining: 1m 46s
200:	learn: 0.4839226	total: 30s	remaining: 1m 29s
300:	learn: 0.3732220	total: 46s	remaining: 1m 16s
400:	learn: 0.3020154	total: 1m	remaining: 1m
500:	learn: 0.2508821	total: 1m 16s	remaining: 45.4s
600:	learn: 0.2127884	total: 1m 31s	remaining: 30.2s
700:	learn: 0.1827730	total: 1m 45s	remaining: 14.9s
799:	learn: 0.1607516	total: 1m 59s	remaining: 0us
        Y_true       Y_pred
0       хорошо       хорошо
1       хорошо       хорошо
2  отсутствует  отсутствует
3       хорошо       хорошо
4       хорошо       хорошо
              precision    recall  f1-score   support

 отсутствует       1.00      0.91      0.95        91
      хорошо       1.00      1.00      1.00       123
       плохо       0.98      1.00      0.99       420

    accuracy                           0.99       634
   macro avg       0.99      0.97      0.98       634
weighted avg       0.99      0.99      0.99 

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0822214	total: 196ms	remaining: 1m 18s
100:	learn: 0.7567319	total: 14s	remaining: 41.8s
200:	learn: 0.5974577	total: 27.8s	remaining: 27.8s
300:	learn: 0.4751431	total: 41.5s	remaining: 13.9s
400:	learn: 0.3929203	total: 55.2s	remaining: 138ms
401:	learn: 0.3921654	total: 55.4s	remaining: 0us
   Y_true       Y_pred
0  хорошо       хорошо
1  хорошо       хорошо
2   плохо  отсутствует
3   плохо        плохо
4  хорошо       хорошо
              precision    recall  f1-score   support

 отсутствует       0.96      1.00      0.98       410
      хорошо       1.00      0.90      0.95        82
       плохо       1.00      0.97      0.98       262

    accuracy                           0.98       754
   macro avg       0.99      0.96      0.97       754
weighted avg       0.98      0.98      0.98       754

---------- Юмор 728 ----------


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0662527	total: 124ms	remaining: 37.3s
100:	learn: 0.6231348	total: 14.3s	remaining: 28.4s
200:	learn: 0.4806052	total: 27.9s	remaining: 14s
300:	learn: 0.3739694	total: 41.6s	remaining: 138ms
301:	learn: 0.3727917	total: 41.7s	remaining: 0us
        Y_true       Y_pred
0        плохо  отсутствует
1  отсутствует  отсутствует
2  отсутствует  отсутствует
3  отсутствует  отсутствует
4  отсутствует  отсутствует
              precision    recall  f1-score   support

 отсутствует       0.97      1.00      0.98       448
      хорошо       1.00      0.77      0.87        70
       плохо       1.00      1.00      1.00       210

    accuracy                           0.98       728
   macro avg       0.99      0.92      0.95       728
weighted avg       0.98      0.98      0.98       728

---------- Дизайн постановки/костюмов 714 ----------


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0446607	total: 147ms	remaining: 46.2s
100:	learn: 0.3999469	total: 13.3s	remaining: 28s
200:	learn: 0.3004609	total: 27.1s	remaining: 15.2s
300:	learn: 0.2275900	total: 41s	remaining: 1.77s
313:	learn: 0.2197116	total: 43.1s	remaining: 0us
        Y_true       Y_pred
0  отсутствует  отсутствует
1  отсутствует  отсутствует
2  отсутствует  отсутствует
3  отсутствует  отсутствует
4  отсутствует  отсутствует
              precision    recall  f1-score   support

 отсутствует       0.94      1.00      0.97       592
      хорошо       1.00      0.38      0.56        26
       плохо       1.00      0.78      0.88        96

    accuracy                           0.95       714
   macro avg       0.98      0.72      0.80       714
weighted avg       0.95      0.95      0.94       714



In [16]:
for k, v in result.items():
  k = kinopoisk.ASPECTS[k]
  v.save_model(str(Path().parent / 'models' / f'{k}_model'))

In [None]:
fitted_models = {}

for aspect in df_aspects['aspect']:
  X, Y = process(answers, aspect)
  print('-' * 10, aspect, X.shape, '-' * 10)
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42, shuffle=True)
  pool_params = { 'text_features': list(X), 'feature_names': list(X) }
  all_pool = Pool(X, Y, **pool_params)
  
  (results, models) = cv(
    all_pool,
    params=catboost_default_params,
    fold_count=5,
    shuffle=True,
    plot=True,
    verbose=200,
    return_models=True,
    early_stopping_rounds=100
  )
  print(results)
  fitted_models[aspect] = models


