In [1]:
!pip install catboost scikit-learn



In [2]:
!pip install pymystem3



In [3]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostClassifier, cv
from catboost.text_processing import Tokenizer
from pymystem3 import Mystem
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV

In [4]:
df_answers = pd.read_csv('/Users/darabizina/reviews-summarization/reviews-summarization/notebooks/data/answers_202405192130.csv')
df_aspects = pd.read_csv('/Users/darabizina/reviews-summarization/reviews-summarization/notebooks/data/aspects_202405192134.csv')
df_reviews = pd.read_csv('/Users/darabizina/reviews-summarization/reviews-summarization/notebooks/data/reviews_202405192134.csv')


In [5]:
aspects_map = {
  df_aspects['id'][x]: df_aspects['aspect'][x]
  for x in range(df_aspects.shape[0])
}
reviews_map = {
  df_reviews['id'][x]: df_reviews['review_body'][x]
  for x in range(df_reviews.shape[0])
}

In [6]:
aspect_state = {
  1: 'хорошо',
  0: 'плохо',
  2: 'отсутствует',
  3: 'skip',
}

answers = pd.DataFrame(columns=['review', 'aspect', 'answer'])
for ind, row in df_answers.iterrows():
  review = reviews_map.get(row.review)
  aspect = aspects_map.get(row.aspect)
  answer = aspect_state[row.answer]
  answers.loc[ind] = [review, aspect, answer]

answers.head()

Unnamed: 0,review,aspect,answer
0,Какие только безумно-замечательные идеи не при...,Эмоциональное воздействие,плохо
1,"Этот фильм, можно сказать, путеводитель по стр...",Юмор,плохо
2,На фильм я пошел благодаря высокому рейтингу К...,Визуальные эффекты,отсутствует
3,Возможно ли прожить жизнь с улыбкой на лице? С...,Саундтрек/музыка,отсутствует
4,"Подмечу важное, фильмы 'о мальчике который выж...",Саундтрек/музыка,отсутствует


In [7]:
simple_tokenizer = Tokenizer(lowercasing=True, separator_type='BySense', token_types=['Word', 'Number', 'Punctuation'])

def tokenize_texts(texts):
    return [' '.join(simple_tokenizer.tokenize(text)) for text in texts]

In [8]:
mystem = Mystem()

def lemmatize_text(text):
    lemmas = mystem.lemmatize(text)
    return ''.join(lemmas).strip()

In [9]:
answers['review'] = answers['review'].apply(lambda x: ' '.join(simple_tokenizer.tokenize(x)))
answers['review'] = answers['review'].apply(lemmatize_text)
text_features = ['review']
cat_features = ['aspect']

In [10]:
X = answers.drop(['answer'], axis=1)
y = answers['answer']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
aspects_list = [
    'Режиссерская работа', 
    'Саундтрек/музыка',
    'Актерская игра', 
    'Визуальные эффекты',
    'Эмоциональное воздействие',
    'Раскрытие темы', 
    'Оригинальность сюжета',
    'Юмор',
    'Дизайн постановки/костюмов'
]

In [13]:
models = {}

for aspect in aspects_list:
    print(f"Training and evaluating model for aspect: {aspect}")
    
    df_aspect = answers[answers['aspect'] == aspect]
    X_aspect = df_aspect.drop(['answer'], axis=1)
    y_aspect = df_aspect['answer']
    
    X_train_aspect, X_test_aspect, y_train_aspect, y_test_aspect = train_test_split(X_aspect, y_aspect, test_size=0.2, random_state=42)
    
    train_pool_aspect = Pool(
        data=X_train_aspect, 
        label=y_train_aspect, 
        cat_features=cat_features, 
        text_features=text_features,
        feature_names=list(X_train_aspect)
    )
    
    test_pool_aspect = Pool(
        data=X_test_aspect, 
        label=y_test_aspect, 
        cat_features=cat_features, 
        text_features=text_features,
        feature_names=list(X_test_aspect)
    )
    
    model_aspect = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        depth=6,
        early_stopping_rounds=10,
        eval_metric='Accuracy'
    )
    
    model_aspect.fit(train_pool_aspect, eval_set=test_pool_aspect, verbose=100)
    
    y_pred_aspect = model_aspect.predict(test_pool_aspect)
    
    print(f"Classification Report for aspect: {aspect}")
    print(classification_report(y_test_aspect, y_pred_aspect))
    
    precision_aspect = precision_score(y_test_aspect, y_pred_aspect, average='macro')
    recall_aspect = recall_score(y_test_aspect, y_pred_aspect, average='macro')
    
    print(f"Precision for aspect {aspect}: {precision_aspect}")
    print(f"Recall for aspect {aspect}: {recall_aspect}")
    print("\n")
    
    models[aspect] = model_aspect

Training and evaluating model for aspect: Режиссерская работа
0:	learn: 0.8055556	test: 0.5789474	best: 0.5789474 (0)	total: 265ms	remaining: 4m 24s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.7368421053
bestIteration = 1

Shrink model to first 2 iterations.
Classification Report for aspect: Режиссерская работа
              precision    recall  f1-score   support

 отсутствует       0.00      0.00      0.00         5
      хорошо       0.74      1.00      0.85        14

    accuracy                           0.74        19
   macro avg       0.37      0.50      0.42        19
weighted avg       0.54      0.74      0.63        19

Precision for aspect Режиссерская работа: 0.3684210526315789
Recall for aspect Режиссерская работа: 0.5


Training and evaluating model for aspect: Саундтрек/музыка


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.7564103	test: 0.2500000	best: 0.2500000 (0)	total: 338ms	remaining: 5m 37s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.85
bestIteration = 3

Shrink model to first 4 iterations.
Classification Report for aspect: Саундтрек/музыка
              precision    recall  f1-score   support

 отсутствует       0.77      1.00      0.87        10
      хорошо       1.00      0.70      0.82        10

    accuracy                           0.85        20
   macro avg       0.88      0.85      0.85        20
weighted avg       0.88      0.85      0.85        20

Precision for aspect Саундтрек/музыка: 0.8846153846153846
Recall for aspect Саундтрек/музыка: 0.85


Training and evaluating model for aspect: Актерская игра
0:	learn: 0.7580645	test: 0.6875000	best: 0.6875000 (0)	total: 214ms	remaining: 3m 33s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.8125
bestIteration = 2

Shrink model to first 3 iterations.
Classification Report for aspect: Ак

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.8823529412
bestIteration = 0

Shrink model to first 1 iterations.
Classification Report for aspect: Визуальные эффекты
              precision    recall  f1-score   support

 отсутствует       0.88      1.00      0.94        15
      хорошо       0.00      0.00      0.00         2

    accuracy                           0.88        17
   macro avg       0.44      0.50      0.47        17
weighted avg       0.78      0.88      0.83        17

Precision for aspect Визуальные эффекты: 0.4411764705882353
Recall for aspect Визуальные эффекты: 0.5


Training and evaluating model for aspect: Эмоциональное воздействие


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.6923077	test: 0.3846154	best: 0.3846154 (0)	total: 184ms	remaining: 3m 3s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.4615384615
bestIteration = 1

Shrink model to first 2 iterations.
Classification Report for aspect: Эмоциональное воздействие
              precision    recall  f1-score   support

 отсутствует       1.00      0.25      0.40         4
       плохо       0.00      0.00      0.00         4
      хорошо       0.42      1.00      0.59         5

    accuracy                           0.46        13
   macro avg       0.47      0.42      0.33        13
weighted avg       0.47      0.46      0.35        13

Precision for aspect Эмоциональное воздействие: 0.47222222222222227
Recall for aspect Эмоциональное воздействие: 0.4166666666666667


Training and evaluating model for aspect: Раскрытие темы
0:	learn: 0.7954545	test: 0.2500000	best: 0.2500000 (0)	total: 132ms	remaining: 2m 11s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.5833333333
bestIteration = 4

Shrink model to first 5 iterations.
Classification Report for aspect: Раскрытие темы
              precision    recall  f1-score   support

 отсутствует       0.00      0.00      0.00         3
       плохо       0.00      0.00      0.00         2
      хорошо       0.58      1.00      0.74         7

    accuracy                           0.58        12
   macro avg       0.19      0.33      0.25        12
weighted avg       0.34      0.58      0.43        12

Precision for aspect Раскрытие темы: 0.19444444444444445
Recall for aspect Раскрытие темы: 0.3333333333333333


Training and evaluating model for aspect: Оригинальность сюжета


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.6349206	test: 0.5625000	best: 0.5625000 (0)	total: 195ms	remaining: 3m 14s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.5625
bestIteration = 0

Shrink model to first 1 iterations.
Classification Report for aspect: Оригинальность сюжета
              precision    recall  f1-score   support

 отсутствует       0.50      0.50      0.50         6
       плохо       0.00      0.00      0.00         0
      хорошо       0.75      0.60      0.67        10

    accuracy                           0.56        16
   macro avg       0.42      0.37      0.39        16
weighted avg       0.66      0.56      0.60        16

Precision for aspect Оригинальность сюжета: 0.4166666666666667
Recall for aspect Оригинальность сюжета: 0.3666666666666667


Training and evaluating model for aspect: Юмор


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.7656250	test: 0.4117647	best: 0.4117647 (0)	total: 234ms	remaining: 3m 53s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.4117647059
bestIteration = 0

Shrink model to first 1 iterations.
Classification Report for aspect: Юмор
              precision    recall  f1-score   support

 отсутствует       0.41      1.00      0.58         7
       плохо       0.00      0.00      0.00         2
      хорошо       0.00      0.00      0.00         8

    accuracy                           0.41        17
   macro avg       0.14      0.33      0.19        17
weighted avg       0.17      0.41      0.24        17

Precision for aspect Юмор: 0.1372549019607843
Recall for aspect Юмор: 0.3333333333333333


Training and evaluating model for aspect: Дизайн постановки/костюмов


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


CatBoostError: /Users/zomb-ml-platform-msk/go-agent-21.2.0/pipelines/BuildMaster/catboost.git/catboost/private/libs/algo/data.cpp:198: Dataset test #0 contains class label "\xD0\xBF\xD0\xBB\xD0\xBE\xD1\x85\xD0\xBE" that is not present in the learn dataset