In [1]:
!pip install catboost scikit-learn



In [2]:
!pip install pymystem3



In [3]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostClassifier, cv
from catboost.text_processing import Tokenizer
from pymystem3 import Mystem
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
import joblib

In [4]:
df_answers = pd.read_csv('/Users/darabizina/reviews-summarization/reviews-summarization/notebooks/data/ans_with_alice.csv')
df_aspects = pd.read_csv('/Users/darabizina/reviews-summarization/reviews-summarization/notebooks/data/aspects_202405192134.csv')
df_reviews = pd.read_csv('/Users/darabizina/reviews-summarization/reviews-summarization/notebooks/data/reviews_202405192134.csv')


In [5]:
aspects_map = {
  df_aspects['id'][x]: df_aspects['aspect'][x]
  for x in range(df_aspects.shape[0])
}
reviews_map = {
  df_reviews['id'][x]: df_reviews['review_body'][x]
  for x in range(df_reviews.shape[0])
}

In [6]:
aspect_state = {
  1: 'хорошо',
  0: 'плохо',
  2: 'отсутствует',
  3: 'skip',
}

answers = pd.DataFrame(columns=['review', 'aspect', 'answer'])
for ind, row in df_answers.iterrows():
  review = reviews_map.get(row.review)
  aspect = aspects_map.get(row.aspect)
  answer = aspect_state[row.answer]
  answers.loc[ind] = [review, aspect, answer]

answers.head()

Unnamed: 0,review,aspect,answer
0,Какие только безумно-замечательные идеи не при...,Эмоциональное воздействие,плохо
1,"Этот фильм, можно сказать, путеводитель по стр...",Юмор,плохо
2,На фильм я пошел благодаря высокому рейтингу К...,Визуальные эффекты,отсутствует
3,Возможно ли прожить жизнь с улыбкой на лице? С...,Саундтрек/музыка,отсутствует
4,"Подмечу важное, фильмы 'о мальчике который выж...",Саундтрек/музыка,отсутствует


In [7]:
simple_tokenizer = Tokenizer(lowercasing=True, separator_type='BySense', token_types=['Word', 'Number', 'Punctuation'])

def tokenize_texts(texts):
    return [' '.join(simple_tokenizer.tokenize(text)) for text in texts]

In [8]:
mystem = Mystem()

def lemmatize_text(text):
    lemmas = mystem.lemmatize(text)
    return ''.join(lemmas).strip()

In [9]:
answers['review'] = answers['review'].apply(lambda x: ' '.join(simple_tokenizer.tokenize(x)))
answers['review'] = answers['review'].apply(lemmatize_text)
text_features = ['review']
cat_features = ['aspect']

In [10]:
X = answers.drop(['answer'], axis=1)
y = answers['answer']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
aspects_list = [
    'Режиссерская работа', 
    'Саундтрек/музыка',
    'Актерская игра', 
    'Визуальные эффекты',
    'Эмоциональное воздействие',
    'Раскрытие темы', 
    'Оригинальность сюжета',
    'Юмор',
    'Дизайн постановки/костюмов'
]

In [12]:
models = {}

for aspect in aspects_list:
    print(f"Training and evaluating model for aspect: {aspect}")
    
    df_aspect = answers[answers['aspect'] == aspect]
    X_aspect = df_aspect.drop(['answer'], axis=1)
    y_aspect = df_aspect['answer']
    
    X_train_aspect, X_test_aspect, y_train_aspect, y_test_aspect = train_test_split(X_aspect, y_aspect, test_size=0.2, random_state=42)
    
    train_pool_aspect = Pool(
        data=X_train_aspect, 
        label=y_train_aspect, 
        cat_features=cat_features, 
        text_features=text_features,
        feature_names=list(X_train_aspect)
    )
    
    test_pool_aspect = Pool(
        data=X_test_aspect, 
        label=y_test_aspect, 
        cat_features=cat_features, 
        text_features=text_features,
        feature_names=list(X_test_aspect)
    )
    
    model_aspect = CatBoostClassifier(
        iterations=10_000,
        learning_rate=0.01,
        depth=6,
        early_stopping_rounds=20,
        eval_metric='Accuracy'
    )
    
    model_aspect.fit(train_pool_aspect, eval_set=test_pool_aspect, verbose=100)
    
    y_pred_aspect = model_aspect.predict(test_pool_aspect)
    
    print(f"Classification Report for aspect: {aspect}")
    print(classification_report(y_test_aspect, y_pred_aspect))
    
    precision_aspect = precision_score(y_test_aspect, y_pred_aspect, average='macro')
    recall_aspect = recall_score(y_test_aspect, y_pred_aspect, average='macro')
    
    print(f"Precision for aspect {aspect}: {precision_aspect}")
    print(f"Recall for aspect {aspect}: {recall_aspect}")
    print("\n")
    
    models[aspect] = model_aspect

Training and evaluating model for aspect: Режиссерская работа
0:	learn: 0.7948718	test: 0.7848101	best: 0.7848101 (0)	total: 827ms	remaining: 2h 17m 48s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.8101265823
bestIteration = 1

Shrink model to first 2 iterations.
Classification Report for aspect: Режиссерская работа
              precision    recall  f1-score   support

 отсутствует       0.00      0.00      0.00         7
       плохо       0.00      0.00      0.00         8
      хорошо       0.81      1.00      0.90        64

    accuracy                           0.81        79
   macro avg       0.27      0.33      0.30        79
weighted avg       0.66      0.81      0.73        79

Precision for aspect Режиссерская работа: 0.270042194092827
Recall for aspect Режиссерская работа: 0.3333333333333333


Training and evaluating model for aspect: Саундтрек/музыка


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.5798611	test: 0.6027397	best: 0.6027397 (0)	total: 382ms	remaining: 1h 3m 41s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.602739726
bestIteration = 0

Shrink model to first 1 iterations.
Classification Report for aspect: Саундтрек/музыка
              precision    recall  f1-score   support

 отсутствует       0.80      0.14      0.24        29
       плохо       0.00      0.00      0.00         3
      хорошо       0.59      0.98      0.73        41

    accuracy                           0.60        73
   macro avg       0.46      0.37      0.32        73
weighted avg       0.65      0.60      0.51        73

Precision for aspect Саундтрек/музыка: 0.46274509803921565
Recall for aspect Саундтрек/музыка: 0.3711802635267732


Training and evaluating model for aspect: Актерская игра


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.8006536	test: 0.7532468	best: 0.7532468 (0)	total: 494ms	remaining: 1h 22m 14s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7662337662
bestIteration = 1

Shrink model to first 2 iterations.
Classification Report for aspect: Актерская игра
              precision    recall  f1-score   support

 отсутствует       0.00      0.00      0.00        16
       плохо       0.00      0.00      0.00         2
      хорошо       0.77      1.00      0.87        59

    accuracy                           0.77        77
   macro avg       0.26      0.33      0.29        77
weighted avg       0.59      0.77      0.66        77

Precision for aspect Актерская игра: 0.2554112554112554
Recall for aspect Актерская игра: 0.3333333333333333


Training and evaluating model for aspect: Визуальные эффекты


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.5782313	test: 0.5270270	best: 0.5270270 (0)	total: 440ms	remaining: 1h 13m 20s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.5675675676
bestIteration = 3

Shrink model to first 4 iterations.
Classification Report for aspect: Визуальные эффекты
              precision    recall  f1-score   support

 отсутствует       0.80      0.12      0.21        34
       плохо       0.00      0.00      0.00         1
      хорошо       0.55      0.97      0.70        39

    accuracy                           0.57        74
   macro avg       0.45      0.36      0.30        74
weighted avg       0.66      0.57      0.47        74

Precision for aspect Визуальные эффекты: 0.45024154589371984
Recall for aspect Визуальные эффекты: 0.3640020110608346


Training and evaluating model for aspect: Эмоциональное воздействие


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.8287671	test: 0.8493151	best: 0.8493151 (0)	total: 435ms	remaining: 1h 12m 34s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.8493150685
bestIteration = 0

Shrink model to first 1 iterations.
Classification Report for aspect: Эмоциональное воздействие
              precision    recall  f1-score   support

 отсутствует       0.00      0.00      0.00         2
       плохо       1.00      0.10      0.18        10
      хорошо       0.86      1.00      0.92        61

    accuracy                           0.85        73
   macro avg       0.62      0.37      0.37        73
weighted avg       0.85      0.85      0.80        73

Precision for aspect Эмоциональное воздействие: 0.6197183098591549
Recall for aspect Эмоциональное воздействие: 0.3666666666666667


Training and evaluating model for aspect: Раскрытие темы
0:	learn: 0.8029197	test: 0.7826087	best: 0.7826087 (0)	total: 441ms	remaining: 1h 13m 26s
Stopped by overfitting detector  (20 iterations wait)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.7137809	test: 0.6478873	best: 0.6478873 (0)	total: 494ms	remaining: 1h 22m 19s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6478873239
bestIteration = 0

Shrink model to first 1 iterations.
Classification Report for aspect: Оригинальность сюжета
              precision    recall  f1-score   support

 отсутствует       0.50      0.06      0.11        17
       плохо       0.00      0.00      0.00         8
      хорошо       0.65      0.98      0.78        46

    accuracy                           0.65        71
   macro avg       0.38      0.35      0.30        71
weighted avg       0.54      0.65      0.53        71

Precision for aspect Оригинальность сюжета: 0.38405797101449274
Recall for aspect Оригинальность сюжета: 0.345694799658994


Training and evaluating model for aspect: Юмор


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.5615942	test: 0.4428571	best: 0.4428571 (0)	total: 632ms	remaining: 1h 45m 21s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.4428571429
bestIteration = 0

Shrink model to first 1 iterations.
Classification Report for aspect: Юмор
              precision    recall  f1-score   support

 отсутствует       0.67      0.22      0.33        37
       плохо       0.00      0.00      0.00         6
      хорошо       0.40      0.85      0.54        27

    accuracy                           0.44        70
   macro avg       0.35      0.36      0.29        70
weighted avg       0.51      0.44      0.38        70

Precision for aspect Юмор: 0.35440613026819917
Recall for aspect Юмор: 0.3560226893560227


Training and evaluating model for aspect: Дизайн постановки/костюмов


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.6230769	test: 0.4545455	best: 0.4545455 (0)	total: 579ms	remaining: 1h 36m 28s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.5151515152
bestIteration = 1

Shrink model to first 2 iterations.
Classification Report for aspect: Дизайн постановки/костюмов
              precision    recall  f1-score   support

 отсутствует       1.00      0.12      0.21        34
       плохо       0.00      0.00      0.00         2
      хорошо       0.48      1.00      0.65        30

    accuracy                           0.52        66
   macro avg       0.49      0.37      0.29        66
weighted avg       0.74      0.52      0.40        66

Precision for aspect Дизайн постановки/костюмов: 0.4946236559139785
Recall for aspect Дизайн постановки/костюмов: 0.37254901960784315




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
joblib.dump(models, 'trained_models.joblib')
loaded_models = joblib.load('trained_models.joblib')