In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from common_lib.data import DataLoader, PreprocessingPipelineBuilder, DataExplorer, DataPreparer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
loader = DataLoader()
mexwell_set = {"link":"mexwell/fake-reviews-dataset", "filename":"fake reviews dataset.csv"}
try:
    mexwell_df = loader.load_from_kaggle(**mexwell_set)
    print(f"Данные успешно загружены из Kaggle.")
    print(f"Исходная размерность DataFrame: {mexwell_df.shape}")
    print("\nПервые 5 строк данных:")
    display(mexwell_df.head()) 
except Exception as e:
    print(f"Ошибка при загрузке данных: {e}")
    mexwell_df = None


../data\mexwell\fake-reviews-dataset\fake reviews dataset.csv
Данные успешно загружены из Kaggle.
Исходная размерность DataFrame: (40432, 4)

Первые 5 строк данных:


Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [3]:
explorer = DataExplorer(mexwell_df)
explorer.print_summary(text_column="text_", label_column="label", rating_column="rating")


СВОДКА ПО ДАННЫМ
Размерность данных: (40432, 4)
Количество колонок: 4
Колонки: category, rating, label, text_

Пропущенные значения отсутствуют

АНАЛИЗ ТЕКСТА:
  Длина текста - мин: 24, макс: 2827, среднее: 351.27
  Топ-5 слов: the(157256), a(96886), i(96841), and(87616), it(68590)

РАСПРЕДЕЛЕНИЕ РЕЙТИНГОВ:
  1.0: 2155 (5.3%)
  2.0: 1967 (4.9%)
  3.0: 3786 (9.4%)
  4.0: 7965 (19.7%)
  5.0: 24559 (60.7%)

РЕЙТИНГИ ПО МЕТКАМ (%):
label      CG     OR  Всего
rating                     
1.0      5.26   5.40   5.33
2.0      4.76   4.97   4.86
3.0      9.66   9.07   9.36
4.0     19.39  20.01  19.70
5.0     60.94  60.55  60.74

РАСПРЕДЕЛЕНИЕ МЕТОК:
  CG: 20216 (50.0%)
  OR: 20216 (50.0%)



In [4]:
preparer = DataPreparer(mexwell_df.copy())
preparer.handle_missing_values(strategy='drop', columns=['text_', 'label'])
preparer.drop_duplicates(subset=['text_', 'label'])
preparer.encode_labels(label_column='label')
label_mapping = preparer.get_label_mapping()
df_prepared = preparer.get_result()
display(df_prepared.head())

Encoding labels in 'label'. Mapping created: {'CG': 0, 'OR': 1}


Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,0,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,0,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,0,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,0,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,0,Very nice set. Good quality. We have had the s...


In [5]:
X = df_prepared[['text_']] 
y = df_prepared['label']
X_train_df, X_test_df, y_train, y_test = train_test_split(
        X, y,
        test_size=0.25,
        random_state=42, 
    )

In [6]:
pipeline_builder = PreprocessingPipelineBuilder(df_prepared)
cleaning_methods = [
        'lower',              # Приведение к нижнему регистру
        'remove_punctuation', # Удаление знаков препинания
        'remove_numbers',     # Удаление чисел
        'remove_whitespace',  # Сжатие пробелов
        'remove_stopwords'    # Удаление стоп-слов
    ]


In [7]:
text_preprocessing_pipeline = pipeline_builder.build_text_preprocessing_pipeline(
            text_column='text_', 
            vectorizer_method='tfidf',
            clean_text_methods=cleaning_methods,
            clean_text_stopwords_lang='english', 
            clean_text_lemmatize=True,    
            clean_text_stem=False,         
            max_features=5000,   # Ограничение на количество признаков (слов/н-грам)
            min_df=2,            # Игнорировать термины, которые встречаются менее чем в 2 документах
            max_df=0.95,         # Игнорировать термины, которые встречаются более чем в 95% документов
            ngram_range=(1, 2)   # Учитывать унарные и биграммы
        )


Building text preprocessing pipeline for column 'text_'...
Text preprocessing pipeline built.


In [8]:
for i, (name, transformer) in enumerate(text_preprocessing_pipeline.steps):
    print(f"{i+1}. {name}: {type(transformer).__name__}")
text_preprocessing_pipeline.fit(X_train_df, y_train)


1. text_cleaner: TextCleanerTransformer
2. vectorizer: TfidfVectorizer


0,1,2
,steps,"[('text_cleaner', ...), ('vectorizer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,text_column,'text_'
,methods,"['lower', 'remove_punctuation', ...]"
,stop_words_lang,'english'
,lemmatize,True
,stem,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [9]:
X_train_vectorized = text_preprocessing_pipeline.transform(X_train_df)
X_test_vectorized = text_preprocessing_pipeline.transform(X_test_df)

In [10]:
text_preprocessing_pipeline.fit(X_train_df, y_train)
fitted_vectorizer = text_preprocessing_pipeline.named_steps['vectorizer']

feature_names = fitted_vectorizer.get_feature_names_out()

print(f"Количество признаков после векторизации: {len(feature_names)}")
print(f"Примеры признаков: {feature_names[:10]}")
feature_importance = np.array(X_train_vectorized.sum(axis=0)).flatten()

top_n_features = 30 
if len(feature_importance) > 0:
     top_features_idx = feature_importance.argsort()[-top_n_features:][::-1]

     print(f"\nТоп-{top_n_features} самых 'активных' признаков (по сумме весов):")
     for idx in top_features_idx:
         print(f"{feature_names[idx]}: {feature_importance[idx]:.2f}")

Количество признаков после векторизации: 5000
Примеры признаков: ['aa' 'ability' 'able' 'able find' 'able get' 'able put' 'able use'
 'abrupt' 'absolute' 'absolutely']

Топ-30 самых 'активных' признаков (по сумме весов):
book: 906.76
love: 862.25
great: 858.62
good: 728.11
one: 722.17
story: 579.38
well: 559.61
read: 537.30
like: 532.69
would: 496.63
little: 486.48
work: 466.39
movie: 465.00
bought: 425.18
get: 419.16
fit: 408.75
character: 408.33
nice: 407.69
dog: 393.92
really: 390.56
use: 386.29
time: 384.78
small: 371.41
quality: 369.98
product: 362.39
easy: 357.80
size: 354.58
also: 337.79
recommend: 330.58
year: 317.99


In [11]:
from common_lib.models.baseline_models import BaselineModelTrainer
trainer = BaselineModelTrainer()
supported_models = trainer.get_supported_models()
print("\nПоддерживаемые модели в BaselineModelTrainer:")
display(supported_models)


Поддерживаемые модели в BaselineModelTrainer:


['logistic_regression', 'naive_bayes', 'linear_svc']

In [12]:
target_names = None
results = {} 
if 'label_mapping' in locals() and label_mapping:
     reverse_label_mapping = {v: k for k, v in label_mapping.items()}
     all_encoded_labels = sorted(list(set(y_train.tolist() + y_test.tolist())))
     try:
         target_names = [reverse_label_mapping[i] for i in all_encoded_labels]
         print(f"Используются имена классов из маппинга: {target_names}")
     except KeyError:
         print("Предупреждение: Некоторые закодированные метки не найдены в label_mapping. Используются числовые метки по умолчанию.")
         target_names = None 

Используются имена классов из маппинга: ['CG', 'OR']


In [13]:
for model_name in supported_models: 
    y_pred, model = trainer.train_and_predict(model_name, X_train_vectorized, y_train, X_test_vectorized)
    report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
    results[model_name] = report
    print(classification_report(y_test, y_pred, target_names=target_names))

--- Training logistic_regression ---
Using model parameters: {'class_weight': 'balanced', 'random_state': 42, 'max_iter': 1000, 'solver': 'liblinear'}
Training complete.
Making predictions on the test set...
              precision    recall  f1-score   support

          CG       0.90      0.89      0.89      5093
          OR       0.89      0.90      0.89      5010

    accuracy                           0.89     10103
   macro avg       0.89      0.89      0.89     10103
weighted avg       0.89      0.89      0.89     10103

--- Training naive_bayes ---
Using model parameters: {'alpha': 1.0}
Training complete.
Making predictions on the test set...
              precision    recall  f1-score   support

          CG       0.89      0.84      0.87      5093
          OR       0.85      0.89      0.87      5010

    accuracy                           0.87     10103
   macro avg       0.87      0.87      0.87     10103
weighted avg       0.87      0.87      0.87     10103

--- Training 

In [14]:
from sklearn.pipeline import Pipeline

full_pipeline = Pipeline([
    ('preprocessing', text_preprocessing_pipeline),
    ('classifier', LogisticRegression(class_weight='balanced', max_iter=2000, solver='liblinear'))
])


from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100]
}

grid_search = GridSearchCV(
    full_pipeline,
    param_grid,
    cv=5,
    scoring='f1_weighted',
    verbose=1,
    n_jobs=-1 
)

grid_search.fit(X_train_df, y_train)

best_model = grid_search.best_estimator_
print(f"\nЛучший параметр C: {grid_search.best_params_}")
print(f"Лучший F1-score на кросс-валидации: {grid_search.best_score_:.4f}")

y_pred_tuned = best_model.predict(X_test_df)
report_tuned = classification_report(y_test, y_pred_tuned, target_names=target_names, output_dict=True)
results['logistic_regression_tuned'] = report_tuned

print("\nОтчет по классификации для Logistic Regression с лучшими параметрами:")
print(classification_report(y_test, y_pred_tuned, target_names=target_names))

Fitting 5 folds for each of 5 candidates, totalling 25 fits

Лучший параметр C: {'classifier__C': 10}
Лучший F1-score на кросс-валидации: 0.8884

Отчет по классификации для Logistic Regression с лучшими параметрами:
              precision    recall  f1-score   support

          CG       0.90      0.90      0.90      5093
          OR       0.89      0.90      0.90      5010

    accuracy                           0.90     10103
   macro avg       0.90      0.90      0.90     10103
weighted avg       0.90      0.90      0.90     10103



In [15]:
best_full_pipeline_config = grid_search.best_estimator_
best_full_pipeline_config.fit(X_train_df, y_train)


0,1,2
,steps,"[('preprocessing', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('text_cleaner', ...), ('vectorizer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,text_column,'text_'
,methods,"['lower', 'remove_punctuation', ...]"
,stop_words_lang,'english'
,lemmatize,True
,stem,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'liblinear'
,max_iter,2000


In [16]:
from common_lib.visualization import Vizualizator

viz = Vizualizator(best_model, fitted_vectorizer, label_mapping)
viz.print_top_words()

AttributeError: 'Pipeline' object has no attribute 'coef_'

In [None]:
import joblib
model_data = {
    'model': best_model,
    'pipeline': text_preprocessing_pipeline,
    'label_mapping': label_mapping
}

joblib.dump(model_data, '../trained_models/model_complete.pkl')
print("Модель, pipeline и маппинг меток сохранены в model_complete.pkl")

In [17]:
import joblib
import os

model_artifacts = {
    'full_pipeline': best_full_pipeline_config,
    'label_mapping': label_mapping
}

save_path = '../trained_models/model_artifacts.pkl'
os.makedirs(os.path.dirname(save_path), exist_ok=True)

joblib.dump(model_artifacts, save_path)

print(f"Артефакты (единый pipeline и маппинг) сохранены в {save_path}")

Артефакты (единый pipeline и маппинг) сохранены в ../trained_models/model_artifacts.pkl


In [18]:
label_mapping

{'CG': 0, 'OR': 1}