In [26]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from source.data import DataLoader, DataExplorer, DataPreparer
from source.models.baseline_models import BaselineModelTrainer


In [2]:
# Loader object whose load_from_kaggle method is used below to read the dataset.
loader = DataLoader()

In [3]:
# Keyword arguments for loader.load_from_kaggle describing the fake-reviews dataset.
mexwell_set = {
    "link": "mexwell/fake-reviews-dataset",
    "filename": "fake reviews dataset.csv",
}

In [16]:
# Load the fake-reviews data; the descriptor's keys are expanded into keyword arguments.
mexwell_df = loader.load_from_kaggle(**mexwell_set)

../data\mexwell\fake-reviews-dataset\fake reviews dataset.csv


In [17]:
# Exploration helper built on the loaded data; used by the preview and summary cells below.
explorer = DataExplorer(mexwell_df)

In [18]:
# Preview the first rows of the raw data.
explorer.show_head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...
5,Home_and_Kitchen_5,3.0,CG,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.
6,Home_and_Kitchen_5,5.0,CG,They are the perfect touch for me and the only...
7,Home_and_Kitchen_5,3.0,CG,These done fit well and look great. I love th...
8,Home_and_Kitchen_5,5.0,CG,"Great big numbers & easy to read, the only thi..."
9,Home_and_Kitchen_5,5.0,CG,My son loves this comforter and it is very wel...


In [19]:
# Print the dataset summary, telling the explorer which columns hold text, labels and ratings.
explorer.print_summary(
    text_column="text_",
    label_column="label",
    rating_column="rating",
)

СВОДКА ПО ДАННЫМ
Размерность данных: (40432, 4)
Количество колонок: 4
Колонки: category, rating, label, text_

Пропущенные значения отсутствуют

АНАЛИЗ ТЕКСТА:
  Длина текста - мин: 24, макс: 2827, среднее: 351.27
  Топ-5 слов: the(157256), a(96886), i(96841), and(87616), it(68590)

РАСПРЕДЕЛЕНИЕ РЕЙТИНГОВ:
  1.0: 2155 (5.3%)
  2.0: 1967 (4.9%)
  3.0: 3786 (9.4%)
  4.0: 7965 (19.7%)
  5.0: 24559 (60.7%)

РЕЙТИНГИ ПО МЕТКАМ (%):
label      CG     OR  Всего
rating                     
1.0      5.26   5.40   5.33
2.0      4.76   4.97   4.86
3.0      9.66   9.07   9.36
4.0     19.39  20.01  19.70
5.0     60.94  60.55  60.74

РАСПРЕДЕЛЕНИЕ МЕТОК:
  CG: 20216 (50.0%)
  OR: 20216 (50.0%)



In [21]:
# Run the preparation pipeline on the raw frame: fill missing values, encode the
# label column, then clean and lemmatize the review text.
mexwell_preparer = DataPreparer(mexwell_df)

# NOTE(review): the data summary reported no missing values, so the 'fill' step is
# presumably a no-op for this dataset. If labels were ever missing, the -1
# placeholder would be passed on to label encoding - confirm that is intended.
processed_preparer = mexwell_preparer.prepare_df(
    handle_missing_strategy='fill',
    handle_missing_columns=['text_', 'label'],
    handle_missing_fill_value={'text_': '', 'label': -1},
    drop_duplicates_subset=None,
    encode_label_col='label',
    clean_text_col='text_',
    clean_text_methods=[
        'lower',
        'remove_punctuation',
        'remove_numbers',
        'remove_whitespace',
        'remove_stopwords',
    ],
    clean_text_lemmatize=True,
)

# Keep the prepared frame and the label mapping for the following cells.
processed_df = processed_preparer.get_result()
label_map = processed_preparer.get_label_mapping()

Starting data preparation...
Handling missing values using strategy: 'fill'...
DataFrame shape after handling missing values: (40432, 4)
Encoding labels in 'label'. Mapping created: {'CG': 0, 'OR': 1}
Cleaning text column: 'text_'...
Text cleaning applied to 'text_'.
Data preparation complete.


In [32]:
# Class names ordered by their encoded integer, for use as report row labels.
label_map = processed_preparer.get_label_mapping()
target_names = sorted(label_map, key=label_map.get)

In [22]:
# Inspect the first rows after preparation (encoded labels, cleaned text).
processed_df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,0,love well made sturdy comfortable love itvery ...
1,Home_and_Kitchen_5,5.0,0,love great upgrade original ive mine couple year
2,Home_and_Kitchen_5,5.0,0,pillow saved back love look feel pillow
3,Home_and_Kitchen_5,1.0,0,missing information use great product price
4,Home_and_Kitchen_5,5.0,0,nice set good quality set two month


In [24]:
# Turn the cleaned text into a TF-IDF feature matrix (capped at 5000 features)
# and extract the encoded labels.
# NOTE(review): vectorization runs on the whole dataset before the train/test
# split below, so the vectorizer also sees rows that end up in the test set -
# confirm this is acceptable for the reported test metrics.
X_tfidf, y_labels = processed_preparer.get_features_and_labels(
    text_column='text_',
    label_column='label',
    vectorizer_method='tfidf',
    max_features=5000,
)

Vectorizing text from 'text_' using 'tfidf'...
Feature matrix shape: (40432, 5000)
Extracted labels from 'label'. Shape: (40432,)


In [27]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y_labels,
    test_size=0.2,
    random_state=42,
)

In [29]:
# BaselineModelTrainer is imported in the notebook's top import cell.
# Instantiate the trainer and list the model names it can train.
trainer = BaselineModelTrainer()
supported_models = trainer.get_supported_models()
supported_models

['logistic_regression', 'naive_bayes', 'linear_svc']

In [38]:
# Train every supported baseline on the training split and evaluate it on the
# held-out test split. classification_report is imported in the top import cell.
trainer = BaselineModelTrainer()
results = {}  # model name -> classification report in dict form

for model_name in trainer.get_supported_models():
    y_pred, model = trainer.train_and_predict(model_name, X_train, y_train, X_test)
    # Dict form is stored for later comparison; the text form is printed for reading.
    report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
    results[model_name] = report
    print(classification_report(y_test, y_pred, target_names=target_names))

--- Training logistic_regression ---
Using model parameters: {'class_weight': 'balanced', 'random_state': 42, 'max_iter': 1000, 'solver': 'liblinear'}
Training complete.
Making predictions on the test set...
              precision    recall  f1-score   support

          CG       0.88      0.85      0.87      4016
          OR       0.86      0.89      0.87      4071

    accuracy                           0.87      8087
   macro avg       0.87      0.87      0.87      8087
weighted avg       0.87      0.87      0.87      8087

--- Training naive_bayes ---
Using model parameters: {'alpha': 1.0}
Training complete.
Making predictions on the test set...
              precision    recall  f1-score   support

          CG       0.83      0.87      0.85      4016
          OR       0.86      0.83      0.85      4071

    accuracy                           0.85      8087
   macro avg       0.85      0.85      0.85      8087
weighted avg       0.85      0.85      0.85      8087

--- Training 

In [37]:
# Tune the regularisation strength C of logistic regression with 5-fold
# cross-validation on the training split, then score the best estimator on the
# test split. GridSearchCV, LogisticRegression and classification_report are
# imported in the notebook's top import cell.
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

grid_search = GridSearchCV(
    # Same settings as the logistic_regression baseline, including the seed, so
    # the tuned model is reproducible and differs from the baseline only in C.
    LogisticRegression(
        class_weight='balanced',
        random_state=42,
        max_iter=1000,
        solver='liblinear',
    ),
    param_grid,
    cv=5,
    scoring='f1_weighted',
    verbose=1,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f"\nЛучший параметр C: {grid_search.best_params_}")
print(f"Лучший F1-score на кросс-валидации: {grid_search.best_score_:.4f}")

# Evaluate the refitted best estimator on the held-out test split and store the
# report next to the baseline reports.
y_pred_tuned = best_model.predict(X_test)
report_tuned = classification_report(y_test, y_pred_tuned, target_names=target_names, output_dict=True)
results['logistic_regression_tuned'] = report_tuned

print("\nОтчет по классификации для Logistic Regression с лучшими параметрами:")
print(classification_report(y_test, y_pred_tuned, target_names=target_names))

Fitting 5 folds for each of 5 candidates, totalling 25 fits

Лучший параметр C: {'C': 10}
Лучший F1-score на кросс-валидации: 0.8707

Отчет по классификации для Logistic Regression с лучшими параметрами:
              precision    recall  f1-score   support

          CG       0.88      0.87      0.88      4016
          OR       0.88      0.88      0.88      4071

    accuracy                           0.88      8087
   macro avg       0.88      0.88      0.88      8087
weighted avg       0.88      0.88      0.88      8087



### Анализ результатов
#### Лучшая модель: 
Судя по сводной таблице, наилучший результат по метрике F1-score (weighted) показали модели logistic_regression_tuned (0.88) и linear_svc. Это подтверждает, что подбор гиперпараметров позволил немного улучшить качество по сравнению с базовой логистической регрессией (0.88 против 0.87 на тестовой выборке).

#### Сравнение моделей: 
Все три базовые модели (logistic_regression, naive_bayes, linear_svc) показали схожие результаты, что часто бывает на задачах классификации текста. Модель linear_svc немного опередила naive_bayes (F1 0.85) и logistic_regression (F1 0.87).

#### Анализ по классам (см. classification_report):
Метрики для классов 'CG' и 'OR' сбалансированы, что говорит об отсутствии явного перекоса в предсказаниях модели в сторону одного из классов. Если бы для одного из классов recall был значительно ниже, это указывало бы на проблему.

### Вывод:
 Простые линейные модели отлично справляются с данной задачей. Оптимизация гиперпараметров дает дополнительный, хоть и не кардинальный, прирост качества. В качестве финальной модели для развертывания рекомендуется logistic_regression_tuned.

In [39]:
# Descriptors for further datasets, in the same link/filename form as mexwell_set.
yelp_set = {
    "link": "thedevastator/yelp-reviews-sentiment-dataset",
    "filename": "train.csv",
}
flipkart_set = {
    "link": "niraliivaghani/flipkart-product-customer-reviews-dataset",
    "filename": "Dataset-SA.csv",
}
amazon_set = {
    "link": "dongrelaxman/amazon-reviews-dataset",
    "filename": "Amazon_Reviews.csv",
}
