In [6]:
from sklearn.datasets import load_files
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import pickle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [7]:
dataset = load_files("../data/train/", categories=['pos', 'neg'], encoding='utf-8')
X, y = dataset.data, dataset.target
df = pd.DataFrame({'review': X, 'sentiment': y})
df.head()

Unnamed: 0,review,sentiment
0,"Zero Day leads you to think, even re-think why...",1
1,Words can't describe how bad this movie is. I ...,0
2,Everyone plays their part pretty well in this ...,1
3,There are a lot of highly talented filmmakers/...,0
4,I've just had the evidence that confirmed my s...,0


Проведем векторизацию текста с использованием TFIDF, а так же выделение слов положительной и отрицательной тональности с помощью SentimentIntensityAnalyzer()

In [8]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_tfidf = tfidf_vectorizer.fit_transform(df['review'])
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

def get_sentiment_scores(text):
    sentiment_scores = sia.polarity_scores(text)
    return [sentiment_scores['neg'], sentiment_scores['pos']]

sentiment_features = pd.DataFrame([get_sentiment_scores(review) for review in df['review']],
                                  columns=['neg', 'pos'])

X = pd.concat([tfidf_df, sentiment_features], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/serttyzar/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Baseline - LogisticRegression

In [9]:
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)
y_pred = logistic_regression_model.predict(X_test)
y_proba = logistic_regression_model.predict_proba(X_test)[:, 1]
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f'ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}')
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

Accuracy: 0.8794
ROC-AUC: 0.9480
              precision    recall  f1-score   support

    Negative       0.88      0.87      0.88      2482
    Positive       0.87      0.89      0.88      2518

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [11]:
with open('../models/logistic_model.pkl', 'wb') as model_file:
    pickle.dump(logistic_regression_model, model_file)
with open('../models/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
with open('../models/sentiment_analyzer.pkl', 'wb') as sia_file:
    pickle.dump(sia, sia_file)

Try to make some other models:
RandomForest and GradientBoostingClassifier

In [6]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
models = {'Random Forest': rf_model, 'Gradient Boosting': gb_model}

for model_name, model in models.items():
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f'ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}')

Model: Random Forest
Accuracy: 0.8428
ROC-AUC: 0.9229
Model: Gradient Boosting
Accuracy: 0.8090
ROC-AUC: 0.8941


Оценки этих моделей ниже чем у регрессии. Это может быть связано с тем, что данные модели требуют тщательного подбора гиперпараметров, а возможно данные достаточно простые, обладают линейной зависимостью и использование данных моделей излишне.

Проверим регрессию и лес на тестовом датасете

In [7]:
dataset = load_files("../data/test/", categories=['pos', 'neg'], encoding='utf-8')
X_test_data, y_test_data = dataset.data, dataset.target
df_test = pd.DataFrame({'review': X_test_data, 'sentiment': y_test_data})
X_tfidf_test_data = tfidf_vectorizer.transform(df_test['review'])
tfidf_df_test_data = pd.DataFrame(X_tfidf_test_data.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
sentiment_features_test = pd.DataFrame([get_sentiment_scores(review) for review in df_test['review']],
                                  columns=['neg', 'pos'])
X_test_data = pd.concat([tfidf_df_test_data, sentiment_features_test], axis=1)

In [8]:
print(f"Model: Logistic Regression")
y_pred = logistic_regression_model.predict(X_test_data)
y_proba = logistic_regression_model.predict_proba(X_test_data)[:, 1]
print(f"Accuracy: {accuracy_score(y_test_data, y_pred):.4f}")
print(f'ROC-AUC: {roc_auc_score(y_test_data, y_proba):.4f}')
print(classification_report(y_test_data, y_pred, target_names=['Negative', 'Positive']))

print(f"Model: Random Forest")
y_pred = rf_model.predict(X_test_data)
y_proba = rf_model.predict_proba(X_test_data)[:, 1]
print(f"Accuracy: {accuracy_score(y_test_data, y_pred):.4f}")
print(f'ROC-AUC: {roc_auc_score(y_test_data, y_proba):.4f}')
print(classification_report(y_test_data, y_pred, target_names=['Negative', 'Positive']))

Model: Logistic Regression
Accuracy: 0.8753
ROC-AUC: 0.9467
              precision    recall  f1-score   support

    Negative       0.87      0.88      0.88     12500
    Positive       0.88      0.87      0.87     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

Model: Random Forest
Accuracy: 0.8456
ROC-AUC: 0.9261
              precision    recall  f1-score   support

    Negative       0.83      0.86      0.85     12500
    Positive       0.86      0.83      0.84     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



Для лучшего обучения модели обучим ее на всех доступных данных с предобработкой текста

In [9]:
df_full = pd.concat([df, df_test], ignore_index=True)
df_full = df_full.drop_duplicates(subset='review', keep='first')
df_full.head()

Unnamed: 0,review,sentiment
0,"Zero Day leads you to think, even re-think why...",1
1,Words can't describe how bad this movie is. I ...,0
2,Everyone plays their part pretty well in this ...,1
3,There are a lot of highly talented filmmakers/...,0
4,I've just had the evidence that confirmed my s...,0


: 

In [None]:
y_full = df_full['sentiment']
tfidf_vectorizer_prod = TfidfVectorizer(stop_words='english', max_features=6000, max_df=0.9, min_df=5)
X_full_tfidf = tfidf_vectorizer_prod.fit_transform(df_full['review'])
tfidf_df_full = pd.DataFrame(X_full_tfidf.toarray(), columns=tfidf_vectorizer_prod.get_feature_names_out())

sentiment_features_full = pd.DataFrame([get_sentiment_scores(review) for review in df_full['review']],
                                       columns=['neg', 'pos'])

X_full_combined = pd.concat([tfidf_df_full, sentiment_features_full], axis=1)

logistic_regression_model_no_scale = LogisticRegression(max_iter=1000)
cv_scores_no_scale = cross_val_score(logistic_regression_model_no_scale, X_full_combined, y_full, cv=5, scoring='accuracy')
logistic_regression_model_no_scale.fit(X_full_combined, y_full)
print(f"Cross-validation accuracy scores (no scaling): {cv_scores_no_scale}")
print(f"Mean CV accuracy (no scaling): {cv_scores_no_scale.mean():.4f}")


scaler = StandardScaler()
X_full_combined_scaled = scaler.fit_transform(X_full_combined)
logistic_regression_model_with_scale = LogisticRegression(max_iter=1000)
cv_scores_with_scale = cross_val_score(logistic_regression_model_with_scale, X_full_combined_scaled, y_full, cv=5, scoring='accuracy')
logistic_regression_model_with_scale.fit(X_full_combined_scaled, y_full)
print(f"Cross-validation accuracy scores (with scaling): {cv_scores_with_scale}")
print(f"Mean CV accuracy (with scaling): {cv_scores_with_scale.mean():.4f}")