In [47]:
from sklearn.datasets import load_files
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import pickle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [39]:
dataset = load_files("../data/train/", categories=['pos', 'neg'], encoding='utf-8')
X, y = dataset.data, dataset.target
df = pd.DataFrame({'review': X, 'sentiment': y})
df.head()

Unnamed: 0,review,sentiment
0,"Zero Day leads you to think, even re-think why...",1
1,Words can't describe how bad this movie is. I ...,0
2,Everyone plays their part pretty well in this ...,1
3,There are a lot of highly talented filmmakers/...,0
4,I've just had the evidence that confirmed my s...,0


In [40]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X = tfidf_vectorizer.fit_transform(df['review'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Baseline - LogisticRegression

In [41]:
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)
y_pred = logistic_regression_model.predict(X_test)
y_proba = logistic_regression_model.predict_proba(X_test)[:, 1]
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f'ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}')
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

Accuracy: 0.8760
ROC-AUC: 0.9506
              precision    recall  f1-score   support

    Negative       0.89      0.86      0.87      2482
    Positive       0.87      0.89      0.88      2518

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [45]:
with open('../models/logistic_model.pkl', 'wb') as model_file:
    pickle.dump(logistic_regression_model, model_file)

with open('../models/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

Try to make some other models:
RandomForest and GradientBoostingClassifier

In [48]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
models = {'Random Forest': rf_model, 'Gradient Boosting': gb_model}

for model_name, model in models.items():
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f'ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}')

Model: Random Forest
Accuracy: 0.8414
ROC-AUC: 0.9174
Model: Gradient Boosting
Accuracy: 0.8062
ROC-AUC: 0.8883


Оценки этих моделей ниже чем у регрессии. Это может быть связано с тем, что данные модели требуют тщательного подбора гиперпараметров, а возможно данные достаточно простые, обладают линейной зависимостью и использование данных моделей излишне.