In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import os

# Step 1. Load the data
def load_data(directory):
    """Read labelled review texts from ``directory``.

    The directory must contain two sub-folders, ``pos`` and ``neg``. Every
    regular file inside them is read as UTF-8 text and becomes one sample.

    Args:
        directory: Path to the folder holding the ``pos`` and ``neg``
            sub-folders.

    Returns:
        A ``(reviews, labels)`` tuple of two parallel lists: the raw file
        contents and the matching labels (1 for ``pos``, 0 for ``neg``).
        All ``pos`` samples come first; within a class, samples are ordered
        by file name.

    Raises:
        OSError: If a class sub-folder is missing or a file cannot be read
            (propagated from ``os.listdir`` / ``open``).
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    reviews = []
    labels = []
    for label in ['pos', 'neg']:
        folder = os.path.join(directory, label)
        target = 1 if label == 'pos' else 0
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and anything order-dependent downstream) reproducible.
        for file in sorted(os.listdir(folder)):
            path = os.path.join(folder, file)
            # Skip sub-directories such as .ipynb_checkpoints, which open()
            # would reject with an error.
            if not os.path.isfile(path):
                continue
            with open(path, 'r', encoding='utf-8') as f:
                reviews.append(f.read())
            labels.append(target)
    return reviews, labels

# Dataset locations, given as paths relative to the current working directory.
train_dir, test_dir = 'train', 'test'

# Each split yields the raw review texts plus a parallel list of 0/1 labels.
train_reviews, train_labels = load_data(train_dir)
test_reviews, test_labels = load_data(test_dir)

# Step 2. Vectorize the text
# TF-IDF over 1- to 3-grams with English stop words removed. Terms that occur
# in fewer than 5 documents or in more than 70% of documents are discarded,
# and the vocabulary is capped at 10000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    min_df=5,
    max_df=0.7,
    max_features=10000,
)

# Learn the vocabulary and IDF weights on the training reviews only, then
# reuse the fitted vectorizer for the test reviews.
X_train = vectorizer.fit_transform(train_reviews)
X_test = vectorizer.transform(test_reviews)

# Step 3. Logistic regression tuned with GridSearchCV
# Only C and max_iter vary; the other entries pin a single value each, so the
# search covers 3 x 2 = 6 candidates.
param_grid_logreg = {
    'C': [0.1, 1, 10],
    'max_iter': [100, 200],
    'penalty': ['l2'],
    'solver': ['liblinear'],
    'class_weight': ['balanced']
}

# Fix the seed: scikit-learn documents random_state as being used by the
# liblinear solver, so pinning it keeps repeated runs comparable.
logreg = LogisticRegression(random_state=42)
grid_search_logreg = GridSearchCV(logreg, param_grid_logreg, cv=5, scoring='accuracy', n_jobs=-1)
# NOTE(review): X_train was vectorized on the full training set before the
# cross-validation split, so the per-fold scores may be slightly optimistic.
# The held-out test accuracy printed below is not affected by this.
grid_search_logreg.fit(X_train, train_labels)

# Evaluate the best configuration found by the search on the held-out test set.
best_logreg_model = grid_search_logreg.best_estimator_
logreg_predictions = best_logreg_model.predict(X_test)

print(f'Best parameters for Logistic Regression: {grid_search_logreg.best_params_}')
print(f'Accuracy with Logistic Regression: {accuracy_score(test_labels, logreg_predictions)}')


Best parameters for Logistic Regression: {'C': 1, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy with Logistic Regression: 0.88128


In [9]:
import joblib

# Persist the tuned classifier.
joblib.dump(value=best_logreg_model, filename='model.pkl')

# Persist the fitted vectorizer as well: new texts must be transformed with
# the same vocabulary before they are passed to the saved model.
joblib.dump(value=vectorizer, filename='vectorizer.pkl')

['vectorizer.pkl']