In [10]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import spacy
import ru_core_news_lg

from tqdm import tqdm
tqdm.pandas()

In [12]:
class Scorer:
    
    def __init__(self):
        self.pipe_count_bigrams = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
                            ('lr', LogisticRegression())])
        self.pipe_count_lr = Pipeline([('lr', LogisticRegression())])
        self.pipe_svc = Pipeline([('svc', LinearSVC(penalty='l1', C=0.55, fit_intercept=False, dual=False, tol=1e-10, max_iter=100000))])
        
    def data_in_sentences(self, df, labels):
        cross_score_bigrams = cross_val_score(
            self.pipe_count_bigrams, df, labels, scoring='accuracy', cv=5, n_jobs=-1)
        print('(Count Bigram + Unigram Vectorizer + LR) mean score: ', cross_score_bigrams.mean())

In [13]:
def binarize(x):
    return 0 if x == 'negative' else 1

train_df = pd.read_csv('clothing.csv', sep='\t')
drop_index = train_df[train_df['sentiment'] == 'neautral'].index
train_df.drop(drop_index, inplace=True)
train_df.reset_index(drop=True)
# print(train_df)
X_train, y_train = train_df['review'], train_df['sentiment']
y_train = y_train.apply(binarize)
scorer = Scorer()_
scorer.data_in_sentences(X_train, y_train)  # without preprocessing

(Count Bigram + Unigram Vectorizer + LR) mean score:  0.9391333333333334


In [None]:
pipe_count_bigrams = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
                            ('lr', LogisticRegression(n_jobs=-1))])
parameters = {'vectorizer__min_df': [0, 0.05, 0.1, 0.2],
              'vectorizer__max_df': [1, 0.95, 0.9],
              'lr__penalty': ['l1', 'l2'],
              'lr__C': [1, 0.9, 0.5, 0.05]
              }
grid = GridSearchCV(pipe_count_bigrams, parameters, cv=5, verbose=10)
grid.fit(X_train, y_train)

In [26]:
cross_val_score(
            grid.best_estimator_, X_train, y_train, scoring='accuracy', cv=5, n_jobs=-1).mean()

0.9391333333333334

In [31]:
import pickle

best_pipe = grid.best_estimator_
best_pipe.fit(X_train, y_train)

with open("model.pkl", "wb") as f:
    pickle.dump(best_pipe, f)