In [None]:
import gzip
import nltk
import string
import numpy as np

from pathlib import Path
from sklearn.pipeline import Pipeline
from collections import Counter, defaultdict
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import sent_tokenize, wordpunct_tokenize

# Data

In [None]:
# Load the corpus. Each line of reviews.txt becomes one element of X (trailing
# newline kept) and each line of marks.txt is parsed as an integer into y.
# Presumably line i of marks.txt is the mark for line i of reviews.txt -- confirm.
# NOTE(review): the files are opened with the platform default encoding; pass an
# explicit encoding here if the corpus encoding is known.
# Context managers close both file handles instead of leaking them.
with Path('reviews.txt').open() as reviews_file:
    X = np.array(reviews_file.readlines())
with Path('marks.txt').open() as marks_file:
    y = np.array([int(it) for it in marks_file.readlines()])

# Fail early with a clear message if reviews and marks do not line up.
assert len(X) == len(y), f'{len(X)} reviews but {len(y)} marks'

In [None]:
# One stratified shuffle split with a fixed seed; 2% of the samples go to validation.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.02, random_state=0)
train_index, val_index = next(sss.split(X, y))

# Slice features and targets with the same index arrays so they stay aligned.
X_train, X_val = X[train_index], X[val_index]
y_train, y_val = y[train_index], y[val_index]

# Tools

In [None]:
def fix_y(y: [float]) -> [int]:
    """Turn raw numeric predictions into integer marks in the range 1..10.

    Each value is first clamped to [1, 10] and then rounded with the built-in
    ``round`` (ties go to the nearest even integer).

    :param y: iterable of numeric predictions.
    :return: list of ints, one per input value.
    """
    marks = []
    for value in y:
        clamped = max(1, min(10, value))
        marks.append(int(round(clamped)))
    return marks


def loss(y_true: [int], y_pred: [int]):
    """Return the root-mean-square error between true and predicted values.

    Both arguments are converted to float NumPy arrays first, so plain Python
    lists (as the annotations suggest) work as well as arrays, and unsigned
    integer inputs cannot wrap around during subtraction or squaring.

    :param y_true: sequence of reference values.
    :param y_pred: sequence of predicted values, same length as ``y_true``.
    :return: the RMSE as a NumPy float.
    """
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((true_values - predicted_values) ** 2))

In [None]:
# Punctuation characters: everything in string.punctuation plus a few extra symbols.
PUNKT = {*string.punctuation, *"«»№_—"}
# NLTK's Russian stop-word list, stored as a set.
STOP_WORDS = {*stopwords.words("russian")}

In [None]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer also stems every token it produces.

    Tokenization, lower-casing, stop-word removal and n-gram generation are done
    by the analyzer of the parent class; stemming is applied to its output.
    """

    # One stemmer instance shared by all vectorizers of this class.
    # RussianStemmer's constructor parameter is ``ignore_stopwords``; passing a bare
    # positional string (e.g. "stemmer") is truthy and silently switches it on, so
    # the flag is given by keyword to make that behavior explicit.
    stemmer = RussianStemmer(ignore_stopwords=True)

    def build_analyzer(self):
        """Return a callable that maps a raw document to a list of stemmed tokens."""
        analyzer = super().build_analyzer()
        stem = self.stemmer.stem

        def stemmed_analyzer(doc):
            return [stem(word) for word in analyzer(doc)]

        return stemmed_analyzer
    

class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer also stems every token it produces.

    Tokenization, lower-casing, stop-word removal and n-gram generation are done
    by the analyzer of the parent class; stemming is applied to its output.
    """

    # One stemmer instance shared by all vectorizers of this class.
    # RussianStemmer's constructor parameter is ``ignore_stopwords``; passing a bare
    # positional string (e.g. "stemmer") is truthy and silently switches it on, so
    # the flag is given by keyword to make that behavior explicit.
    stemmer = RussianStemmer(ignore_stopwords=True)

    def build_analyzer(self):
        """Return a callable that maps a raw document to a list of stemmed tokens."""
        analyzer = super().build_analyzer()
        stem = self.stemmer.stem

        def stemmed_analyzer(doc):
            return [stem(word) for word in analyzer(doc)]

        return stemmed_analyzer

In [None]:
def train_and_test_pipeline(X_train, y_train, X_val, y_val, pipeline, reg=False):
    """Fit ``pipeline`` on the training data and print train/validation RMSE.

    :param X_train: training inputs passed to ``pipeline.fit`` / ``pipeline.predict``.
    :param y_train: training targets.
    :param X_val: validation inputs.
    :param y_val: validation targets.
    :param pipeline: object exposing ``fit(X, y)`` and ``predict(X)``.
    :param reg: when true, predictions are passed through ``fix_y`` (clamped to
        1..10 and rounded) before scoring.
    :return: None; the two RMSE values are printed.
    """
    pipeline.fit(X_train, y_train)

    train_predictions, val_predictions = (
        pipeline.predict(features) for features in (X_train, X_val)
    )

    if reg:
        # Regressors emit arbitrary floats; map them onto valid integer marks.
        train_predictions, val_predictions = fix_y(train_predictions), fix_y(val_predictions)

    train_rmse = loss(y_train, train_predictions)
    print(f'Train RMSE = {train_rmse:0.2f}')
    val_rmse = loss(y_val, val_predictions)
    print(f'Validate RMSE = {val_rmse:0.2f}')

# Classification approaches

## Naive Bayes

### Multinomial

In [None]:
# Stemmed bag-of-words counts feeding a multinomial Naive Bayes classifier.
text_clf = Pipeline([
    # scikit-learn documents ``stop_words`` as a list and recent releases reject a
    # set, so the stop words are converted; sorting keeps the parameter repr stable.
    # min_df / max_df drop terms found in under 0.2% or over 80% of the documents.
    ('vect', StemmedCountVectorizer(stop_words=sorted(STOP_WORDS), min_df=0.002, max_df=0.8)),
    ('clf', MultinomialNB())
])

In [None]:
# Fit the current classifier pipeline and print its train/validation RMSE.
train_and_test_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    pipeline=text_clf,
)

### Bernoulli

In [None]:
# Stemmed bag-of-words counts feeding a Bernoulli Naive Bayes classifier.
text_clf = Pipeline([
    # scikit-learn documents ``stop_words`` as a list and recent releases reject a
    # set, so the stop words are converted; sorting keeps the parameter repr stable.
    # min_df / max_df drop terms found in under 0.2% or over 80% of the documents.
    ('vect', StemmedCountVectorizer(stop_words=sorted(STOP_WORDS), min_df=0.002, max_df=0.8)),
    ('clf', BernoulliNB())
])

In [None]:
# Fit the current classifier pipeline and print its train/validation RMSE.
train_and_test_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    pipeline=text_clf,
)

## SGDClassifier

### CountVectorizer

In [None]:
# Stemmed bag-of-words counts feeding a linear model trained with SGD on a
# squared-error loss, with balanced class weights.
text_clf = Pipeline([
    # scikit-learn documents ``stop_words`` as a list and recent releases reject a
    # set, so the stop words are converted; sorting keeps the parameter repr stable.
    ('vect', StemmedCountVectorizer(stop_words=sorted(STOP_WORDS), min_df=0.002, max_df=0.8)),
    # The loss formerly spelled 'squared_loss' was renamed to 'squared_error' in
    # scikit-learn 1.0 and the old name was removed in 1.2.
    # tol=None disables early stopping, so exactly max_iter epochs are run.
    ('clf', SGDClassifier(loss='squared_error',
                          penalty='elasticnet',
                          class_weight='balanced',
                          random_state=0,
                          max_iter=5,
                          n_jobs=4,
                          tol=None))
])

In [None]:
# Fit the current classifier pipeline and print its train/validation RMSE.
train_and_test_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    pipeline=text_clf,
)

### TfidfVectorizer

In [None]:
# Stemmed tf-idf features feeding a linear model trained with SGD on hinge loss.
text_clf = Pipeline([
    # scikit-learn documents ``stop_words`` as a list and recent releases reject a
    # set, so the stop words are converted; sorting keeps the parameter repr stable.
    ('vect', StemmedTfidfVectorizer(stop_words=sorted(STOP_WORDS), min_df=0.002, max_df=0.8)),
    # tol=None disables early stopping, so exactly max_iter epochs are run.
    ('clf', SGDClassifier(loss='hinge',
                          penalty='elasticnet',
                          alpha=1e-3,
                          random_state=0,
                          max_iter=5,
                          n_jobs=4,
                          tol=None))
])

In [None]:
# Fit the current classifier pipeline and print its train/validation RMSE.
train_and_test_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    pipeline=text_clf,
)

## GradientBoostingClassifier

### TfidfVectorizer

In [None]:
# Stemmed tf-idf features feeding a gradient boosting classifier.
text_clf = Pipeline([
    # scikit-learn documents ``stop_words`` as a list and recent releases reject a
    # set, so the stop words are converted; sorting keeps the parameter repr stable.
    ('vect', StemmedTfidfVectorizer(stop_words=sorted(STOP_WORDS), min_df=0.002, max_df=0.8)),
    # subsample < 1 makes training stochastic; the seed is fixed (0, as in the other
    # cells) so the reported scores are reproducible across runs.
    ('clf', GradientBoostingClassifier(subsample=0.8, learning_rate=0.05, n_estimators=200,
                                       random_state=0))
])

In [None]:
# Fit the current classifier pipeline and print its train/validation RMSE.
train_and_test_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    pipeline=text_clf,
)

## RandomForestClassifier

In [None]:
# Stemmed tf-idf features feeding a random forest classifier.
text_clf = Pipeline([
    # scikit-learn documents ``stop_words`` as a list and recent releases reject a
    # set, so the stop words are converted; sorting keeps the parameter repr stable.
    ('vect', StemmedTfidfVectorizer(stop_words=sorted(STOP_WORDS), min_df=0.002, max_df=0.8)),
    # Random forests are stochastic; the seed is fixed (0, as in the other cells)
    # so the reported scores are reproducible across runs.
    ('clf', RandomForestClassifier(n_estimators=20, n_jobs=10, random_state=0))
])

In [None]:
# Fit the current classifier pipeline and print its train/validation RMSE.
train_and_test_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    pipeline=text_clf,
)

# Regression approaches

## LinearRegression

### CountVectorizer

In [None]:
# Stemmed bag-of-words counts feeding an ordinary least-squares regressor.
text_reg = Pipeline([
    # scikit-learn documents ``stop_words`` as a list and recent releases reject a
    # set, so the stop words are converted; sorting keeps the parameter repr stable.
    # min_df / max_df drop terms found in under 0.2% or over 80% of the documents.
    ('vect', StemmedCountVectorizer(stop_words=sorted(STOP_WORDS), min_df=0.002, max_df=0.8)),
    ('reg', LinearRegression())
])

In [None]:
# Fit the regression pipeline and print its train/validation RMSE;
# reg=True post-processes the raw predictions with fix_y before scoring.
train_and_test_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    pipeline=text_reg,
    reg=True,
)

### TfidfVectorizer

In [None]:
# Stemmed tf-idf features feeding an ordinary least-squares regressor.
text_reg = Pipeline([
    # scikit-learn documents ``stop_words`` as a list and recent releases reject a
    # set, so the stop words are converted; sorting keeps the parameter repr stable.
    # min_df / max_df drop terms found in under 0.2% or over 80% of the documents.
    ('vect', StemmedTfidfVectorizer(stop_words=sorted(STOP_WORDS), min_df=0.002, max_df=0.8)),
    ('reg', LinearRegression())
])

In [None]:
# Fit the regression pipeline and print its train/validation RMSE;
# reg=True post-processes the raw predictions with fix_y before scoring.
train_and_test_pipeline(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    pipeline=text_reg,
    reg=True,
)