# Logistic Regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Read the training texts, one per line, dropping surrounding whitespace.
with open("files/texts_train.txt") as texts_file:
    texts_train = [line.strip() for line in texts_file]

# Read the matching scores; they stay as strings at this point.
with open("files/scores_train.txt") as scores_file:
    y_train = [line.strip() for line in scores_file]



In [None]:
# Build a bag-of-bigrams representation of the training texts.
vect = CountVectorizer(ngram_range=(2, 2))
x_train = vect.fit_transform(texts_train)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on older releases.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
print("Number of features: {}".format(len(feature_names)))



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer applied to the raw texts; casing and accents are left
# untouched and no custom preprocessing is applied.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Hyper-parameter grid: bigrams only, both penalties, three values of C.
param_grid = [{'vect__ngram_range': [(2, 2)],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}]

# The default 'lbfgs' solver does not support the 'l1' penalty, so half of the
# grid would fail to fit. 'saga' supports both 'l1' and 'l2' (and multiclass
# targets); max_iter is raised because 'saga' often needs more than the
# default 100 iterations to converge.
grid = GridSearchCV(Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0, solver='saga', max_iter=1000))]),
                    param_grid,
                    scoring='accuracy',
                    cv=5,
                    verbose=1,
                    n_jobs=-1)

# The pipeline begins with a vectorizer, so it must be fitted on the raw
# texts rather than on the already-vectorized `x_train` matrix.
grid.fit(texts_train, y_train)
print('Best parameter set: ' + str(grid.best_params_))
# Use %-formatting: concatenating a str with a float raises TypeError.
print('Best accuracy: %.3f' % grid.best_score_)

In [None]:
# Read the test texts, one per line, dropping surrounding whitespace.
with open("files/dataset_40757_1.txt") as f:
    texts_test = [line.strip() for line in f]

# `best_estimator_` is the whole vectorizer + classifier pipeline, so it
# consumes raw text. Re-fitting it on the vectorized `x_train` matrix would
# feed it the wrong input type, and with GridSearchCV's default `refit=True`
# the estimator is already trained on the full training set.
clf = lr = grid.best_estimator_
preds = clf.predict(texts_test)

# One predicted label per line, in the order of the test file.
for pred in preds:
    print(pred)

## Logistic Regression with Preprocessing and Stemming

In [None]:
import re
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import RussianStemmer

import nltk
from nltk.corpus import stopwords
# Make the NLTK 'stopwords' corpus available, then load the Russian list.
# The download must come first: `stopwords.words` reads the local corpus.
nltk.download('stopwords')
stop = stopwords.words('russian')


def preprocessor(text):
    """Normalize *text* for vectorization while keeping its emoticons.

    The text is lower-cased and every run of non-word characters is replaced
    by a single space. Emoticons are collected from the original
    (case-preserved) text and appended after the cleaned text, separated by
    spaces, with any '-' nose removed. The leading "eyes" part of the
    emoticon pattern is optional, so a bare ')', '(', 'D' or 'P' also counts
    as a match.

    Args:
        text: Raw input string.

    Returns:
        The cleaned text, one space, then the space-joined emoticons (an
        empty suffix when none were found).
    """
    # Raw string literals keep the regex escapes away from Python's own
    # string-escape handling, which warns about sequences such as "\)" or
    # "\W" in ordinary literals.
    emoticons = re.findall(r'(?::?|;|=)(?:-)?(?:\)+|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    return cleaned + ' ' + ' '.join(emoticons).replace('-', '')


# Shared Snowball stemmer for Russian; used by `tokenizer_snowball` below.
stemmer = RussianStemmer()


def tokenizer(text):
    """Split *text* on whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens


def tokenizer_snowball(text):
    """Split *text* on whitespace and stem every token.

    Stemming is delegated to the module-level `stemmer` object.
    """
    return list(map(stemmer.stem, text.split()))


# Hyper-parameter grid for the classifier step: both penalties, three Cs.
param_grid = [{'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

# The vectorizer's tokenizer stems every token, so the stop-word list has to
# be stemmed as well; otherwise inflected stop words never match the stemmed
# tokens and slip through the filter.
stemmed_stop = sorted({stemmer.stem(word) for word in stop})

# Bigram counts over preprocessed, stemmed tokens feeding a logistic
# regression. The default 'lbfgs' solver does not support the 'l1' penalty;
# 'saga' handles both 'l1' and 'l2', and max_iter is raised because it often
# needs more than the default 100 iterations to converge.
lr_tfidf = Pipeline([
    ('vect', CountVectorizer(preprocessor=preprocessor,
                             tokenizer=tokenizer_snowball,
                             ngram_range=(2, 2),
                             stop_words=stemmed_stop)),
    ('clf', LogisticRegression(random_state=0, solver='saga', max_iter=1000)),
])

# 5-fold cross-validated grid search scored by accuracy, using all cores.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [None]:
# Reload the training texts, one per line, without surrounding whitespace.
with open("files/texts_train.txt") as texts_file:
    texts_train = [line.strip() for line in texts_file]

# Reload the matching scores as strings.
with open("files/scores_train.txt") as scores_file:
    y_train = [line.strip() for line in scores_file]

# Run the grid search on the raw texts; the pipeline vectorizes them itself.
gs_lr_tfidf.fit(texts_train, y_train)

In [None]:
# Take the best pipeline selected by the grid search.
clf = gs_lr_tfidf.best_estimator_

# Read the test texts, one per line, without surrounding whitespace.
with open("files/dataset_40757_1.txt") as test_file:
    texts_test = [line.strip() for line in test_file]
preds = clf.predict(texts_test)

# One predicted label per line, in the order of the test file.
for pred in preds:
    print(pred)

# Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import RussianStemmer
from sklearn.model_selection import GridSearchCV
import re
import nltk
import numpy as np
from nltk.corpus import stopwords

# Make the stop-word corpus available, then load the Russian list.
nltk.download('stopwords')
stop = stopwords.words('russian')

lr = LogisticRegression(random_state=1)

# Training texts: one per line, surrounding whitespace removed.
with open("files/texts_train.txt") as texts_file:
    texts_train = [line.strip() for line in texts_file]

# Training scores: parsed from their string form into a float32 array.
with open("files/scores_train.txt") as scores_file:
    y_train = np.asarray([line.strip() for line in scores_file]).astype(np.float32)

# Test texts: same line-based format as the training texts.
with open("files/dataset_40757_1.txt") as test_file:
    texts_test = [line.strip() for line in test_file]


def preprocessor(text):
    """Normalize *text* for vectorization while keeping its emoticons.

    The text is lower-cased and every run of non-word characters is replaced
    by a single space. Emoticons are collected from the original
    (case-preserved) text and appended after the cleaned text, separated by
    spaces, with any '-' nose removed. The leading "eyes" part of the
    emoticon pattern is optional, so a bare ')', '(', 'D' or 'P' also counts
    as a match.

    Args:
        text: Raw input string.

    Returns:
        The cleaned text, one space, then the space-joined emoticons (an
        empty suffix when none were found).
    """
    # Raw string literals keep the regex escapes away from Python's own
    # string-escape handling, which warns about sequences such as "\)" or
    # "\W" in ordinary literals.
    emoticons = re.findall(r'(?::?|;|=)(?:-)?(?:\)+|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    return cleaned + ' ' + ' '.join(emoticons).replace('-', '')


# Shared Snowball stemmer for Russian; used by `tokenizer_snowball` below.
stemmer = RussianStemmer()


def tokenizer(text):
    """Split *text* on whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens


def tokenizer_snowball(text):
    """Split *text* on whitespace and stem every token.

    Stemming is delegated to the module-level `stemmer` object.
    """
    return list(map(stemmer.stem, text.split()))


# Word 1- to 3-gram counts (after `preprocessor`) -> TF-IDF weighting ->
# SGD-trained regressor with a fixed seed.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(preprocessor=preprocessor, ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDRegressor(random_state=0)),
])


In [None]:
# Lower-case the training texts up front (the pipeline's preprocessor
# lower-cases the text as well), then fit the whole pipeline on them.
texts_train = list(map(str.lower, texts_train))
learner = pipeline.fit(texts_train, y_train)


In [None]:
# Read the test texts, one per line, without surrounding whitespace.
with open("files/dataset_40757_1.txt") as test_file:
    texts_test = [line.strip() for line in test_file]
preds = learner.predict(texts_test)
# The regressor outputs continuous values; print each one after round().
for rounded in map(round, preds):
    print(int(rounded))

# fastText

In [None]:
import fasttext

# Training settings forwarded to fastText as keyword arguments.
hyper_params = dict(
    lr=0.1,        # Learning rate
    epoch=100,     # Number of training epochs to train for
    wordNgrams=2,  # Number of word n-grams to consider during training
    minn=2,
    maxn=5,
)

# Train a supervised fastText model on the prepared training file.
model = fasttext.train_supervised(input="files/fileresults.txt", **hyper_params)

In [None]:
# Read the test texts, one per line, without surrounding whitespace.
with open("files/dataset_40757_1.txt") as test_file:
    texts_test = [line.strip() for line in test_file]
# Print the raw fastText prediction for each text, in file order.
for prediction in map(model.predict, texts_test):
    print(prediction)

# Dostoevsky

In [None]:
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel
import numpy as np

# Pre-trained Dostoevsky sentiment model with its regex tokenizer.
tokenizer = RegexTokenizer()

model = FastTextSocialNetworkModel(tokenizer=tokenizer)

# Training texts: one per line, surrounding whitespace removed.
with open("files/texts_train.txt") as f:
    texts_train = [line.strip() for line in f]

# Training scores: parsed from their string form into a float32 array.
with open("files/scores_train.txt") as f:
    y_train = np.asarray([line.strip() for line in f]).astype(np.float32)

# Test texts: same line-based format as the training texts.
with open("files/dataset_40757_1.txt") as f:
    texts_test = [line.strip() for line in f]

# Score every training text, keeping the two most likely sentiments (k=2).
results = model.predict(texts_train, k=2)
labels = ['label', 'negative', 'neutral', 'positive']
with open("files/dosto.txt", "w") as f:
    f.write("label\tnegative\tneutral\tpositive\n")
    for score, sentiment in zip(y_train, results):
        # First column is the true score; the rest are sentiment
        # probabilities, 0.0 when a sentiment is not among the returned ones.
        fields = [f"{score}"]
        for name in labels[1:]:
            fields.append(f"{sentiment[name]}" if name in sentiment else "0.0")
        # Join with tabs so rows have exactly as many fields as the header;
        # a trailing tab would add an empty extra column to every row.
        f.write("\t".join(fields) + "\n")

In [None]:
# Score every test text, keeping the two most likely sentiments (k=2).
results = model.predict(texts_test, k=2)
labels = ['negative', 'neutral', 'positive']
with open("files/dosto_test.txt", "w") as f:
    f.write("negative\tneutral\tpositive\n")
    for sentiment in results:
        # Sentiment probabilities in header order, 0.0 when a sentiment is
        # not among the returned ones.
        fields = [f"{sentiment[name]}" if name in sentiment else "0.0"
                  for name in labels]
        # Join with tabs so rows have exactly as many fields as the header;
        # a trailing tab would add an empty extra column to every row.
        f.write("\t".join(fields) + "\n")

In [None]:
import pandas as pd
# The feature tables are written by this notebook as tab-separated
# "files/dosto.txt" and "files/dosto_test.txt", so read those files with a
# tab separator. `index_col=False` stops pandas from promoting the first
# column to the index when rows end with a trailing delimiter.
dosto = pd.read_csv("files/dosto.txt", sep="\t", index_col=False)
dosto_test = pd.read_csv("files/dosto_test.txt", sep="\t", index_col=False)

# Target is the 'label' column; every other column is a feature.
y_train = np.asarray(dosto['label'])
x_train = np.asarray(dosto.loc[:, dosto.columns != 'label'])

In [None]:
from sklearn.linear_model import SGDRegressor

# Fit an SGD-trained linear regressor (default settings) on the sentiment
# features; `fit` returns the fitted estimator itself.
regressor = SGDRegressor().fit(x_train, y_train)

In [None]:
# Predict on the test feature table and print each value after round().
res = regressor.predict(np.asarray(dosto_test))
for rounded in map(round, res):
    print(int(rounded))

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

# Fit a multi-layer perceptron regressor (default settings) on the same
# features; `fit` returns the fitted estimator itself.
regressor = MLPRegressor().fit(x_train, y_train)
res = regressor.predict(np.asarray(dosto_test))
# Print each prediction after round().
for rounded in map(round, res):
    print(int(rounded))