In [1]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

from nltk import SnowballStemmer, word_tokenize
from nltk.corpus import stopwords
from string import punctuation

import json
import pickle

C:\Users\artemii\anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\Users\artemii\anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll


In [2]:
# Build one combined lookup set from NLTK's Russian and English stopword lists.
russian_stopwords = set(stopwords.words('russian'))
english_stopwords = set(stopwords.words('english'))
all_stopwords = russian_stopwords | english_stopwords

In [3]:
# One Snowball stemmer per language. ignore_stopwords=True asks NLTK to return
# stopwords unchanged instead of stemming them (per the NLTK SnowballStemmer docs).
ru_stemmer = SnowballStemmer('russian', ignore_stopwords=True)
eng_stemmer = SnowballStemmer('english', ignore_stopwords=True)

In [4]:
def preprocess_text(text):
    """Turn a raw review into a list of stemmed, lower-cased tokens.

    The text is tokenized with NLTK's ``word_tokenize`` (Russian rules). Tokens
    whose lower-cased form is in ``all_stopwords`` and tokens of length 1 are
    dropped. Each remaining token is lower-cased, run through the English
    stemmer and then through the Russian stemmer.

    Args:
        text: The review text to process.

    Returns:
        list: Stemmed tokens in their original order.
    """
    stems = []
    for token in word_tokenize(text, language='russian'):
        lowered = token.lower()
        # Skip stopwords (matched case-insensitively) and one-character tokens.
        if lowered in all_stopwords or len(token) <= 1:
            continue
        # English stemming is applied first; the Russian stemmer sees its output.
        stems.append(ru_stemmer.stem(eng_stemmer.stem(lowered)))
    return stems

In [5]:
# Load the review dataset; the file is tab-separated despite its .csv extension.
data = pd.read_csv("./women-clothing-accessories.3-class.balanced.csv", sep='\t')

In [6]:
# Keep only the two polar classes for binary classification.
# NOTE(review): 'neautral' looks like a typo, but it presumably matches the label
# spelling used in the CSV -- the value_counts output further down shows only
# positive/negative remaining. Do not "correct" it without checking the data.
data = data[data.sentiment != 'neautral']

In [7]:
# Preview the first rows to sanity-check the parsed columns.
data.head()

Unnamed: 0,review,sentiment
0,качество плохое пошив ужасный (горловина напер...,negative
1,"Товар отдали другому человеку, я не получила п...",negative
2,"Ужасная синтетика! Тонкая, ничего общего с пре...",negative
3,"товар не пришел, продавец продлил защиту без м...",negative
4,"Кофточка голая синтетика, носить не возможно.",negative


In [8]:
# Binary target: 0 for 'negative' reviews, 1 for every other label.
# A vectorized comparison replaces the row-by-row apply(lambda ...); the result
# is the same int64 0/1 column.
data['y'] = (data['sentiment'] != 'negative').astype('int64')

In [9]:
# Check the class balance after filtering.
data.sentiment.value_counts()

positive    30000
negative    30000
Name: sentiment, dtype: int64

In [10]:
# Hold out 10% of the reviews for validation, stratified on the label so both
# parts keep the same class ratio. The frame is split directly, so `train` and
# `val` are DataFrames from the start instead of first holding positional indices.
# A fixed random_state makes the split -- and every score computed from it --
# reproducible across kernel restarts.
train, val = train_test_split(
    data,
    test_size=0.1,
    stratify=data.y.values,
    random_state=42,
)

In [11]:
# preprocess_text is passed as the analyzer, so the vectorizer works on the
# stemmed token lists it returns. min_df=4 drops terms found in fewer than
# 4 reviews; max_df=0.05 drops terms found in more than 5% of reviews.
tf_idf = TfidfVectorizer(analyzer=preprocess_text, min_df=4, max_df=0.05)

In [12]:
# Learn the vocabulary and IDF weights from the training reviews only.
X_train = tf_idf.fit_transform(train.review.values)

In [13]:
# Vectorize the validation reviews with the vocabulary fitted on the training set.
X_val = tf_idf.transform(val.review.values)

### Linear Regression

In [14]:
# Least-squares baseline: it outputs a continuous score that is thresholded
# further down to get a class. n_jobs=-1 lets scikit-learn use all CPU cores.
linear = LinearRegression(n_jobs=-1)

In [15]:
# Regress the 0/1 label on the TF-IDF features.
linear.fit(X_train, train.y.values)

LinearRegression(n_jobs=-1)

In [16]:
# Continuous scores for the validation reviews (one value per review).
pred = linear.predict(X_val)

In [17]:
# Sweep 200 evenly spaced cut-offs between the smallest and largest validation
# score and record the F1 obtained when binarizing the predictions at each one.
# NOTE: the cut-off is tuned on the same validation set it is scored on, so the
# best F1 found here is an optimistic estimate.
scores = [
    (thr, f1_score(val.y.values, (pred >= thr).astype(np.float32)))
    for thr in np.linspace(pred.min(), pred.max(), num=200)
]

In [18]:
# Despite its name, best_thr is the whole (threshold, F1) pair with the highest F1.
best_thr = max(scores, key=lambda x: x[1])
print(best_thr)
# Keep only the threshold itself for inference and for saving.
thr = best_thr[0]

(0.5002728138727601, 0.8638529611980938)


In [19]:
def infer_linear(text, mdl, tf_idf, thr):
    """Classify a single review with a regression-style model and a cut-off.

    Args:
        text: Review text to classify.
        mdl: Fitted model exposing ``predict``.
        tf_idf: Fitted vectorizer exposing ``transform``.
        thr: Decision threshold applied to the predicted score.

    Returns:
        str: "Положительный" when the score is >= ``thr``, otherwise
        "Отрицательный".
    """
    features = tf_idf.transform([text])
    # A single document is scored, so the comparison yields a one-element result.
    is_positive = mdl.predict(features) >= thr
    return "Положительный" if is_positive else "Отрицательный"

In [20]:
def dump_preprocess(dir_path, tf_idf):
    """Pickle the fitted vectorizer to ``tf_idf.pkl`` inside ``dir_path``.

    Args:
        dir_path: Target directory, as a ``str`` or any ``os.PathLike``.
        tf_idf: Object to serialize (the fitted vectorizer).
    """
    import os
    import pickle

    # os.path.join accepts PathLike values and avoids doubled separators; an
    # empty dir_path means the current directory rather than the filesystem root.
    with open(os.path.join(dir_path, "tf_idf.pkl"), 'wb') as f:
        pickle.dump(tf_idf, f)

In [21]:
def dump_linear(dir_path, linear, thr):
    """Save the linear model and its decision threshold into ``dir_path``.

    Writes two files: ``linear.pkl`` (the pickled model) and ``linear.config``
    (JSON of the form ``{"thr": <float>}``).

    Args:
        dir_path: Target directory, as a ``str`` or any ``os.PathLike``.
        linear: Model object to pickle.
        thr: Decision threshold; any value convertible with ``float()``.
    """
    import json
    import os
    import pickle

    with open(os.path.join(dir_path, "linear.pkl"), 'wb') as f:
        pickle.dump(linear, f)

    with open(os.path.join(dir_path, "linear.config"), "w") as f:
        # float() makes NumPy scalars such as float32 JSON-serializable.
        json.dump({"thr": float(thr)}, f)

In [22]:
def load_preprocess(dir_path):
    """Load the pickled vectorizer from ``tf_idf.pkl`` inside ``dir_path``.

    Args:
        dir_path: Directory containing ``tf_idf.pkl`` (``str`` or ``os.PathLike``).

    Returns:
        The unpickled object.
    """
    import os
    import pickle

    # NOTE: unpickling can execute arbitrary code; only load files you created
    # yourself or otherwise trust.
    with open(os.path.join(dir_path, "tf_idf.pkl"), 'rb') as f:
        return pickle.load(f)

In [23]:
def load_linear(dir_path):
    """Load the linear model and its decision threshold from ``dir_path``.

    Reads ``linear.pkl`` (pickled model) and ``linear.config`` (JSON with a
    ``"thr"`` key).

    Args:
        dir_path: Directory containing both files (``str`` or ``os.PathLike``).

    Returns:
        tuple: ``(model, threshold)``.
    """
    import json
    import os
    import pickle

    # NOTE: unpickling can execute arbitrary code; only load files you created
    # yourself or otherwise trust.
    with open(os.path.join(dir_path, "linear.pkl"), 'rb') as f:
        mdl = pickle.load(f)
    with open(os.path.join(dir_path, "linear.config")) as f:
        cfg = json.load(f)
    return mdl, cfg['thr']

In [24]:
# Persist the linear model and its tuned threshold to the working directory.
dump_linear("./", linear, thr)

In [25]:
# Persist the fitted vectorizer next to the model so both can be reloaded together.
dump_preprocess("./", tf_idf)

### Logistic Regression

In [26]:
# Logistic regression with scikit-learn's default settings; only n_jobs is set.
logistic = LogisticRegression(n_jobs=-1)

In [27]:
# Fit on the same TF-IDF training features and 0/1 labels as the linear model.
logistic.fit(X_train, train.y.values)

LogisticRegression(n_jobs=-1)

In [28]:
# Class probabilities for the validation reviews; the cells below use column 1.
# This rebinds `pred`, which previously held the linear model's scores.
pred = logistic.predict_proba(X_val)

In [29]:
# Sweep 200 evenly spaced cut-offs across the range of column-1 probabilities
# and record the F1 obtained when binarizing at each one.
# NOTE: the cut-off is tuned on the same validation set it is scored on, so the
# best F1 found here is an optimistic estimate.
scores = [
    (thr, f1_score(val.y.values, (pred[:, 1] >= thr).astype(np.float32)))
    for thr in np.linspace(pred[:, 1].min(), pred[:, 1].max(), num=200)
]

In [30]:
# Despite its name, best_thr is the whole (threshold, F1) pair with the highest F1.
best_thr = max(scores, key=lambda x: x[1])
print(best_thr)
# Keep only the threshold; this overwrites the `thr` chosen for the linear model.
thr = best_thr[0]

(0.5025742992013379, 0.8782578875171468)


In [31]:
def infer_logistic(text, mdl, tf_idf, thr):
    """Classify a single review from a probabilistic model's column-1 output.

    Args:
        text: Review text to classify.
        mdl: Fitted model exposing ``predict_proba``.
        tf_idf: Fitted vectorizer exposing ``transform``.
        thr: Decision threshold applied to column 1 of ``predict_proba``.

    Returns:
        str: "Положительный" when the column-1 probability is >= ``thr``,
        otherwise "Отрицательный".
    """
    features = tf_idf.transform([text])
    positive_proba = mdl.predict_proba(features)[:, 1]
    # A single document is scored, so the comparison yields a one-element result.
    is_positive = positive_proba >= thr
    return "Положительный" if is_positive else "Отрицательный"

In [32]:
def dump_logistic(dir_path, mdl, thr):
    """Save the logistic model and its decision threshold into ``dir_path``.

    Writes two files: ``logistic.pkl`` (the pickled model) and
    ``logistic.config`` (JSON of the form ``{"thr": <float>}``).

    Args:
        dir_path: Target directory, as a ``str`` or any ``os.PathLike``.
        mdl: Model object to pickle.
        thr: Decision threshold; any value convertible with ``float()``.
    """
    # All modules used here are imported locally, so the function does not
    # depend on a module-level `json` import being present.
    import json
    import os
    import pickle

    with open(os.path.join(dir_path, "logistic.pkl"), 'wb') as f:
        pickle.dump(mdl, f)

    with open(os.path.join(dir_path, "logistic.config"), "w") as f:
        # float() makes NumPy scalars such as float32 JSON-serializable.
        json.dump({"thr": float(thr)}, f)

In [33]:
# Persist the logistic model and its tuned threshold to the working directory.
dump_logistic("./", logistic, thr)

In [34]:
def load_logistic(dir_path):
    """Load the logistic model and its decision threshold from ``dir_path``.

    Reads ``logistic.pkl`` (pickled model) and ``logistic.config`` (JSON with a
    ``"thr"`` key).

    Args:
        dir_path: Directory containing both files (``str`` or ``os.PathLike``).

    Returns:
        tuple: ``(model, threshold)``.
    """
    import json
    import os
    import pickle

    # NOTE: unpickling can execute arbitrary code; only load files you created
    # yourself or otherwise trust.
    with open(os.path.join(dir_path, "logistic.pkl"), 'rb') as f:
        mdl = pickle.load(f)

    with open(os.path.join(dir_path, "logistic.config")) as f:
        thr = json.load(f)['thr']

    return mdl, thr

In [35]:
# Smoke-test the inference helper on an arbitrary sentence using the in-memory model.
infer_logistic("Мама мыла раму!", logistic, tf_idf, thr)

'Положительный'

### Deployment

In [36]:
import gradio as gr

In [37]:
from functools import partial

In [38]:
from nltk import SnowballStemmer, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import numpy as np

# Recreate the stopword set and stemmers that preprocess_text relies on, so the
# deployment section carries its own copy of the text-processing resources.
russian_stopwords = set(stopwords.words('russian'))
english_stopwords = set(stopwords.words('english'))
all_stopwords = russian_stopwords | english_stopwords

# ignore_stopwords=True asks NLTK to return stopwords unchanged instead of stemming them.
ru_stemmer = SnowballStemmer('russian', ignore_stopwords=True)
eng_stemmer = SnowballStemmer('english', ignore_stopwords=True)

def preprocess_text(text):
    """Turn a raw review into a list of stemmed, lower-cased tokens.

    This repeats the definition used when the vectorizer was fitted. The
    vectorizer was built with ``analyzer=preprocess_text`` and pickle stores
    functions by reference, so a function with this name must exist before the
    vectorizer is unpickled, and it must behave the same as at training time.

    Args:
        text: The review text to process.

    Returns:
        list: Stemmed tokens in their original order.
    """
    stems = []
    for token in word_tokenize(text, language='russian'):
        lowered = token.lower()
        # Skip stopwords (matched case-insensitively) and one-character tokens.
        if lowered in all_stopwords or len(token) <= 1:
            continue
        # English stemming is applied first; the Russian stemmer sees its output.
        stems.append(ru_stemmer.stem(eng_stemmer.stem(lowered)))
    return stems

In [39]:
# Reload the fitted vectorizer from disk, replacing the in-memory one.
tf_idf = load_preprocess("./")

### Linear

In [40]:
# Reload the linear model together with the threshold saved in linear.config.
linear, thr_lin = load_linear("./")

In [41]:
# Text-in/text-out Gradio UI for the linear model. partial() pre-binds the
# model, vectorizer and threshold, so Gradio only has to supply the text.
demo = gr.Interface(fn=partial(infer_linear, mdl=linear, tf_idf=tf_idf, thr=thr_lin), inputs="text", outputs="text")

demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




### Logistic

In [42]:
# Reload the logistic model together with the threshold saved in logistic.config.
logistic, thr_log = load_logistic("./")

In [43]:
# Text-in/text-out Gradio UI for the logistic model. This rebinds `demo`; in the
# recorded output it came up on a different port from the linear demo.
demo = gr.Interface(fn=partial(infer_logistic, mdl=logistic, tf_idf=tf_idf, thr=thr_log), inputs="text", outputs="text")

demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


