In [53]:
import os

import numpy as np
import psycopg2
from psycopg2.sql import SQL, Identifier

# Database connection settings. Each one can be overridden through an environment
# variable so credentials do not have to be edited into the notebook; the fallbacks
# are the values this notebook was originally written with.
psql_db_name = os.environ.get('VK_PSQL_DB_NAME', 'text_classification_db')
# Per-owner tables are addressed as '<prefix>_<owner_id>'.
psql_table_prefix = 'vk_wall_posts'
psql_user = os.environ.get('VK_PSQL_USER', 'postgres')
psql_pass = os.environ.get('VK_PSQL_PASS', '1')
# NOTE(review): psql_host is defined here but get_data_from_db does not pass it to
# psycopg2.connect -- confirm whether the connection is meant to use it.
psql_host = os.environ.get('VK_PSQL_HOST', '/var/run/postgresql/')
# Row budget per group; it is divided evenly between the tables of that group.
max_data_size = 4000

def get_data_from_db(groups):
    """Load post texts from the per-owner tables and label them by group.

    Args:
        groups: sequence of groups, each group being a sized collection of owner
            ids. The position of a group in ``groups`` is used as the class label
            for every row read from that group's tables.

    Returns:
        A ``(data, target)`` tuple of two parallel lists: ``data`` holds the first
        column (``text``) of every fetched row and ``target`` the integer label
        (group index) of the corresponding row.

    Notes:
        Each group is given a budget of ``max_data_size`` rows, split evenly
        between its tables via ``LIMIT``. Empty groups contribute no rows but
        still occupy their label index. An empty ``groups`` returns two empty
        lists without touching the database.
    """
    data = []
    target = []
    if not groups:
        # Nothing to load; avoid opening a connection at all.
        return data, target

    # One connection for the whole load, always released, even if a query fails.
    conn = psycopg2.connect(dbname=psql_db_name, user=psql_user, password=psql_pass)
    try:
        with conn.cursor() as cur:
            for index, group in enumerate(groups):
                if not group:
                    # An empty group would otherwise divide by zero below.
                    continue
                limit = max_data_size // len(group)
                for owner_id in group:
                    table_name = psql_table_prefix + '_' + str(owner_id)
                    # The table name is quoted through Identifier and the limit is
                    # sent as a bound parameter rather than formatted into the SQL.
                    cur.execute(
                        SQL("SELECT text FROM {} LIMIT %s;").format(Identifier(table_name)),
                        (limit, ),
                    )
                    rows = cur.fetchall()
                    data.extend(row[0] for row in rows)
                    target.extend([index] * len(rows))
    finally:
        conn.close()
    return data, target


In [67]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

# Class names; position i names the label of the i-th owner-id group loaded below.
categories = ['travel', 'animals', 'sport', ]
owner_id_groups = [
    [51045049, 47951388, 55045888],
    [115357087, 71785575, 53388683],
    [48303580, 121344058, 32894860, 126967384, 160506183],
]
data, target = get_data_from_db(owner_id_groups)

# Show how many posts were loaded for each class label.
for label in range(len(categories)):
    print(target.count(label))

# Hold out 33% of the posts for evaluation; the fixed seed keeps the split repeatable.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.33, random_state=42)

# Tokens are runs of 3+ Cyrillic/Latin letters, or '#'-prefixed hashtags of 3+ letters.
token_pattern = r"(?u)\b[а-яА-Яa-zA-Z]{3,}|\B#[а-яА-Яa-zA-Z]{3,}\b"
pipeline_steps = [
    ('vect', CountVectorizer(token_pattern=token_pattern)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge',
                          penalty='l2',
                          alpha=1e-3,
                          random_state=42,
                          max_iter=100,
                          tol=1e-3)),
]
text_clf = Pipeline(pipeline_steps)
text_clf.fit(data_train, target_train)

# Accuracy = fraction of predictions equal to the true label, on both splits.
predictTrain = text_clf.predict(data_train)
predictTest = text_clf.predict(data_test)
train_accuracy = np.mean(predictTrain == target_train)
test_accuracy = np.mean(predictTest == target_test)
print("train accuracy:", train_accuracy)
print("test accuracy:", test_accuracy)

# A few hand-written posts as a sanity check of the fitted pipeline.
docs_new = [
    'Кот',
    'Бразилия страна',
    'Леопард',
    'самая счастливая собака в мире',
    'Знаешь, Мэри, в моей голове звери.',
    'Лев царь зверей.',
    "Футболист сборной России",
    "Мы недавно приехали с Бали.",
    "Джус любит, когда ему чешут пузико",
]
predicted_test_sample = text_clf.predict(docs_new)

# Per-class decision scores are printed instead of probabilities: SGDClassifier
# with the hinge loss does not offer predict_proba.
dec_func = text_clf.decision_function(docs_new)
print(dec_func)

for position, doc in enumerate(docs_new):
    print('%r => %s' % (doc, categories[predicted_test_sample[position]]))

3431
3107
3280
train accuracy: 0.9341745211310428
test accuracy: 0.8697530864197531
[[-1.10497962  0.24718458 -1.10593877]
 [-0.25837117 -1.10932048 -1.05108232]
 [-1.04908925 -0.25528403 -0.99516233]
 [-0.80410168  0.10395546 -1.34789398]
 [-0.78199122 -0.96159909 -0.81840848]
 [-0.99274445 -0.56527986 -1.01347254]
 [-1.27784829 -1.40149069  1.33657665]
 [-0.37291326 -0.90345218 -1.04289881]
 [-1.09410339 -0.24471993 -0.81728691]]
'Кот' => animals
'Бразилия страна' => travel
'Леопард' => animals
'самая счастливая собака в мире' => animals
'Знаешь, Мэри, в моей голове звери.' => travel
'Лев царь зверей.' => animals
'Футболист сборной России' => sport
'Мы недавно приехали с Бали.' => travel
'Джус любит, когда ему чешут пузико' => animals


In [66]:
from sklearn import metrics
# Per-class precision / recall / F1 on the held-out split, labelled with the
# names in `categories`.
report = metrics.classification_report(
    target_test, predictTest, target_names=categories)
print(report)

# Confusion matrix: rows are true classes, columns are predicted classes.
confusion = metrics.confusion_matrix(target_test, predictTest)
print(confusion)

              precision    recall  f1-score   support

      travel       0.93      0.72      0.81      1142
     animals       0.91      0.84      0.87      1031
       sport       0.93      0.66      0.77      1092
        food       0.65      0.97      0.78      1295

   micro avg       0.81      0.81      0.81      4560
   macro avg       0.85      0.80      0.81      4560
weighted avg       0.84      0.81      0.81      4560

[[ 827   55   38  222]
 [  47  871   10  103]
 [   7   20  719  346]
 [  11   15    7 1262]]


In [56]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid; keys follow the Pipeline's '<step name>__<parameter>' form.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and removed
# it in 0.24, where passing it raises TypeError. From 0.22 on the default matches
# the former iid=False (plain mean of the fold scores). On releases older than
# 0.22 the default weights folds by size instead, which may shift best_score_
# marginally.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(data_train, target_train)

# Best mean cross-validated score, then the winning value of every tuned parameter.
print(gs_clf.best_score_)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.7990164738632656
clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 2)
