In [1]:
import psycopg2
import numpy as np
from psycopg2.sql import SQL, Identifier

# --- Database / loading configuration --------------------------------------
# Name of the PostgreSQL database that get_data_from_db connects to.
psql_db_name = "text_classification_db"
# Tables are addressed as "<prefix>_<owner_id>" (see get_data_from_db).
psql_table_prefix = "vk_wall_posts"
psql_user = "zhenek"
# NOTE(review): defined but not passed to psycopg2.connect in get_data_from_db;
# confirm whether it is still needed. Avoid committing real credentials.
psql_pass = "1"
# A filesystem path rather than a hostname -- presumably the directory of the
# server's Unix-domain socket; TODO confirm for the target machine.
psql_host = "/var/run/postgresql/"
# Row budget per group: get_data_from_db divides it by the number of tables
# in the group to obtain the per-table LIMIT.
max_data_size = 4000

def get_data_from_db(groups):
    """Load post texts from per-owner tables and label them by group index.

    Parameters
    ----------
    groups : sequence of sequences
        Each inner sequence holds the owner ids that belong to one class.
        The position of the inner sequence in ``groups`` becomes the label.

    Returns
    -------
    (data, target) : tuple of two lists
        ``data`` holds the first column (``text``) of every fetched row and
        ``target`` holds, at the same position, the index of the group the
        row came from.

    Notes
    -----
    For every group at most ``max_data_size // len(group)`` rows are read
    from each of its tables, so each group contributes at most
    ``max_data_size`` rows. Empty groups contribute nothing and are skipped
    (dividing by their length would otherwise raise ``ZeroDivisionError``).
    """
    data = []
    target = []

    # Keep the original position of each group as its label, but drop empty
    # groups up front so no connection is opened when there is nothing to read.
    labelled_groups = [
        (index, group) for index, group in enumerate(groups) if len(group) > 0
    ]
    if not labelled_groups:
        return data, target

    # One connection for the whole load; previously a new connection was
    # opened per group and only the last one was ever closed.
    conn = psycopg2.connect(dbname=psql_db_name, user=psql_user, host=psql_host)
    try:
        cur = conn.cursor()
        try:
            for index, group in labelled_groups:
                limit = max_data_size // len(group)
                for owner_id in group:
                    table_name = psql_table_prefix + '_' + str(owner_id)
                    # Identifier() quotes the table name; the limit is bound
                    # as a regular query parameter.
                    cur.execute(
                        SQL("SELECT text FROM {} LIMIT %s;").format(Identifier(table_name)),
                        (limit, ),
                    )
                    rows = cur.fetchall()
                    data.extend(row[0] for row in rows)
                    target.extend([index] * len(rows))
        finally:
            cur.close()
    finally:
        # Always release the connection, even if a query fails midway.
        conn.close()
    return data, target


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

# Class names; the position of a name is the integer label used in `target`.
categories = ['travel', 'animals', 'sport', 'style']

# One list of owner ids per category, in the same order as `categories`.
owner_id_groups = [
    [51045049, 47951388, 55045888],
    [115357087, 71785575, 53388683],
    [48303580, 121344058, 32894860, 126967384, 160506183],
    [24396213, 43460592],
]
data, target = get_data_from_db(owner_id_groups)

# Show how many samples were loaded for each class.
for label, name in enumerate(categories):
    print('{} count: {}'.format(name, target.count(label)))

data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    test_size=0.33,
    random_state=42,
)

# Tokens are runs of 3+ Cyrillic/Latin letters, or a '#' followed by such a run.
# NOTE(review): the ranges а-я / А-Я do not include ё / Ё, so words containing
# those letters are split at them -- confirm whether that is intended.
token_pattern = r"(?u)\b[а-яА-Яa-zA-Z]{3,}|\B#[а-яА-Яa-zA-Z]{3,}\b"
vectorizer = CountVectorizer(token_pattern=token_pattern)
classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=100,
    tol=1e-3,
)
text_clf = Pipeline([
    ('vect', vectorizer),
    ('tfidf', TfidfTransformer()),
    ('clf', classifier),
])
text_clf.fit(data_train, target_train)

# Accuracy on the data the model was fitted on versus the held-out split.
predictTrain = text_clf.predict(data_train)
predictTest = text_clf.predict(data_test)
print("train accuracy:", np.mean(predictTrain == target_train))
print("test accuracy:", np.mean(predictTest == target_test))

# A few hand-written sentences as a quick sanity check of the fitted pipeline.
docs_new = [
    'Кот',
    'Бразилия страна',
    'Леопард',
    'самая счастливая собака в мире',
    'Лев царь зверей.',
    "Футболист сборной России",
    "Мы недавно приехали с Бали.",
    "Купил себе модный свитер",
]
predicted_test_sample = text_clf.predict(docs_new)

for doc, category in zip(docs_new, predicted_test_sample):
    print('{!r} => {}'.format(doc, categories[category]))

travel count: 3432
animals count: 3107
sport count: 3280
style count: 2824
train accuracy: 0.9415584415584416
test accuracy: 0.8547807332854062
'Кот' => animals
'Бразилия страна' => travel
'Леопард' => animals
'самая счастливая собака в мире' => animals
'Лев царь зверей.' => animals
'Футболист сборной России' => sport
'Мы недавно приехали с Бали.' => travel
'Купил себе модный свитер' => style


In [12]:
from sklearn import metrics
# Per-class precision / recall / F1 on the held-out split.
report = metrics.classification_report(
    target_test,
    predictTest,
    target_names=categories,
)
print(report)

# Confusion matrix for the same predictions; in the recorded output each row
# sums to the corresponding class's support, i.e. rows are the true classes.
confusion = metrics.confusion_matrix(target_test, predictTest)
print(confusion)

              precision    recall  f1-score   support

      travel       0.88      0.76      0.82      1128
     animals       0.91      0.82      0.86      1049
       sport       0.75      0.94      0.83      1059
       style       0.93      0.91      0.92       937

   micro avg       0.85      0.85      0.85      4173
   macro avg       0.87      0.86      0.86      4173
weighted avg       0.86      0.85      0.86      4173

[[860  50 202  16]
 [ 79 860  89  21]
 [ 17  19 998  25]
 [ 23  15  50 849]]


In [13]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid; keys use the "<pipeline step>__<parameter>" form so
# they address the steps of `text_clf`.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# NOTE(review): `iid` was deprecated and later removed from GridSearchCV in
# newer scikit-learn releases -- confirm the installed version before re-running.
gs_clf = GridSearchCV(
    text_clf,
    parameters,
    cv=5,
    iid=False,
    n_jobs=-1,
)
gs_clf.fit(data_train, target_train)

# Best cross-validated score, then the winning value of every tuned parameter.
print(gs_clf.best_score_)
for param_name in sorted(parameters):
    print('{}: {!r}'.format(param_name, gs_clf.best_params_[param_name]))

0.8639934557298302
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)
