In [15]:
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier, LSHForest
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC

from sklearn.ensemble import BaggingClassifier
from scipy.sparse import csr_matrix
from scipy.sparse import hstack

In [3]:
def load_data():
    """Read the training names and the category lookup and join them.

    Both inputs are gzip-compressed CSV files under ``data/``. The training
    table is indexed by its ``id`` column and the category table by its
    ``GROUP_ID`` column.

    Returns:
        pandas.DataFrame: the training rows with the category table's columns
        appended, matched through the training table's ``GROUP_ID`` column.
    """
    gzip_csv = {'compression': 'gzip'}
    names = pd.read_csv('data/evo_train.csv.gz', index_col='id', **gzip_csv)
    group_lookup = pd.read_csv('data/categories_parsed.csv.gz', index_col='GROUP_ID', **gzip_csv)
    # The lookup is matched by its index against the GROUP_ID column of the names table.
    return names.join(group_lookup, on='GROUP_ID')

In [4]:
# Load the joined training table once and preview its first rows.
data = load_data()
data.head()

Unnamed: 0_level_0,NAME,GROUP_ID,category,sub_category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"Пиво ""Жигулевское"" 0,5 л. св.",35,Алкоголь,"Пиво, слабоалкогольные напитки"
1,СОУС ТОМАТНЫЙ БУЗДЯК 670Г ТАТАРСКИЙ /8,6,Прод,Продукты питания
2,Сигареты Esse SS Exchange,7,Непрод,Табачные изделия
3,Петрушка,6,Прод,Продукты питания
4,пиво ягерь,35,Алкоголь,"Пиво, слабоалкогольные напитки"


In [5]:
def prepare_data_baseline(data, test_size=0.3, random_state=42):
    """Split the raw item names and their categories into train and test parts.

    Args:
        data: DataFrame with a ``NAME`` column (used as the feature) and a
            ``category`` column (used as the target).
        test_size: value forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.3, the value this notebook has always used.
        random_state: seed forwarded to ``train_test_split`` so the split is
            repeatable. Defaults to 42.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    X = data['NAME']
    y = data['category']

    # The previous version also assembled a throw-away `test` DataFrame that
    # was never used or returned; it has been removed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [6]:
from sklearn.metrics import accuracy_score, log_loss, classification_report

def show_metrics(y_true, y_pred, y_pred_proba, labels=None):
    """Print accuracy, log loss and a per-class report for a set of predictions.

    Args:
        y_true: true class labels.
        y_pred: predicted class labels.
        y_pred_proba: predicted class probabilities, one column per class.
        labels: optional class labels in the column order of ``y_pred_proba``
            (for a fitted scikit-learn classifier this is its ``classes_``).
            It is forwarded to ``log_loss`` so the probability columns are
            matched to the right classes even when ``y_true`` does not
            contain every class. With the default ``None`` the labels are
            inferred from ``y_true``, exactly as before.
    """
    print("Accuracy: {}".format(accuracy_score(y_true, y_pred)))
    print("LogLoss: {}".format(log_loss(y_true, y_pred_proba, labels=labels)))
    print(classification_report(y_true, y_pred))

In [22]:
def baseline(data, prepare_data):
    """Fit and evaluate a bagged Multinomial Naive Bayes model on word counts.

    Args:
        data: DataFrame passed unchanged to ``prepare_data``.
        prepare_data: callable taking ``data`` and returning
            ``(X_train, X_test, y_train, y_test)``.

    Returns:
        tuple: ``(errors, vectorizer)`` where ``errors`` is a DataFrame of the
        misclassified test rows (columns ``predicted``, ``real``, ``NAME``)
        and ``vectorizer`` is the fitted ``CountVectorizer``.
    """
    X_train, X_test, y_train, y_test = prepare_data(data)
    vectorizer = CountVectorizer()
    # random_state pins the bootstrap sampling so the printed metrics and the
    # returned errors are the same on every run.
    bagged_nb = BaggingClassifier(MultinomialNB(), n_estimators=10, n_jobs=4, random_state=42)
    pipeline = Pipeline([
        ('count_vectorizer', vectorizer),
        ('clf_category', bagged_nb),
    ])

    pipeline.fit(X_train, y_train)
    pred = pipeline.predict(X_test)
    pred_proba = pipeline.predict_proba(X_test)
    print("Vocabulary length: {}".format(len(vectorizer.vocabulary_)))
    show_metrics(y_test, pred, pred_proba)

    # Line predictions up with the true labels and original names by test index.
    pred_df = pd.DataFrame(pred, index=X_test.index, columns=['predicted'])
    pred_df['real'] = y_test
    pred_df['NAME'] = X_test
    errors = pred_df[pred_df['real'] != pred_df['predicted']]

    return errors, vectorizer

In [23]:
# Run the bagged Naive Bayes baseline on the raw names, report how many test
# rows were misclassified and preview them.
errors_baseline, vectorizer = baseline(data, prepare_data_baseline)
print("Errors count: {}".format(errors_baseline.shape[0]))
errors_baseline.head()

Vocabulary length: 38416
Accuracy: 0.9661057405418307
LogLoss: 0.12057847973990445
                           precision    recall  f1-score   support

                 Алкоголь       0.98      0.96      0.97      3583
                   Непрод       0.96      0.99      0.98     11214
Позиция по свободной цене       0.00      0.00      0.00         4
                     Прод       0.95      0.84      0.89      1774
                      н/д       0.99      0.84      0.91       183

              avg / total       0.97      0.97      0.97     16758

Errors count: 568


  'precision', 'predicted', average, warn_for)


Unnamed: 0_level_0,predicted,real,NAME
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6561,Непрод,Алкоголь,"БИТОКСИБАЦИЛЛИН, П/Август/20гр"
48333,Непрод,Алкоголь,набор подарочный для вина
50666,Непрод,Прод,Бутерброд №44
43089,Непрод,Прод,Малена / Акконд
21696,Непрод,Алкоголь,Лавила Нанни сух.150


In [8]:
from nltk.corpus import stopwords as sw
import xgboost as xgb

def boosting(data, prepare_data):
    """Fit and evaluate an XGBoost classifier on word counts.

    Args:
        data: DataFrame passed unchanged to ``prepare_data``.
        prepare_data: callable taking ``data`` and returning
            ``(X_train, X_test, y_train, y_test)``.

    Returns:
        tuple: ``(errors, vectorizer)`` where ``errors`` holds the
        misclassified test rows (columns ``predicted``, ``real``, ``NAME``)
        and ``vectorizer`` is the fitted ``CountVectorizer``.
    """
    X_train, X_test, y_train, y_test = prepare_data(data)

    vectorizer = CountVectorizer()
    booster = xgb.XGBClassifier(max_depth=15, seed=42, n_estimators=200)
    model = Pipeline([
        ('count_vectorizer', vectorizer),
        ('clf_category', booster),
    ])
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    pred_proba = model.predict_proba(X_test)
    print("Vocabulary length: {}".format(len(vectorizer.vocabulary_)))
    show_metrics(y_test, pred, pred_proba)

    # Put predictions next to the true labels and names, keyed by test index.
    comparison = pd.DataFrame(pred, index=X_test.index, columns=['predicted'])
    comparison['real'] = y_test
    comparison['NAME'] = X_test
    is_wrong = comparison['real'] != comparison['predicted']

    return comparison[is_wrong], vectorizer

In [None]:
# Run the XGBoost pipeline on the raw names, report how many test rows were
# misclassified and preview them.
errors, vectorizer = boosting(data, prepare_data_baseline)
print("Errors count: {}".format(errors.shape[0]))
errors.head()

In [18]:
def ensemble(data, prepare_data):
    """Fit and evaluate a soft-voting ensemble on word counts.

    The ensemble combines k-nearest neighbours, Multinomial Naive Bayes and
    an XGBoost classifier inside a ``VotingClassifier`` with ``voting='soft'``.

    Args:
        data: DataFrame passed unchanged to ``prepare_data``.
        prepare_data: callable taking ``data`` and returning
            ``(X_train, X_test, y_train, y_test)``.

    Returns:
        tuple: ``(errors, vectorizer)`` where ``errors`` holds the
        misclassified test rows (columns ``predicted``, ``real``, ``NAME``)
        and ``vectorizer`` is the fitted ``CountVectorizer``.
    """
    X_train, X_test, y_train, y_test = prepare_data(data)

    vectorizer = CountVectorizer()
    members = [
        ('knn', KNeighborsClassifier()),
        ('nb', MultinomialNB()),
        ('xgb', xgb.XGBClassifier(max_depth=15, seed=42, n_estimators=200)),
    ]
    model = Pipeline([
        ('count_vectorizer', vectorizer),
        ('clf', VotingClassifier(members, voting='soft')),
    ])
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    pred_proba = model.predict_proba(X_test)
    print("Vocabulary length: {}".format(len(vectorizer.vocabulary_)))
    show_metrics(y_test, pred, pred_proba)

    # Put predictions next to the true labels and names, keyed by test index.
    comparison = pd.DataFrame(pred, index=X_test.index, columns=['predicted'])
    comparison['real'] = y_test
    comparison['NAME'] = X_test
    is_wrong = comparison['real'] != comparison['predicted']

    return comparison[is_wrong], vectorizer

In [19]:
# Run the soft-voting ensemble on the raw names, report how many test rows
# were misclassified and preview them.
errors, vectorizer = ensemble(data, prepare_data_baseline)
print("Errors count: {}".format(errors.shape[0]))
errors.head()

Vocabulary length: 38416
Accuracy: 0.9541711421410669
LogLoss: 0.1564571272273075
                           precision    recall  f1-score   support

                 Алкоголь       0.99      0.94      0.96      3583
                   Непрод       0.94      0.99      0.97     11214
Позиция по свободной цене       1.00      0.50      0.67         4
                     Прод       0.97      0.74      0.84      1774
                      н/д       0.98      0.87      0.92       183

              avg / total       0.96      0.95      0.95     16758



NameError: name 'errors' is not defined

In [158]:
from nltk.stem.snowball import RussianStemmer

def predict_with_stemmer(data, prepare_data):
    """Fit and evaluate Multinomial Naive Bayes on counts of stemmed tokens.

    Tokens come from the default ``CountVectorizer`` analyzer and are reduced
    with NLTK's ``RussianStemmer``; the vocabulary is capped at 30000 entries.

    Args:
        data: DataFrame passed unchanged to ``prepare_data``.
        prepare_data: callable taking ``data`` and returning
            ``(X_train, X_test, y_train, y_test)``.

    Returns:
        tuple: ``(errors, vectorizer)`` where ``errors`` holds the
        misclassified test rows (columns ``predicted``, ``real``, ``NAME``)
        and ``vectorizer`` is the fitted ``CountVectorizer``.
    """
    russian_stemmer = RussianStemmer()
    default_analyzer = CountVectorizer().build_analyzer()

    def stemmed_words(doc):
        # Tokenise with the stock analyzer, then stem every token.
        return [russian_stemmer.stem(token) for token in default_analyzer(doc)]

    vectorizer = CountVectorizer(analyzer=stemmed_words, max_features=30000)

    X_train, X_test, y_train, y_test = prepare_data(data)
    model = Pipeline([
        ('count_vectorizer', vectorizer),
        ('clf_category', MultinomialNB()),
    ])
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    pred_proba = model.predict_proba(X_test)
    print("Vocabulary length: {}".format(len(vectorizer.vocabulary_)))
    show_metrics(y_test, pred, pred_proba)

    # Put predictions next to the true labels and names, keyed by test index.
    comparison = pd.DataFrame(pred, index=X_test.index, columns=['predicted'])
    comparison['real'] = y_test
    comparison['NAME'] = X_test
    is_wrong = comparison['real'] != comparison['predicted']

    return comparison[is_wrong], vectorizer

In [159]:
# Run the stemmed Naive Bayes model on the raw names, report how many test
# rows were misclassified and preview them.
errors_stemmer, vectorizer = predict_with_stemmer(data, prepare_data_baseline)
print("Errors count: {}".format(errors_stemmer.shape[0]))
errors_stemmer.head()

Vocabulary length: 30000
Accuracy: 0.9644945697577276
LogLoss: 0.12459730473394144
                           precision    recall  f1-score   support

                 Алкоголь       0.97      0.96      0.97      3583
                   Непрод       0.97      0.99      0.98     11214
Позиция по свободной цене       0.00      0.00      0.00         4
                     Прод       0.94      0.85      0.89      1774
                      н/д       0.99      0.84      0.91       183

              avg / total       0.96      0.96      0.96     16758

Errors count: 595


  'precision', 'predicted', average, warn_for)


Unnamed: 0_level_0,predicted,real,NAME
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6561,Непрод,Алкоголь,"БИТОКСИБАЦИЛЛИН, П/Август/20гр"
48333,Непрод,Алкоголь,набор подарочный для вина
50666,Непрод,Прод,Бутерброд №44
43089,Непрод,Прод,Малена / Акконд
21696,Непрод,Алкоголь,Лавила Нанни сух.150


In [175]:
import string
import re

def prepare_data_numbers_fix(data):
    """Clean the item names, then split names and categories into train/test.

    Cleaning steps applied to ``data['NAME']``, in order:

    1. append a ``.`` and lower-case the text;
    2. replace every ASCII punctuation character with a space;
    3. remove all runs of digits;
    4. expand standalone unit abbreviations (and the preposition "д")
       to full words.

    Args:
        data: DataFrame with a ``NAME`` column (feature) and a ``category``
            column (target).

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` from a 70/30 split with
        ``random_state=42``.
    """
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))

    # Abbreviations are matched only when surrounded by spaces, i.e. as
    # standalone tokens. Applied in this order.
    abbreviations = [
        (" л ", " литр "),
        (" г ", ' грамм '),
        (" кг ", ' килограмм '),
        (" мг ", ' миллиграмм '),
        (" мл ", ' миллилитр '),
        (" шт ", " штук "),
        (" см ", " сантиметр "),
        (" м ", " метр "),
        (' д ', ' для '),
    ]

    # The appended '.' turns into a trailing space in the next step, so an
    # abbreviation that ends the name still has a space after it.
    X = (data['NAME'] + '.').str.lower()
    # regex is passed explicitly: since pandas 2.0 `str.replace` treats the
    # pattern as a literal by default, which would leave the digits in place.
    X = X.str.replace(punctuation_re, " ", regex=True)
    X = X.str.replace(r'\d+', '', regex=True)
    for short, full in abbreviations:
        X = X.str.replace(short, full, regex=False)

    y = data['category']

    # The previous version also assembled a throw-away `test` DataFrame that
    # was never used or returned; it has been removed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    return X_train, X_test, y_train, y_test

In [176]:
# Re-run the bagged Naive Bayes baseline on the cleaned names (punctuation and
# digits stripped, unit abbreviations expanded) and preview the errors.
errors_baseline_fixed, vectorizer = baseline(data, prepare_data_numbers_fix)
print("Errors count: {}".format(errors_baseline_fixed.shape[0]))
errors_baseline_fixed.head()

Vocabulary length: 21000
Accuracy: 0.9667621434538728
LogLoss: 0.11574519581400006
                           precision    recall  f1-score   support

                 Алкоголь       0.98      0.96      0.97      3583
                   Непрод       0.97      0.99      0.98     11214
Позиция по свободной цене       0.00      0.00      0.00         4
                     Прод       0.94      0.85      0.89      1774
                      н/д       0.99      0.84      0.91       183

              avg / total       0.97      0.97      0.97     16758

Errors count: 557


  'precision', 'predicted', average, warn_for)


Unnamed: 0_level_0,predicted,real,NAME
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6561,Непрод,Алкоголь,битоксибациллин п август гр
48333,Непрод,Алкоголь,набор подарочный для вина
43089,Непрод,Прод,малена акконд
49520,Прод,Непрод,кофе hausbrandt nero espresso
54820,Непрод,Алкоголь,°по фаренгейту р брэдбери


In [200]:
# Re-run the XGBoost pipeline on the cleaned names and preview the errors.
# Note: this rebinds `errors` and `vectorizer`.
errors, vectorizer = boosting(data, prepare_data_numbers_fix)
print("Errors count: {}".format(errors.shape[0]))
errors.head()

Vocabulary length: 30945
Accuracy: 0.9328678839957035
LogLoss: 0.20956943042317075
                           precision    recall  f1-score   support

                 Алкоголь       0.99      0.90      0.95      3583
                   Непрод       0.93      0.98      0.95     11214
Позиция по свободной цене       1.00      0.75      0.86         4
                     Прод       0.84      0.70      0.76      1774
                      н/д       0.98      0.87      0.92       183

              avg / total       0.93      0.93      0.93     16758

Errors count: 1125


Unnamed: 0_level_0,predicted,real,NAME
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6561,Прод,Алкоголь,битоксибациллин п август гр
48333,Непрод,Алкоголь,набор подарочный для вина
15024,Непрод,Прод,зефир воздушный поцелуй клубничный
50666,Непрод,Прод,бутерброд №
38717,Прод,Непрод,ушастый нянь weight


In [178]:
# Re-run the stemmed Naive Bayes model on the cleaned names and preview the
# errors. Note: this rebinds `errors` and `vectorizer`.
errors, vectorizer = predict_with_stemmer(data, prepare_data_numbers_fix)
print("Errors count: {}".format(errors.shape[0]))
errors.head()

Vocabulary length: 23457
Accuracy: 0.9634204559016589
LogLoss: 0.12338980122092645
                           precision    recall  f1-score   support

                 Алкоголь       0.97      0.96      0.96      3583
                   Непрод       0.97      0.99      0.98     11214
Позиция по свободной цене       0.00      0.00      0.00         4
                     Прод       0.93      0.84      0.89      1774
                      н/д       0.99      0.84      0.91       183

              avg / total       0.96      0.96      0.96     16758

Errors count: 613


  'precision', 'predicted', average, warn_for)


Unnamed: 0_level_0,predicted,real,NAME
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6561,Непрод,Алкоголь,битоксибациллин п август гр
49297,Алкоголь,Непрод,ценникодержатель полочный самоклеющийся dbr ...
48333,Непрод,Алкоголь,набор подарочный для вина
43089,Непрод,Прод,малена акконд
49520,Прод,Непрод,кофе hausbrandt nero espresso
