**Head**

In [131]:
import pandas as pd
import numpy as np
import pandas_profiling
import re
import sklearn
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression

**Cleaning**

In [132]:
df = pd.read_csv('trainset.csv', delimiter=';')
df.product_name = df.product_name.astype(str)

In [133]:
df['clean_product_name'] = df['product_name'].str.lower()
df['clean_product_name'] = df['clean_product_name'].str.replace(r'[,:\(\)"_|#№]', u'')
df['clean_product_name'] = df['clean_product_name'].str.replace(r'сиг([и]?|ар?(ет|илл?)[ыа])[а-я]*', u'')
df['clean_product_name'] = df['clean_product_name'].str.replace(r'мрц\s?\d*[,.-=-]*\d*\s?', u'')
# df['clean_product_name'] = df['clean_product_name'].str.replace(r'(?<=[^0-9])[\_\*\'\"\.,]', u'')
# df['clean_product_name'] = df['clean_product_name'].str.replace(r'\d?\d?\s?\(?ш[т\.]у?к?\)?', u'')
# df['clean_product_name'] = df['clean_product_name'].str.replace(r'([^а-я]*р[.у][^а-я]б?|р[.у]б?л?е?й?$)', u'')
# df['clean_product_name'] = df['clean_product_name'].str.replace(r'\d\d?\d?[-\/,]\d?\d?\d?', u'')
# df['clean_product_name'] = df['clean_product_name'].str.replace(r'^(аа|яя)\W?', u'')
# df['clean_product_name'] = df['clean_product_name'].str.replace(r'\(\s?\d*\s?[,\.=-]?\s?\d*\s?\)', u'')
# df['clean_product_name'] = df['clean_product_name'].str.replace(r'(б[.л]о?к?$|п[а.]ч?к?а?[^а-я])', u'')
# df['clean_product_name'] = df['clean_product_name'].str.replace(r'(^\d{2,}\s*|[^а-я]\d{3,}\s*$)', u'')
# df['clean_product_name'] = df['clean_product_name'].str.replace(r'(\[\d*|\d*\])', u'')
df['clean_product_name'] = df['clean_product_name'].str.replace(r'\/ш\/', u'')
df['clean_product_name'] = df['clean_product_name'].str.replace(r'[\;\!\\~\+\*]', u'')
df['clean_product_name'] = df['clean_product_name'].str.replace(r'(\/а|п$)', u'')
df['clean_product_name'] = df['clean_product_name'].str.replace(r'пач[.к]?а?', u'')
df['clean_product_name'] = df['clean_product_name'].str.replace(r'\d*\s?МГ', u'')
df['clean_product_name'] = df['clean_product_name'].str.replace(r'папиро?с?ы?', u'')
# df['clean_product_name'] = df['clean_product_name'].str.replace(r'\d{1,}р\s?[^а-я]', u'')
df['clean_product_name'] = df['clean_product_name'].str.replace(r'\d{4,}', u'')
df['clean_product_name'] = df['clean_product_name'].str.replace(r'р\.', u'')
df['clean_product_name'] = df['clean_product_name'].str.replace(r'-00', u'')
# df['clean_product_name'] = df['clean_product_name'].str.replace(r'(^ | $)', u'')
df['clean_product_name'] = df['clean_product_name'].str.upper()

In [71]:
# df.sample(100, random_state=65663)
# for i in df.sample(100).clean_product_name:
#     print(i)

**CountVectorizer & TF-IDF**

In [134]:
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
X = df['clean_product_name']
y = df['brand_variant_code']
y_category = y.astype('category').cat.codes
X_train, X_test, y_train, y_test = train_test_split(X, y_category)

In [135]:
matrix_train = vectorizer.fit_transform(X_train)
tf_idf_train = transformer.fit_transform(matrix_train)
matrix_test = vectorizer.transform(X_test)
tf_idf_test = transformer.transform(matrix_test)

**Algorithm I - Decision Tree**

In [136]:
%%time
clf = tree.DecisionTreeClassifier(splitter='random')
clf.fit(tf_idf_train, y_train)

CPU times: user 40.4 s, sys: 344 ms, total: 40.8 s
Wall time: 40.8 s


In [137]:
predicted = clf.predict(tf_idf_test)

**Accuracy**

In [138]:
sklearn.metrics.accuracy_score(y_test, predicted)

0.9037390480307237

**Algorithm II - Naive Bayes**

In [125]:
%%time
nb = MultinomialNB(alpha=0.01, fit_prior='False')
nb.fit(tf_idf_train, y_train)

CPU times: user 2.37 s, sys: 1.04 s, total: 3.41 s
Wall time: 3.4 s


In [126]:
predicted = nb.predict(tf_idf_test)

**Accuracy**

In [127]:
%%time
sklearn.metrics.accuracy_score(y_test, predicted)

CPU times: user 8.23 ms, sys: 1.1 ms, total: 9.33 ms
Wall time: 7.24 ms


0.8475618879781875

**Algorithm III - Logistic Regression**

In [128]:
%%time
lr = LogisticRegression(random_state=0, solver='liblinear', multi_class='auto').fit(tf_idf_train, y_train)

CPU times: user 38min 54s, sys: 56.4 ms, total: 38min 54s
Wall time: 38min 54s


In [129]:
predicted = lr.predict(tf_idf_test)

**Accuracy**

In [130]:
%%time
sklearn.metrics.accuracy_score(y_test, predicted)

CPU times: user 8.52 ms, sys: 2.11 ms, total: 10.6 ms
Wall time: 8.95 ms


0.8636552561139836