import nltk

nltk.download()

In [74]:
import pandas as pd
import numpy as np
import sklearn.model_selection as ms

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

from sklearn import metrics

from nltk.corpus import stopwords

# Разобраться с данными

In [2]:
ads = pd.read_csv('train.csv', index_col='item_id', encoding= 'utf-8')

In [3]:
ads.head(10)

Unnamed: 0_level_0,title,description,price,category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Картина,Гобелен. Размеры 139х84см.,1000.0,19
1,Стулья из прессованной кожи,Продам недорого 4 стула из светлой прессованно...,1250.0,22
2,Домашняя мини баня,"Мини баня МБ-1(мини сауна), предназначена для ...",13000.0,37
3,"Эксклюзивная коллекция книг ""Трансаэро"" + подарок","Продам эксклюзивную коллекцию книг, выпущенную...",4000.0,43
4,Ноутбук aser,Продаётся ноутбук ACER e5-511C2TA. Куплен в ко...,19000.0,1
5,Бас гитара invasion bg110,Состояние хорошее. Имеется теплый чехол .,3999.0,50
6,"Смесь ""Грудничок"" г. Зеленодольск",Смесь молочная адаптированная ультрапастеризов...,15.0,41
7,G-shock,Часы абсолютно новые! с коробкой. Часы Китай...,2500.0,36
8,"Санатории Белоруссии. - ""Лепельский военный""",Санатории Белоруссии! - «Лепельский военный» ...,1090.0,48
9,Фотохолст,Фотохолст на подрамнике. 36х58см. Галерейная н...,1250.0,19


In [4]:
ads.isnull().any()

title          False
description    False
price          False
category_id    False
dtype: bool

In [5]:
ads.shape

(489517, 4)

In [6]:
class_frequency = pd.crosstab(index=ads.category_id, columns="count")

In [7]:
class_frequency

col_0,count
category_id,Unnamed: 1_level_1
0,8862
1,8022
2,9887
3,8604
4,8616
5,8241
6,8697
7,8592
8,8698
9,8033


In [8]:
target = ads.category_id
data = ads[['title','description','price']]

In [9]:
print(target.min(), target.max())

0 53


# Таким образом:

1) Наивный Байес 
    
2) Метод опорных векторов
    
3) Логистическая регрессия

### Наивный Баес

In [10]:
def preprocessingNB(data):
    return pd.DataFrame({'full_descr': data.title +' '+data.description})

In [11]:
dataNB = preprocessingNB(data)

In [12]:
dataNB.head(10)

Unnamed: 0_level_0,full_descr
item_id,Unnamed: 1_level_1
0,Картина Гобелен. Размеры 139х84см.
1,Стулья из прессованной кожи Продам недорого 4 ...
2,"Домашняя мини баня Мини баня МБ-1(мини сауна),..."
3,"Эксклюзивная коллекция книг ""Трансаэро"" + пода..."
4,Ноутбук aser Продаётся ноутбук ACER e5-511C2TA...
5,Бас гитара invasion bg110 Состояние хорошее. И...
6,"Смесь ""Грудничок"" г. Зеленодольск Смесь молочн..."
7,G-shock Часы абсолютно новые! с коробкой. Часы...
8,"Санатории Белоруссии. - ""Лепельский военный"" С..."
9,Фотохолст Фотохолст на подрамнике. 36х58см. Га...


In [13]:
X_train, X_test, y_train, y_test = ms.train_test_split(dataNB, target, test_size = 0.3, stratify = target)

#### Лучшие найденные параметры для этого:

min_df : 2 

max_df : 0.05 

alpha  : 0.4 

In [14]:
stop_words = stopwords.words('russian')

In [20]:
%%time
count_vect = CountVectorizer(min_df=2, max_df = 0.05, stop_words = stop_words)
count_vect.fit(X_train.full_descr.tolist())

Wall time: 15.6 s


In [21]:
%%time
X_train_counts = count_vect.transform(X_train.full_descr.tolist())
X_test_counts = count_vect.transform(X_test.full_descr.tolist())

Wall time: 19.4 s


In [22]:
%%time
clf = MultinomialNB(alpha = 0.4).fit(X_train_counts, y_train)
predicted = clf.predict(X_test_counts)
acc = metrics.accuracy_score(y_test, predicted)
print(acc)

0.866175028599
Wall time: 1.89 s


Исследуем как влияет каждый параметр в отдельности:

In [83]:
def check_vect_MultiNB(parameters_grid):
    pipe_counts_MultiNB = Pipeline(steps=[('vect', CountVectorizer(stop_words=stop_words)),
                                          ('clf', MultinomialNB())])

    cv = ms.StratifiedShuffleSplit(n_splits=2)
    grid_cv = ms.GridSearchCV(pipe_counts_MultiNB, parameters_grid, scoring = 'accuracy', n_jobs=5, cv=cv)
    
    grid_cv.fit(X_train.full_descr.tolist(), y_train)
    test_predictions = grid_cv.best_estimator_.predict(X_test.full_descr.tolist())
    acc = metrics.accuracy_score(y_test, test_predictions)
    print('accuracy =', acc)
    print('best :', grid_cv.best_params_)
    print('Отчет:')
    for t in zip(grid_cv.cv_results_['params'], grid_cv.cv_results_['mean_test_score']):
        print(t)
    
    return grid_cv

In [88]:
%%time
vect__min_df = [1, 2, 3, 4, 5]
parameters_grid = {'vect__min_df' : vect__min_df}
ans = check_vect_MultiNB(parameters_grid)

accuracy = 0.861170125838
best : {'vect__min_df': 2}
Отчет:
({'vect__min_df': 1}, 0.85999649808853995)
({'vect__min_df': 2}, 0.86097411503779142)
({'vect__min_df': 3}, 0.86033209793679055)
({'vect__min_df': 4}, 0.85998190679079001)
({'vect__min_df': 5}, 0.85875623777978816)
Wall time: 2min 23s


In [89]:
c = CountVectorizer(stop_words=stop_words).fit(X_train.full_descr.tolist(), y_train).vocabulary_
print(len(c))

419320


In [90]:
c = CountVectorizer(stop_words=stop_words, min_df=2).fit(X_train.full_descr.tolist(), y_train).vocabulary_
print(len(c))

193131


На стандартном качество: 0.85

Так же для стандартного указать размер словаря

http://www.myvocab.info/articles/slovarniy-zapas-nositeley-russkogo-yazyka-vliyanie-vozrasta-i-obrazovaniya
    от 30000 до 100000

In [91]:
%%time
vect__max_features = [30000, 60000, 90000, 120000, 150000, 180000, 210000]
parameters_grid = {'vect__min_df' : [2],
                   'vect__max_features' : vect__max_features}
ans = check_vect_MultiNB(parameters_grid)

accuracy = 0.861170125838
best : {'vect__max_features': 210000, 'vect__min_df': 2}
Отчет:
({'vect__max_features': 30000, 'vect__min_df': 2}, 0.84346455773776519)
({'vect__max_features': 60000, 'vect__min_df': 2}, 0.85291971867977934)
({'vect__max_features': 90000, 'vect__min_df': 2}, 0.85612980418478424)
({'vect__max_features': 120000, 'vect__min_df': 2}, 0.85760352525753636)
({'vect__max_features': 150000, 'vect__min_df': 2}, 0.85815799457203723)
({'vect__max_features': 180000, 'vect__min_df': 2}, 0.85842063793153767)
({'vect__max_features': 210000, 'vect__min_df': 2}, 0.85850818571803778)
Wall time: 3min 7s


In [92]:
c = CountVectorizer(stop_words=stop_words, min_df=2, max_features=210000).fit(X_train.full_descr.tolist(), y_train).vocabulary_
print(len(c))

193131


Гнаться смысла нет

In [93]:
%%time
vect__max_df = [1.0, 0.8, 0.6, 0.5, 0.1]
parameters_grid = {'vect__min_df': [2],
                   'vect__max_df' : vect__max_df}
ans = check_vect_MultiNB(parameters_grid)

accuracy = 0.863090374244
best : {'vect__max_df': 0.1, 'vect__min_df': 2}
Отчет:
({'vect__max_df': 1.0, 'vect__min_df': 2}, 0.85821635976303734)
({'vect__max_df': 0.8, 'vect__min_df': 2}, 0.85821635976303734)
({'vect__max_df': 0.6, 'vect__min_df': 2}, 0.85821635976303734)
({'vect__max_df': 0.5, 'vect__min_df': 2}, 0.85821635976303734)
({'vect__max_df': 0.1, 'vect__min_df': 2}, 0.86050719350979077)
Wall time: 2min 25s


In [94]:
c = CountVectorizer(stop_words=stop_words, min_df=2, max_df = 0.1).fit(X_train.full_descr.tolist(), y_train).vocabulary_
print(len(c))

193127


In [95]:
%%time
vect__max_df = [0.1, 0.09, 0.08, 0.05, 0.01]
parameters_grid = {'vect__min_df': [2],
                   'vect__max_df' : vect__max_df}
ans = check_vect_MultiNB(parameters_grid)

accuracy = 0.864261589584
best : {'vect__max_df': 0.05, 'vect__min_df': 2}
Отчет:
({'vect__max_df': 0.1, 'vect__min_df': 2}, 0.86225814923979338)
({'vect__max_df': 0.09, 'vect__min_df': 2}, 0.86233110572854355)
({'vect__max_df': 0.08, 'vect__min_df': 2}, 0.86234569702629349)
({'vect__max_df': 0.05, 'vect__min_df': 2}, 0.86387778329004583)
({'vect__max_df': 0.01, 'vect__min_df': 2}, 0.84355210552426529)
Wall time: 2min 28s


In [96]:
c = CountVectorizer(stop_words=stop_words, min_df=2, max_df = 0.05).fit(X_train.full_descr.tolist(), y_train).vocabulary_
print(len(c))

193110


In [98]:
%%time
vect__max_df = [0.05, 0.04, 0.03]
parameters_grid = {'vect__min_df': [2],
                   'vect__max_df' : vect__max_df}
ans = check_vect_MultiNB(parameters_grid)

accuracy = 0.864261589584
best : {'vect__max_df': 0.05, 'vect__min_df': 2}
Отчет:
({'vect__max_df': 0.05, 'vect__min_df': 2}, 0.86360054863279545)
({'vect__max_df': 0.04, 'vect__min_df': 2}, 0.86263752298129392)
({'vect__max_df': 0.03, 'vect__min_df': 2}, 0.86275425336329414)
Wall time: 1min 46s


In [100]:
%%time
alpha = [1.0, 0.8, 0.6, 0.4, 0.2]
parameters_grid = {'vect__min_df': [2],
                   'vect__max_df' : [0.05],
                   'clf__alpha' : alpha}
ans = check_vect_MultiNB(parameters_grid)

accuracy = 0.866815111402
best : {'clf__alpha': 0.4, 'vect__max_df': 0.05, 'vect__min_df': 2}
Отчет:
({'clf__alpha': 1.0, 'vect__max_df': 0.05, 'vect__min_df': 2}, 0.86472407855954714)
({'clf__alpha': 0.8, 'vect__max_df': 0.05, 'vect__min_df': 2}, 0.86542446085154812)
({'clf__alpha': 0.6, 'vect__max_df': 0.05, 'vect__min_df': 2}, 0.86618320833454931)
({'clf__alpha': 0.4, 'vect__max_df': 0.05, 'vect__min_df': 2}, 0.86721919047480078)
({'clf__alpha': 0.2, 'vect__max_df': 0.05, 'vect__min_df': 2}, 0.86679604284005018)
Wall time: 2min 15s


In [101]:
%%time
alpha = [0.5, 0.4, 0.3]
parameters_grid = {'vect__min_df': [2],
                   'vect__max_df' : [0.05],
                   'clf__alpha' : alpha}
ans = check_vect_MultiNB(parameters_grid)

accuracy = 0.866815111402
best : {'clf__alpha': 0.4, 'vect__max_df': 0.05, 'vect__min_df': 2}
Отчет:
({'clf__alpha': 0.5, 'vect__max_df': 0.05, 'vect__min_df': 2}, 0.86530773046954801)
({'clf__alpha': 0.4, 'vect__max_df': 0.05, 'vect__min_df': 2}, 0.86533691306504801)
({'clf__alpha': 0.3, 'vect__max_df': 0.05, 'vect__min_df': 2}, 0.86491376543029741)
Wall time: 1min 48s


Добавим TF-IDF

In [102]:
def check_TfidfVec_MultiNB(parameters_grid):
    pipe_counts_MultiNB = Pipeline(steps=[('TfidfVec', TfidfVectorizer(stop_words=stop_words,
                                                                   min_df=2,
                                                                   max_df=0.5)),
                                          ('clf', MultinomialNB())])

    cv = ms.StratifiedShuffleSplit(n_splits=2)
    grid_cv = ms.GridSearchCV(pipe_counts_MultiNB, parameters_grid, scoring = 'accuracy', n_jobs=5, cv=cv)
    
    grid_cv.fit(X_train.full_descr.tolist(), y_train)
    test_predictions = grid_cv.best_estimator_.predict(X_test.full_descr.tolist())
    acc = metrics.accuracy_score(y_test, test_predictions)
    print('accuracy =', acc)
    print('best :', grid_cv.best_params_)
    print('Отчет:')
    for t in zip(grid_cv.cv_results_['params'], grid_cv.cv_results_['mean_test_score']):
        print(t)
    
    return grid_cv

In [105]:
%%time

parameters_grid = {'TfidfVec__sublinear_tf': [True, False]} #,
                   #'TfidfVec__norm' : ['l1', 'l2', None],
                   #'TfidfVec__use_idf' : [True, False]}

ans = check_TfidfVec_MultiNB(parameters_grid)

accuracy = 0.864343302283
best : {'TfidfVec__sublinear_tf': False}
Отчет:
({'TfidfVec__sublinear_tf': True}, 0.86056555870079088)
({'TfidfVec__sublinear_tf': False}, 0.86196632328479295)
Wall time: 1min 16s


In [106]:
ans.cv_results_

{'mean_fit_time': array([ 22.8700099 ,  23.93034196]),
 'mean_score_time': array([ 2.70836639,  2.00045228]),
 'mean_test_score': array([ 0.86056556,  0.86196632]),
 'mean_train_score': array([ 0.89230335,  0.89278974]),
 'param_TfidfVec__sublinear_tf': masked_array(data = [True False],
              mask = [False False],
        fill_value = ?),
 'params': [{'TfidfVec__sublinear_tf': True},
  {'TfidfVec__sublinear_tf': False}],
 'rank_test_score': array([2, 1]),
 'split0_test_score': array([ 0.85904806,  0.8603321 ]),
 'split0_train_score': array([ 0.89242657,  0.89281568]),
 'split1_test_score': array([ 0.86208305,  0.86360055]),
 'split1_train_score': array([ 0.89218013,  0.8927638 ]),
 'std_fit_time': array([ 0.11923885,  0.05425787]),
 'std_score_time': array([ 0.06826901,  0.12542558]),
 'std_test_score': array([ 0.00151749,  0.00163423]),
 'std_train_score': array([  1.23218999e-04,   2.59408419e-05])}

In [114]:
%%time

parameters_grid = {'TfidfVec__sublinear_tf': [True, False],
                   'TfidfVec__norm' : ['l1', 'l2', None],
                   'TfidfVec__use_idf' : [True, False]}

ans = check_TfidfVec_MultiNB(parameters_grid)

accuracy = 0.864343302283
best : {'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True}
Отчет:
({'TfidfVec__norm': 'l1', 'TfidfVec__sublinear_tf': True, 'TfidfVec__use_idf': True}, 0.83587708290775387)
({'TfidfVec__norm': 'l1', 'TfidfVec__sublinear_tf': True, 'TfidfVec__use_idf': False}, 0.80792015641871184)
({'TfidfVec__norm': 'l1', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True}, 0.83898502932850849)
({'TfidfVec__norm': 'l1', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': False}, 0.81216622406396821)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': True, 'TfidfVec__use_idf': True}, 0.86025914144804039)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': True, 'TfidfVec__use_idf': False}, 0.84337700995126508)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True}, 0.86168908862754257)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': False}, 0.84540520033851807)
({'TfidfVec

In [116]:
%%time
alpha = alpha = [0.8, 0.6, 0.5, 0.4, 0.3]
parameters_grid = {'TfidfVec__sublinear_tf': [False],
                   'TfidfVec__norm' : ['l2'],
                   'TfidfVec__use_idf' : [True],
                   'clf__alpha': alpha}

ans = check_TfidfVec_MultiNB(parameters_grid)

accuracy = 0.871363784932
best : {'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.3}
Отчет:
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.8}, 0.86389237458779589)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.6}, 0.86548282604254823)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.5}, 0.86665012986254997)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.4}, 0.86789039017130187)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.3}, 0.86921819826655378)
Wall time: 2min 23s


In [117]:
%%time
alpha = [0.35, 0.3, 0.25, 0.2, 0.1]
parameters_grid = {'TfidfVec__sublinear_tf': [False],
                   'TfidfVec__norm' : ['l2'],
                   'TfidfVec__use_idf' : [True],
                   'clf__alpha': alpha}

ans = check_TfidfVec_MultiNB(parameters_grid)

accuracy = 0.872909516806
best : {'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.1}
Отчет:
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.35}, 0.86946625032830416)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.3}, 0.87002071964280503)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.25}, 0.87054600636180579)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.2}, 0.87117343216505672)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.1}, 0.87264715323780895)
Wall time: 2min 27s


In [119]:
%%time
alpha = [0.1, 0.01, 0.001]
parameters_grid = {'TfidfVec__sublinear_tf': [False],
                   'TfidfVec__norm' : ['l2'],
                   'TfidfVec__use_idf' : [True],
                   'clf__alpha': alpha}

ans = check_TfidfVec_MultiNB(parameters_grid)

accuracy = 0.872909516806
best : {'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.1}
Отчет:
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.1}, 0.87267633583330906)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.01}, 0.86602270405929904)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.001}, 0.85379519654478064)
Wall time: 1min 54s


In [121]:
%%time
alpha = [0.09, 0.08, 0.07, 0.06, 0.05]
parameters_grid = {'TfidfVec__sublinear_tf': [False],
                   'TfidfVec__norm' : ['l2'],
                   'TfidfVec__use_idf' : [True],
                   'clf__alpha': alpha}

ans = check_TfidfVec_MultiNB(parameters_grid)

accuracy = 0.872616712971
best : {'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.08}
Отчет:
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.09}, 0.87143607552455715)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.08}, 0.87155280590655737)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.07}, 0.87115884086730677)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.06}, 0.87085242361455628)
({'TfidfVec__norm': 'l2', 'TfidfVec__sublinear_tf': False, 'TfidfVec__use_idf': True, 'clf__alpha': 0.05}, 0.87031254559780546)
Wall time: 2min 28s


#### Таким образом, лучшие параметры:
min_df : 2

max_df : 0.05

sublinear_tf : False

norm : 'l2'

use_idf : True

alpha : 0.08

## Метод опорных векторов

In [78]:
def preprocessingSVM(data):
    return pd.DataFrame({'full_descr': data.title +' '+data.description, 'price': data.price})

In [79]:
dataSVM = preprocessingSVM(data)

In [80]:
dataSVM.head()

Unnamed: 0_level_0,full_descr,price
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Картина Гобелен. Размеры 139х84см.,1000.0
1,Стулья из прессованной кожи Продам недорого 4 ...,1250.0
2,"Домашняя мини баня Мини баня МБ-1(мини сауна),...",13000.0
3,"Эксклюзивная коллекция книг ""Трансаэро"" + пода...",4000.0
4,Ноутбук aser Продаётся ноутбук ACER e5-511C2TA...,19000.0


In [81]:
X_train, X_test, y_train, y_test = ms.train_test_split(dataSVM, target, test_size = 0.3, stratify = target)

In [82]:
def check_TfidfVec_SVM(parameters_grid):
       
    pipe = Pipeline(steps = [
        ('feature_preprocessing', FeatureUnion(transformer_list = [
            ('text', Pipeline(steps = [
                ('selecting', FunctionTransformer(lambda data: data[:,'full_descr'].tolist())),
                ('TfidfVec', TfidfVectorizer(stop_words = stop_words,
                                             min_df = 2,
                                             max_df = 0.5,
                                             sublinear_tf = False,
                                             norm = 'l2',
                                             use_idf = True))
            ])),
            ('price', Pipeline(steps = [
                ('selecting', FunctionTransformer(lambda data: data[:,'price'])),
                ('scaling', StandardScaler())                
            ]))
        ])),
        ('clf', SGDClassifier())
    ])

    cv = ms.StratifiedShuffleSplit(n_splits=2)
    grid_cv = ms.GridSearchCV(pipe, parameters_grid, scoring = 'accuracy', n_jobs=5, cv=cv)
    
    grid_cv.fit(X_train, y_train)
    test_predictions = grid_cv.best_estimator_.predict(X_test)
    acc = metrics.accuracy_score(y_test, test_predictions)
    print('accuracy =', acc)
    print('best :', grid_cv.best_params_)
    print('Отчет:')
    for t in zip(grid_cv.cv_results_['params'], grid_cv.cv_results_['mean_test_score']):
        print(t)
    
    return grid_cv

In [83]:
%%time

parameters_grid = {}

ans = check_TfidfVec_SVM(parameters_grid)



AttributeError: Can't pickle local object 'check_TfidfVec_SVM.<locals>.<lambda>'

In [87]:
col1 = ['full_descr']
col1_ind = np.array([(column in col1) for column in X_train.columns], dtype = bool)
col2 = ['price']
col2_ind = np.array([(column in col2) for column in X_train.columns], dtype = bool)

pipe = Pipeline(steps = [
        ('feature_preprocessing', FeatureUnion(transformer_list = [
            ('text', Pipeline(steps = [
                ('selecting', FunctionTransformer(lambda data: data[:, col1_ind])),
                ('TfidfVec', TfidfVectorizer())
            ])),
            ('price', Pipeline(steps = [
                ('selecting', FunctionTransformer(lambda data: data[:, col2_ind])),
                ('scaling', StandardScaler())                
            ]))
        ])),
        ('clf', SGDClassifier())
    ])



In [88]:
pipe.fit(X_train, y_train)

ValueError: could not convert string to float: 'Стиральная машина полуавтомат фея Российского производства'

In [85]:
col1_ind

array([ True, False], dtype=bool)

In [86]:
col2_ind

array([False,  True], dtype=bool)

In [32]:
%%time
count_vect = TfidfVectorizer(min_df=2, max_df = 0.05, stop_words = stop_words)
count_vect.fit(X_train.full_descr.tolist())

Wall time: 15.7 s


In [33]:
%%time
X_train_counts = count_vect.transform(X_train.full_descr.tolist())
X_test_counts = count_vect.transform(X_test.full_descr.tolist())

Wall time: 19.9 s


In [34]:
%%time
clf = MultinomialNB(alpha = 0.4).fit(X_train_counts, y_train)
predicted = clf.predict(X_test_counts)
acc = metrics.accuracy_score(y_test, predicted)
print(acc)

0.870676036389
Wall time: 1.86 s
