In [2]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier
from sklearn.svm import LinearSVC

In [3]:
# Training set: read without a header row, so its columns are addressed by
# position (the cells below use column 0 as the text and column 1 as the label).
data_train = pd.read_csv(
    '../../product-reviews-sentiment-analysis-light/products_sentiment_train.tsv',
    sep='\t',
    header=None,
)
# Test set: read with pandas' default header handling (first row = column names).
data_test = pd.read_csv(
    '../../product-reviews-sentiment-analysis-light/products_sentiment_test.tsv',
    sep='\t',
)
# Show the first rows of the test set as the cell output.
data_test.head()


Unnamed: 0,Id,text
0,0,"so , why the small digital elph , rather than ..."
1,1,3/4 way through the first disk we played on it...
2,2,better for the zen micro is outlook compatibil...
3,3,6 . play gameboy color games on it with goboy .
4,4,"likewise , i 've heard norton 2004 professiona..."


In [4]:
def text_classifier(vectorizer, classifier):
    """Combine a text vectorizer and a classifier into a single Pipeline.

    Parameters
    ----------
    vectorizer : object
        Placed in the pipeline under the step name "vectorizer".
    classifier : object
        Placed in the pipeline under the step name "classifier".

    Returns
    -------
    Pipeline
        A pipeline with the two steps in the order vectorizer -> classifier.
    """
    steps = [
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)

Для начала применим частотное преобразование и логистическую регрессию 

In [5]:
# Baseline: raw token counts + logistic regression, both with default settings.
vtr = CountVectorizer()
clf = LogisticRegression()
# Run the 5-fold cross-validation once and derive both statistics from the same
# fold scores; running it separately for the mean and the std doubles the cost.
cv_scores = cross_val_score(text_classifier(vtr, clf), data_train.iloc[:, 0], data_train.iloc[:, 1], cv = 5)
x_mean = cv_scores.mean()
x_std = cv_scores.std()
print ('Mean:', x_mean)
print ('STD:', x_std)



Mean: 0.7684956843480272
STD: 0.007634111236534462


Я попробовал поработать с разными параметрами векторайзера. Результат несколько улучшает добавление n-грамм по словам, все остальные изменения (мин. и макс. частота, n-граммы по символам, исключение стоп-слов и т. п.) только ухудшают метрику.

In [6]:
# TF-IDF weighting instead of raw counts, classifier left at its defaults.
vtr = TfidfVectorizer()
clf = LogisticRegression()
# Run the 5-fold cross-validation once and derive both statistics from the same
# fold scores; running it separately for the mean and the std doubles the cost.
cv_scores = cross_val_score(text_classifier(vtr, clf), data_train.iloc[:, 0], data_train.iloc[:, 1], cv = 5)
x_mean = cv_scores.mean()
x_std = cv_scores.std()
print ('Mean:', x_mean)
print ('STD:', x_std)



Mean: 0.7665031843949025
STD: 0.011066947966561875


Результат практически не изменился (0.767 против 0.768). Вернёмся к CountVectorizer и поработаем с его параметрами. Добавим n-граммы.

In [7]:
# Token counts over word unigrams and bigrams; C = 1 is passed explicitly.
vtr = CountVectorizer(ngram_range = (1,2))
clf = LogisticRegression(C = 1)
# Run the 5-fold cross-validation once and derive both statistics from the same
# fold scores; running it separately for the mean and the std doubles the cost.
cv_scores = cross_val_score(text_classifier(vtr, clf), data_train.iloc[:, 0], data_train.iloc[:, 1], cv = 5)
x_mean = cv_scores.mean()
x_std = cv_scores.std()
print ('Mean:', x_mean)
print ('STD:', x_std)



Mean: 0.7705044437777736
STD: 0.007173639722628096


Результат незначительно улучшился. 
Попробуем n-граммы по буквам

In [8]:
# Character n-grams of length 1..5 using the 'char_wb' analyzer.
vtr = CountVectorizer(ngram_range = (1,5), analyzer = 'char_wb')
clf = LogisticRegression()
# Run the 5-fold cross-validation once and derive both statistics from the same
# fold scores; running it separately for the mean and the std doubles the cost.
cv_scores = cross_val_score(text_classifier(vtr, clf), data_train.iloc[:, 0], data_train.iloc[:, 1], cv = 5)
x_mean = cv_scores.mean()
x_std = cv_scores.std()
print ('Mean:', x_mean)
print ('STD:', x_std)



Mean: 0.7579918936993357
STD: 0.010830191552015823


Результат хуже. 
Добавим исключение редких слов.

In [9]:
# Word uni/bigrams, dropping terms with min_df = 10 to exclude rare ones.
vtr = CountVectorizer(ngram_range = (1,2), min_df = 10)
clf = LogisticRegression()
# Run the 5-fold cross-validation once and derive both statistics from the same
# fold scores; running it separately for the mean and the std doubles the cost.
cv_scores = cross_val_score(text_classifier(vtr, clf), data_train.iloc[:, 0], data_train.iloc[:, 1], cv = 5)
x_mean = cv_scores.mean()
x_std = cv_scores.std()
print ('Mean:', x_mean)
print ('STD:', x_std)



Mean: 0.7540106344414652
STD: 0.00872389766833005


Результат хуже. Попробуем исключить стоп-слова.


In [10]:
# Word uni/bigrams with the built-in English stop-word list removed.
vtr = CountVectorizer(ngram_range = (1,2), stop_words = 'english')
clf = LogisticRegression()
# Run the 5-fold cross-validation once and derive both statistics from the same
# fold scores; running it separately for the mean and the std doubles the cost.
cv_scores = cross_val_score(text_classifier(vtr, clf), data_train.iloc[:, 0], data_train.iloc[:, 1], cv = 5)
x_mean = cv_scores.mean()
x_std = cv_scores.std()
print ('Mean:', x_mean)
print ('STD:', x_std)



Mean: 0.7460069062931642
STD: 0.018994913354064887


Улучшений нет. Ограничим по максимальной частоте

In [11]:
# Word uni/bigrams with an upper document-frequency cap of max_df = 1000.
vtr = CountVectorizer(ngram_range = (1,2), max_df = 1000)
clf = LogisticRegression()
# Run the 5-fold cross-validation once and derive both statistics from the same
# fold scores; running it separately for the mean and the std doubles the cost.
cv_scores = cross_val_score(text_classifier(vtr, clf), data_train.iloc[:, 0], data_train.iloc[:, 1], cv = 5)
x_mean = cv_scores.mean()
x_std = cv_scores.std()
print ('Mean:', x_mean)
print ('STD:', x_std)



Mean: 0.7690081844261527
STD: 0.007262698197299536


Попробуем поменять тип векторайзера

In [12]:
# Default TF-IDF vectorizer + default logistic regression.
# NOTE(review): this is the same configuration as the earlier TfidfVectorizer()
# cell, so it reproduces that cell's numbers rather than testing something new.
vtr = TfidfVectorizer()
clf = LogisticRegression()
# Run the 5-fold cross-validation once and derive both statistics from the same
# fold scores; running it separately for the mean and the std doubles the cost.
cv_scores = cross_val_score(text_classifier(vtr, clf), data_train.iloc[:, 0], data_train.iloc[:, 1], cv = 5)
x_mean = cv_scores.mean()
x_std = cv_scores.std()
print ('Mean:', x_mean)
print ('STD:', x_std)



Mean: 0.7665031843949025
STD: 0.011066947966561875


Улучшений не наблюдается. Попробуем векторайзер с хешированием.

In [13]:
# Hashing vectorizer over word uni/bigrams with lowercasing and accent stripping.
vtr = HashingVectorizer (lowercase= True, strip_accents = 'unicode', ngram_range = (1,2))
clf = LogisticRegression()
# Run the 5-fold cross-validation once and derive both statistics from the same
# fold scores; running it separately for the mean and the std doubles the cost.
cv_scores = cross_val_score(text_classifier(vtr, clf), data_train.iloc[:, 0], data_train.iloc[:, 1], cv = 5)
x_mean = cv_scores.mean()
x_std = cv_scores.std()
print ('Mean:', x_mean)
print ('STD:', x_std)



Mean: 0.7400043500271877
STD: 0.012847922905960003


Выполним более систематизированный перебор ряда параметров векторайзера и классификатора (остальные параметры подобраны ранее "за кадром") 

In [14]:
# Exhaustive search over: regularisation strength C, maximum word n-gram length,
# vectorizer type and classifier type. The combination with the highest mean
# 5-fold cross-validation score is kept.
best_mean = 0
best_std = None
# Pre-initialise the winners so the report at the bottom cannot raise NameError
# if no candidate ever scores above the initial best_mean.
best_vtr = None
best_clf = None
C_s = [0.001,0.01,0.1,1,10,100,1000]

for c_ in C_s:
    for i in range(1,5):
        print (i)
        # Candidate vectorizers, all using word n-grams of length 1..i.
        vtrs = [CountVectorizer (lowercase= True,strip_accents = 'unicode', ngram_range = (1,i)),
                HashingVectorizer (strip_accents = 'unicode', ngram_range = (1,i)),
                TfidfVectorizer(lowercase= True,strip_accents = 'unicode', ngram_range = (1,i))]

        # Candidate classifiers.
        # NOTE(review): SGDClassifier is built without c_, so with its fixed
        # random_state it is the same candidate for every value of C and is
        # re-evaluated on each outer iteration.
        # NOTE(review): newer scikit-learn releases appear to have renamed
        # loss = 'log' to 'log_loss' - confirm against the installed version.
        clfs  = [LogisticRegression(C = c_, max_iter = 1000, class_weight = 'balanced', penalty = 'l2'),
                 LinearSVC(C = c_, loss = 'squared_hinge', max_iter = 1000, class_weight = 'balanced', random_state = 42), 
                 SGDClassifier(loss = 'log', class_weight = 'balanced', max_iter = 1000, random_state = 42),
                 PassiveAggressiveClassifier (C = c_, loss = 'hinge', max_iter = 1000, class_weight = 'balanced', 
                                              random_state = 42)]

        for vtr in vtrs:
            print (vtr)
            for clf in clfs:
                # Cross-validate each candidate once and take both statistics
                # from the same fold scores instead of running the CV twice.
                cv_scores = cross_val_score(text_classifier(vtr, clf), data_train.iloc[:, 0], data_train.iloc[:, 1], cv = 5)
                x_mean = cv_scores.mean()
                x_std = cv_scores.std()

                # Strict '>' keeps the first candidate seen when scores tie.
                if x_mean > best_mean:
                    best_mean = x_mean
                    best_std = x_std
                    best_clf = clf
                    best_vtr = vtr


print ("Лучший пайплайн:")
print (best_vtr)
print (best_clf)
print ("Средняя доля правильных ответов")
print (best_mean)

1
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 1), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




2
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 2), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




3
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 3), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




4
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 4), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




1
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 1), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




2
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 2), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




3
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 3), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




4
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 4), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




1
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 1), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




2
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 2), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




3
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 3), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




4
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 4), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




1
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 1), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




2
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 2), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




3
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 3), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




4
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 4), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




1
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 1), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




2
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 2), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




3
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 3), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




4
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 4), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




1
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 1), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




2
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 2), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




3
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 3), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




4
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 4), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




1
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 1), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




2
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 2), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




3
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 3), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




4
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)




HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 4), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None,
         strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None)




TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)




Лучший пайплайн:
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
PassiveAggressiveClassifier(C=1, average=False, class_weight='balanced',
              early_stopping=False, fit_intercept=True, loss='hinge',
              max_iter=1000, n_iter=None, n_iter_no_change=5, n_jobs=None,
              random_state=42, shuffle=True, tol=None,
              validation_fraction=0.1, verbose=0, warm_start=False)
Средняя доля правильных ответов
0.7934882530515817


Победила связка векторайзера TfIdf и классификатора PassiveAggressiveClassifier

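Обучим пайплайн на всей выборке. Обратите внимание: в ячейке ниже используется TfidfVectorizer с LogisticRegression, а не победивший по кросс-валидации PassiveAggressiveClassifier.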

In [15]:
# Final model: word-level TF-IDF over 1- to 3-grams followed by a
# class-balanced logistic regression, fitted on every training row
# (column 0 is passed as the input, column 1 as the target).
# NOTE(review): the grid-search output above reports
# PassiveAggressiveClassifier as the best classifier, yet this cell fits
# LogisticRegression -- confirm the substitution is intentional.
clf_ppl = Pipeline(
    steps=[
        (
            "vectorizer",
            TfidfVectorizer(ngram_range=(1, 3), strip_accents='unicode', lowercase=True),
        ),
        (
            "classifier",
            LogisticRegression(class_weight='balanced', max_iter=1000, C=10),
        ),
    ]
)
# Keep the fit call as the last expression so the fitted pipeline is displayed.
clf_ppl.fit(data_train.iloc[:, 0], data_train.iloc[:, 1])




Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=...penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])

Получим предсказания и запишем их в csv

In [17]:
# Predict a label for every test text and attach it to `data_test` as an
# int64 column named `y`.
predicts = pd.DataFrame(clf_ppl.predict(data_test['text']), columns=['y'])
data_test['y'] = predicts.astype('int64')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
data_test.info()
# Write every column except the raw text to CSV, without the index.
# (The former mid-cell `data_res.head()` was removed: its value was never
# displayed because it was not the last expression of the cell.)
data_res = data_test.drop(['text'], axis=1)
data_res.to_csv('results_new_2.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
Id      500 non-null int64
text    500 non-null object
y       500 non-null int64
dtypes: int64(2), object(1)
memory usage: 11.8+ KB
None


Готово. Результат в png.

In [24]:
from gensim.utils  import lemmatize

In [39]:
from nltk.stem.porter import PorterStemmer
# Quick demonstration of the Porter stemmer on two sample words.
porter_stemmer = PorterStemmer()
[porter_stemmer.stem(token) for token in "managed cats".split()]
# Disabled alternative: WordNet lemmatisation instead of stemming.
# from nltk.stem.wordnet import WordNetLemmatizer
# lmtzr = WordNetLemmatizer()
# lmtzr.lemmatize("cats people")

['manag', 'cat']

In [46]:
# Lemmatise every whitespace-separated token of each training text and join
# the tokens back into a single space-separated string.
# `lmtzr` is created here because no earlier active cell defines it; without
# this the cell raises NameError on a fresh kernel.
# NOTE(review): WordNetLemmatizer presumably needs the NLTK "wordnet" corpus
# to be available locally -- confirm it has been downloaded.
from nltk.stem.wordnet import WordNetLemmatizer

lmtzr = WordNetLemmatizer()
X = data_train.iloc[:, 0].apply(
    lambda text: " ".join(map(lmtzr.lemmatize, text.split()))
)

In [51]:
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from nltk.stem.wordnet import WordNetLemmatizer



# TF-IDF (1- to 3-gram) features feeding a class-balanced logistic regression
# whose regularisation strength is chosen by LogisticRegressionCV's internal
# cross-validation, fitted on every training row (column 0 is passed as the
# input, column 1 as the target).
clf_ppl = Pipeline(
    [
        ("vectorizer", TfidfVectorizer(lowercase=True, strip_accents='unicode', ngram_range=(1, 3))),
        (
            "classifier",
            LogisticRegressionCV(
                max_iter=1000,
                class_weight='balanced',
                # Seed the repeated K-fold shuffling: without random_state the
                # 50 x 4 splits differ on every run, so the selected model and
                # the predictions written later could not be reproduced.
                cv=RepeatedKFold(n_splits=4, n_repeats=50, random_state=42),
            ),
        ),
    ]
)
clf_ppl.fit(data_train.iloc[:, 0], data_train.iloc[:, 1])
# Disabled alternative: fit on the lemmatised texts `X` instead of the raw ones.
# clf_ppl.fit(X, data_train.iloc[:,1])

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=...    random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0))])

In [52]:
# Predict a label for every test text and attach it to `data_test` as an
# int64 column named `y`.
predicts = pd.DataFrame(clf_ppl.predict(data_test['text']), columns=['y'])
data_test['y'] = predicts.astype('int64')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
data_test.info()
# Write every column except the raw text to CSV, without the index.
# (The former mid-cell `data_res.head()` was removed: its value was never
# displayed because it was not the last expression of the cell.)
data_res = data_test.drop(['text'], axis=1)
data_res.to_csv('ti_log_7[cv=50-4].csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
Id      500 non-null int64
text    500 non-null object
y       500 non-null int64
dtypes: int64(2), object(1)
memory usage: 11.8+ KB
None
