In [None]:
from sklearn.datasets import fetch_20newsgroups
from collections import Counter

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the predefined train and test partitions of the 20 Newsgroups corpus.
train = fetch_20newsgroups(subset="train")
test = fetch_20newsgroups(subset="test")

In [None]:
# Seeded, shuffled 3-fold stratified splitter reused by every grid search below.
skf = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=1,
)

### Using LogisticRegression

In [None]:
# best_score = 0.8634429567463328 with LogisticRegression(C=1)

# Raw token counts feeding a logistic-regression classifier.
# NOTE: the regularisation keyword must be the Latin letter "C". A look-alike
# Cyrillic letter is a different identifier, and LogisticRegression rejects it
# with a TypeError (unexpected keyword argument).
pipeline_cv = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', LogisticRegression(C=1)),
])

In [None]:
# best_score = 0.9079011664524309 with LogisticRegression(C=10)

# TF-IDF weighted tokens feeding a logistic-regression classifier.
# NOTE: the regularisation keyword must be the Latin letter "C". A look-alike
# Cyrillic letter is a different identifier, and LogisticRegression rejects it
# with a TypeError (unexpected keyword argument).
pipeline_tv = Pipeline([
    ('bow', TfidfVectorizer()),
    ('clf', LogisticRegression(C=10)),
])

In [None]:
# Candidate inverse-regularisation strengths for the 'clf' step of the
# count-based pipeline, scored by accuracy over the shared folds.
params = {"clf__C": [10, 1, 0.1, 0.01]}
grid_search_cv = GridSearchCV(
    estimator=pipeline_cv,
    param_grid=params,
    scoring="accuracy",
    cv=skf,
    n_jobs=-1,
)

In [None]:
# Same grid of 'clf' regularisation strengths, applied to the TF-IDF pipeline.
params = {"clf__C": [10, 1, 0.1, 0.01]}
grid_search_tv = GridSearchCV(
    estimator=pipeline_tv,
    param_grid=params,
    scoring="accuracy",
    cv=skf,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search for the count-based pipeline on the training split.
grid_search_cv.fit(X=train["data"], y=train["target"])

In [None]:
# Run the cross-validated search for the TF-IDF pipeline on the training split.
grid_search_tv.fit(X=train["data"], y=train["target"])

In [None]:
# Best cross-validation score and the corresponding estimator (count features).
(grid_search_cv.best_score_, grid_search_cv.best_estimator_)

In [None]:
# Best cross-validation score and the corresponding estimator (TF-IDF features).
(grid_search_tv.best_score_, grid_search_tv.best_estimator_)

In [None]:
# Fit the count-based pipeline on the whole training split, predict the test
# split, and report plain accuracy as the cell output.
predictions = pipeline_cv.fit(train["data"], train["target"]).predict(test["data"])
accuracy_score(test["target"], predictions)

In [None]:
# Fit the TF-IDF pipeline on the whole training split, predict the test split,
# and report plain accuracy as the cell output.
predictions = pipeline_tv.fit(train["data"], train["target"]).predict(test["data"])
accuracy_score(test["target"], predictions)

In [None]:
# Per-class precision / recall / F1 for whatever `predictions` currently holds.
print(
    classification_report(
        y_true=test["target"],
        y_pred=predictions,
        target_names=test["target_names"],
    )
)

### Using XGBoost

In [None]:
import numpy as np 
# import pandas as pd 
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy


# Hold out 20% of the training documents as a validation set (seeded split).
train_mes, valid_mes, train_l, valid_l = train_test_split(
    train['data'],
    train['target'],
    test_size=0.2,
    random_state=2,
)

# Word uni- and bi-gram TF-IDF features. The on/off options are passed as real
# booleans: scikit-learn releases with parameter validation reject integers
# such as 1 for `use_idf`, `smooth_idf` and `sublinear_tf`.
tv = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Learn the vocabulary and IDF weights from the training portion only, so no
# statistics from the validation documents leak into the features they are
# later scored with.
transform_com = tv.fit(train_mes)

comments_train = transform_com.transform(train_mes)
comments_valid = transform_com.transform(valid_mes)
comments_test = transform_com.transform(test['data'])

In [None]:
import xgboost as xgb

def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500,
           num_class=None):
    """Train a multi-class XGBoost booster (``multi:softmax``) and return it.

    Parameters
    ----------
    train_X, train_y
        Training features and their class labels, passed to ``xgb.DMatrix``.
    test_X, test_y
        Evaluation features and labels. When ``test_y`` is given, the pair is
        added to the watchlist and training uses early stopping (20 rounds
        without improvement). When ``test_y`` is ``None``, ``test_X`` is
        ignored and training runs for the full ``num_rounds``.
    feature_names
        Accepted for backward compatibility; not used.
    seed_val
        Value stored under the ``seed`` training parameter.
    num_rounds
        Maximum number of boosting rounds.
    num_class
        Number of target classes. When ``None`` (default) it is inferred as
        ``max(label) + 1`` over the supplied labels, which assumes labels are
        0-based class indices.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    if num_class is None:
        # A hard-coded class count larger than the real one makes every
        # boosting round fit trees for classes that never occur, so derive the
        # count from the labels instead.
        highest_label = max(train_y)
        if test_y is not None:
            highest_label = max(highest_label, max(test_y))
        num_class = int(highest_label) + 1

    param = {
        'objective': 'multi:softmax',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'eval_metric': 'mlogloss',
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
        'num_class': num_class,
    }

    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is None:
        # No labelled evaluation set: nothing to monitor, train all rounds.
        return xgb.train(param, xgtrain, num_rounds)

    xgtest = xgb.DMatrix(test_X, label=test_y)
    # The last watchlist entry is the one early stopping monitors.
    watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
    return xgb.train(param, xgtrain, num_rounds, evals=watchlist, early_stopping_rounds=20)

In [None]:
train = [np.int64(v) for v in train_l]
valid = [np.int64(v) for v in valid_l]

In [None]:
model = runXGB(comments_train, train, comments_valid, valid)
preds= model.predict(xgb.DMatrix(comments_test), ntree_limit = model.best_ntree_limit)