In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import xgboost as xgb
from sklearn.metrics import accuracy_score
from transformers.dummies_encoder import DummiesEncoder
from transformers.item_selector import ItemSelector
from transformers.morphology_extractor import MorphologyExtractor
from transformers.dict_class_transformer import DictClassTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from transformers.multi_label_encoder import MultiLabelEncoder
from transformers.reshape_2d import Reshape2D
from transformers.string_splitter import StringSplitter
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
import seaborn as sns

# Root input directory; every other path in this cell is derived from it.
INPUT_PATH = '../input/norm_challenge_ru'
# The 'ru_with_types' sub-directory of the input directory.
DATA_INPUT_PATH = INPUT_PATH + '/ru_with_types'
# SUBM_PATH is an alias of the input directory.
SUBM_PATH = INPUT_PATH

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
class          100000 non-null object
before         100000 non-null object
before_prev    100000 non-null object
before_next    100000 non-null object
before_len     100000 non-null int64
before_wc      100000 non-null int64
dtypes: int64(2), object(4)
memory usage: 4.6+ MB
None


In [3]:
# Number of leading rows of ru_train.csv used for this experiment.
N_ROWS = 100000

# Load only the columns and rows that are needed.
# - nrows stops the parser early instead of reading the whole file and
#   truncating afterwards with head().
# - keep_default_na=False keeps literal tokens such as "NA" or "null" as
#   text instead of silently converting them to NaN; empty fields are read
#   as '' (which is what the later fillna('') used to produce).
df = pd.read_csv(os.path.join(INPUT_PATH, 'ru_train.csv'),
                 encoding='utf-8',
                 index_col=False,
                 usecols=['before', 'class'],
                 nrows=N_ROWS,
                 keep_default_na=False)

# Neighbouring tokens by row position. shift() leaves NaN in the first/last
# row, so fill just those columns with '' instead of blanket-filling the
# whole frame (which would also touch the numeric feature columns).
# NOTE(review): the shift ignores any sentence boundaries; the sentence id
# column is not loaded here -- confirm whether cross-sentence context is
# intended.
df['before_prev'] = df['before'].shift(1).fillna('')
df['before_next'] = df['before'].shift(-1).fillna('')

# Length in characters and number of whitespace-separated words of the token.
df['before_len'] = df['before'].str.len()
df['before_wc'] = df['before'].str.split().str.len()

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that is what produced the stray "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
class          100000 non-null object
before         100000 non-null object
before_prev    100000 non-null object
before_next    100000 non-null object
before_len     100000 non-null int64
before_wc      100000 non-null int64
dtypes: int64(2), object(4)
memory usage: 4.6+ MB
None


In [8]:
def _text_branches(column, suffix=''):
    """Build the two FeatureUnion branches used for one text column.

    Returns a list with a 'chars<suffix>' branch (ItemSelector ->
    StringSplitter) and a 'context<suffix>' branch (ItemSelector ->
    MorphologyExtractor -> DummiesEncoder), both reading ``column``.
    The transformers are project-local; their exact output is not visible
    from this notebook.
    """
    return [
        ('chars' + suffix, Pipeline([
            ('select', ItemSelector(column)),
            ('split', StringSplitter()),
        ])),
        ('context' + suffix, Pipeline([
            ('select', ItemSelector(column)),
            ('extract', MorphologyExtractor()),
            ('one_hot', DummiesEncoder()),
        ])),
    ]


def _numeric_branch(name, column):
    """Build a FeatureUnion branch: ItemSelector(column) -> Reshape2D."""
    return (name, Pipeline([
        ('select', ItemSelector(column)),
        ('reshape', Reshape2D()),
    ]))


# Branch order matters: FeatureUnion concatenates outputs in list order.
feature_branches = (
    _text_branches('before')
    + _text_branches('before_prev', '_prev')
    + _text_branches('before_next', '_next')
    + [
        _numeric_branch('length', 'before_len'),
        _numeric_branch('words_count', 'before_wc'),
    ]
)

pipeline = Pipeline([
    ('features', FeatureUnion(feature_branches, n_jobs=1)),
])

# Fit the feature pipeline on every column except the target.
x_data = pipeline.fit_transform(df.drop(['class'], axis=1))

# Integer-encode the target; `labels` holds the unique class values indexed
# by code, so labels[code] recovers the original class name.
y_data, labels = pd.factorize(df['class'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
class          100000 non-null object
before         100000 non-null object
before_prev    100000 non-null object
before_next    100000 non-null object
before_len     100000 non-null int64
before_wc      100000 non-null int64
dtypes: int64(2), object(4)
memory usage: 4.6+ MB
None


In [9]:
# Hold out 10% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.1,
    random_state=2017,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
class          100000 non-null object
before         100000 non-null object
before_prev    100000 non-null object
before_next    100000 non-null object
before_len     100000 non-null int64
before_wc      100000 non-null int64
dtypes: int64(2), object(4)
memory usage: 4.6+ MB
None


In [10]:
# Grid-search the n_estimators parameter of XGBClassifier with 10-fold
# stratified cross-validation, scored by accuracy.
model = xgb.XGBClassifier()
n_estimators = list(range(50, 400, 50))
param_grid = dict(n_estimators=n_estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2017)
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold, verbose=1)
grid_result = grid_search.fit(x_train, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# plot: mean CV accuracy per candidate with +/- one std error bars.
# A dedicated figure is created and closed so that later plots in the same
# kernel do not draw on top of this one before it is saved.
fig, ax = pyplot.subplots()
ax.errorbar(n_estimators, means, yerr=stds)
# The scored metric is accuracy, so the title must say so (it used to read
# "Log Loss", contradicting both the scorer and the y-axis label).
ax.set_title("XGBoost n_estimators vs Accuracy")
ax.set_xlabel('n_estimators')
ax.set_ylabel('Accuracy')
fig.savefig('n_estimators.png')
pyplot.close(fig)

Fitting 10 folds for each of 7 candidates, totalling 70 fits




KeyboardInterrupt: 

In [None]:
# Grid-search the max_depth parameter of XGBClassifier with 10-fold
# stratified cross-validation, scored by accuracy.
# Only the `xgb` module alias is imported in this notebook, so the
# classifier has to be referenced through it.
model = xgb.XGBClassifier()
max_depth = list(range(1, 11, 2))
print(max_depth)
param_grid = dict(max_depth=max_depth)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2017)
# Scorer names are case-sensitive: it is "accuracy", not "Accuracy".
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold, verbose=1)
# Fit on the training split produced earlier in the notebook
# (`X` / `label_encoded_y` are not defined anywhere in it).
grid_result = grid_search.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot: mean CV accuracy per candidate with +/- one std error bars.
# Use a dedicated figure so this image does not contain curves drawn by
# earlier cells on the implicit current figure.
fig, ax = pyplot.subplots()
ax.errorbar(max_depth, means, yerr=stds)
# The plotted metric is accuracy, not log loss.
ax.set_title("XGBoost max_depth vs Accuracy")
ax.set_xlabel('max_depth')
ax.set_ylabel('Accuracy')
fig.savefig('max_depth.png')
pyplot.close(fig)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
class          100000 non-null object
before         100000 non-null object
before_prev    100000 non-null object
before_next    100000 non-null object
before_len     100000 non-null int64
before_wc      100000 non-null int64
dtypes: int64(2), object(4)
memory usage: 4.6+ MB
None
