In [1]:
import pandas as pd
import numpy as np

import re

import nltk
import nltk.data
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn import metrics, preprocessing, pipeline, model_selection, naive_bayes
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
import xgboost as xgb

import time

from keras import backend as K
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import GlobalAveragePooling1D, Conv1D, MaxPooling1D, Flatten
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

In [3]:
pd.set_option('display.max_columns', 200)
train = pd.read_csv('data/train_data.csv')
test = pd.read_csv('data/test_data.csv')

In [5]:
X_train=train['title'].str.replace('[^a-zA-Z0-9]', ' ')
Y_train = LabelEncoder().fit_transform(train['topic_idx'])
y_train=train['topic_idx']
X_test=test['title'].str.replace('[^a-zA-Z0-9]', ' ')

In [14]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 7])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
                  'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.accuracy_score(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_MNB_0"] = pred_train[ : , 0]
train["tfidf_MNB_1"] = pred_train[ : , 1]
train["tfidf_MNB_2"] = pred_train[ : , 2]
train["tfidf_MNB_3"] = pred_train[ : , 3]
train["tfidf_MNB_4"] = pred_train[ : , 4]
train["tfidf_MNB_5"] = pred_train[ : , 5]
train["tfidf_MNB_6"] = pred_train[ : , 6]

test["tfidf_MNB_0"] = pred_full_test[ : , 0]
test["tfidf_MNB_1"] = pred_full_test[ : , 1]
test["tfidf_MNB_2"] = pred_full_test[ : , 2]
test["tfidf_MNB_3"] = pred_full_test[ : , 3]
test["tfidf_MNB_4"] = pred_full_test[ : , 4]
test["tfidf_MNB_5"] = pred_full_test[ : , 5]
test["tfidf_MNB_6"] = pred_full_test[ : , 6]

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2021/07/19 15:33
Fitting 2 folds for each of 4 candidates, totalling 8 fits
	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)


ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets

In [13]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# Final Model
# XGBoost
def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, seed_val=0, child=1, colsample=0.3):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 5
#     param['silent'] = 1
    param['num_class'] = 5
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = child
    param['subsample'] = 0.8
    param['colsample_bytree'] = colsample
    param['seed'] = seed_val
    num_rounds = 2000

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit = model.best_ntree_limit)
    if test_X2 is not None:
        xgtest2 = xgb.DMatrix(test_X2)
        pred_test_y2 = model.predict(xgtest2, ntree_limit = model.best_ntree_limit)
    return pred_test_y, pred_test_y2, model

def do(train, test, Y_train):
    drop_columns=['topic_idx', "title"]
    x_train = train
    x_test = test
    y_train = Y_train
    
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=32143233)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([x_train.shape[0], 5])
    for dev_index, val_index in kf.split(x_train):
        dev_X, val_X = x_train.loc[dev_index], x_train.loc[val_index]
        dev_y, val_y = y_train[dev_index], y_train[val_index]
        pred_val_y, pred_test_y, model = runXGB(dev_X, dev_y, val_X, val_y, x_test, seed_val=0, colsample=0.7)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("cv score : ", cv_scores)
    print("Mean cv score : ", np.mean(cv_scores))
    return pred_full_test/5
result = do(train, test, Y_train)

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2021/07/19 15:25


ValueError: DataFrame.dtypes for data must be int, float, bool or categorical.  When
                categorical type is supplied, DMatrix parameter
                `enable_categorical` must be set to `True`.title