In [1]:
import os
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, log_loss, recall_score, f1_score, classification_report

from tokenizers import BertWordPieceTokenizer, SentencePieceBPETokenizer
from lightgbm import LGBMClassifier

In [2]:
train_df = pd.read_csv('./dataset/train.csv')
test_df = pd.read_csv('./dataset/test.csv')

In [22]:
def clean(x):
    x = x.lower()
    x = re.sub(f'[0-9]', 'N', x)
    x = re.sub(r'(\\n)', ' ', x)
    x = re.sub(r',', ' ', x)
    x = re.sub(r'[^a-zA-Zㄱ-ㅣ가-힣\s]+', '',x)
    x = re.sub(r'N', '', x)
    x = re.sub(r'jan|feb|oct|dec|mar|nov|sep', '', x)
    x = re.sub(r'\s[a-zA-Z]\s', ' ', x)
    x = re.sub(r'\s+', ' ', x)
    return x.strip()

In [23]:
%%time

train_df['clean_log'] = train_df['full_log'].map(lambda x: clean(x))
# test_df['clean_log'] = test_df['full_log'].map(lambda x: clean(x))

CPU times: user 23.4 s, sys: 79.9 ms, total: 23.5 s
Wall time: 23.5 s


In [34]:
new_train = train_df.pivot_table(index='clean_log',
                                 values='id',
                                 columns='level',
                                 aggfunc='count',
                                 fill_value = 0).reset_index()
new_train.columns.name = None
new_train['total'] = np.sum(new_train.iloc[:,1:], axis=1)
new_train.iloc[:,1:-1] = new_train.iloc[:,1:-1].div(new_train.iloc[:,-1], axis=0)
new_train = new_train.drop(['total'], axis=1)

In [46]:
new_train.head()

Unnamed: 0,clean_log,0,1,2,3,4,5,6
0,e proberpminfo rpm db error from dbenvfailchk ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,e proberpminfo rpm rpmdb bdb threadprocess fai...,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,e proberpmverifyfile rpm db error from dbenvfa...,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,e proberpmverifyfile rpm rpmdb bdb threadproce...,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,error cannot open packages database in,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
new_train['new_level'] = np.argmax(new_train.iloc[:,1:].values, axis=1).astype('int')

## Final - Stage

In [59]:
# train_text = train_df['final_log'].to_list()
train_text = new_train['clean_log'].to_list()
train_level = new_train.iloc[:, 1:-1].values
level = new_train.iloc[:, -1]

In [37]:
vectorizer3=CountVectorizer(analyzer="word", 
                            max_features=20000, 
                            ngram_range=(1,1), 
                            min_df=2,
                            dtype=np.float32)

train_features=vectorizer3.fit_transform(train_text)

In [38]:
train_features

<53910x13407 sparse matrix of type '<class 'numpy.float32'>'
	with 2266795 stored elements in Compressed Sparse Row format>

In [96]:
# test_text=test_df['clean_log'].to_list()
# test_features=vectorizer3.transform(test_text)

In [39]:
TEST_SIZE = 0.3
RANDOM_SEED = 42

train_x, eval_x, train_y, eval_y = train_test_split(train_features, train_level,
                                                    stratify = train_level,
                                                    shuffle = True,
                                                    test_size=TEST_SIZE, 
                                                    random_state=RANDOM_SEED)

clf = LogisticRegression(random_state=42,
                         solver='newton-cg',
                         max_iter=1000,
                         n_jobs=-1)

clf.fit(train_x, train_y)
clf_pred = clf.predict_proba(eval_x)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [98]:
print("CV Accuracy score: {:<8.5f}".format(accuracy_score(eval_y, np.argmax(clf_pred, axis=1))))
print("CV Precision score: {:<8.5f}".format(precision_score(eval_y, np.argmax(clf_pred, axis=1), average='macro')))
print("CV Recall score: {:<8.5f}".format(recall_score(eval_y, np.argmax(clf_pred, axis=1), average='macro')))
print("CV F1 score: {:<8.5f}".format(f1_score(eval_y, np.argmax(clf_pred, axis=1), average='macro')))

CV Accuracy score: 0.98696 
CV Precision score: 0.54388 
CV Recall score: 0.53272 
CV F1 score: 0.53798 


  _warn_prf(average, modifier, msg_start, len(result))


In [89]:
print(classification_report(eval_y, np.argmax(clf_pred, axis=1)))

              precision    recall  f1-score   support

           0       0.88      0.89      0.88       823
           1       0.99      0.99      0.99     15000
           2       0.00      0.00      0.00         1
           3       0.96      0.96      0.96       157
           4       0.00      0.00      0.00         1
           5       0.97      0.88      0.92       193
           6       0.00      0.00      0.00         1

    accuracy                           0.99     16176
   macro avg       0.54      0.53      0.54     16176
weighted avg       0.99      0.99      0.99     16176



  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
valid_df = pd.read_csv('./dataset/validation_sample.csv')
valid_df['clean_log'] = valid_df['full_log'].map(lambda x: clean(x))
# valid_df['first_word'] = valid_df['clean_log'].map(lambda x: truncated_string(x, 32))

valid_text=valid_df['clean_log'].to_list()
valid_features=vectorizer3.transform(valid_text)

eval_results = clf.predict_proba(valid_features)

np.set_printoptions(precision=3, suppress=True)
print(eval_results)

[[0.    1.    0.    0.    0.    0.    0.   ]
 [0.995 0.    0.    0.    0.    0.005 0.   ]
 [0.061 0.929 0.001 0.002 0.    0.006 0.002]]


In [27]:
results = clf.predict_proba(test_features)

In [28]:
final=np.argmax(results, axis=1)
final[np.where(np.max(results, axis=1) < 0.9)] = 7

In [29]:
from collections import Counter
Counter(final)

Counter({0: 999679,
         1: 396206,
         3: 12952,
         5: 6429,
         7: 3557,
         2: 34,
         4: 34,
         6: 25})

In [30]:
submission_df = pd.read_csv('./dataset/sample_submission.csv')
submission_df['level'] = final

In [31]:
submission_df.to_csv('./logistic_baseline.csv', index=False)

In [61]:
param = {'boosting_type':'gbdt',
         'max_depth':9, 
         'num_leaves':31,
         'objective': 'multi_logloss',
         'n_estimators':1000, 
         'learning_rate':0.01, 
         'subsample':0.8, 
         'colsample_bytree':0.8,
#          'reg_alpha':0.5, 
#          'reg_lambda':1.0, 
         'random_state':42
        }

In [65]:
level.value_counts()

1    49746
0     3034
5      602
3      520
6        5
4        2
2        1
Name: new_level, dtype: int64

In [64]:
oof_lgb = np.zeros((len(new_train), 7))
lgb_pred = np.zeros((len(test_df), 7))
eval_results = np.zeros((3, 7))

valid_df = pd.read_csv('./dataset/validation_sample.csv')
valid_df['clean_log'] = valid_df['full_log'].map(lambda x: clean(x))
valid_text=valid_df['clean_log'].to_list()
valid_features=vectorizer3.transform(valid_text)


skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

for fold_, (trn_idx, val_idx) in enumerate(skf.split(train_features, level)):
    print("\nfold num_: {}".format(fold_))
    y_train, y_valid = level[trn_idx], level[val_idx]
    X_train, X_valid = train_features[trn_idx], train_features[val_idx]
    
    clf = LGBMClassifier(**param)
    clf.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=50,
            verbose=50)
    
    oof_lgb[val_idx] = clf.predict_proba(X_valid, num_iteration=clf.best_iteration_)   
    eval_results += clf.predict_proba(valid_features, num_iteration=clf.best_iteration_)
#     lgb_pred += clf.predict_proba(test_features, num_iteration=clf.best_iteration_)

eval_results /= 2
# lgb_pred /= 3    
print('\nCross Validation Is Complete')
print("CV Accuracy score: {:<8.5f}".format(accuracy_score(train_level, np.argmax(oof_lgb, axis=1))))
print("CV Precision score: {:<8.5f}".format(precision_score(train_level, np.argmax(oof_lgb, axis=1), average='macro')))
print("CV Recall score: {:<8.5f}".format(recall_score(train_level, np.argmax(oof_lgb, axis=1), average='macro')))
print("CV F1 score: {:<8.5f}".format(f1_score(train_level, np.argmax(oof_lgb, axis=1), average='macro')))




fold num_: 0
Training until validation scores don't improve for 50 rounds
[50]	training's multi_logloss: 0.0884352	valid_1's multi_logloss: 0.0883127
[100]	training's multi_logloss: 0.0446754	valid_1's multi_logloss: 0.0447331
[150]	training's multi_logloss: 0.0280478	valid_1's multi_logloss: 0.0283854
[200]	training's multi_logloss: 0.0208695	valid_1's multi_logloss: 0.0215463
[250]	training's multi_logloss: 0.0175633	valid_1's multi_logloss: 0.0187293
[300]	training's multi_logloss: 0.0159149	valid_1's multi_logloss: 0.017542
[350]	training's multi_logloss: 0.0149195	valid_1's multi_logloss: 0.0170715
[400]	training's multi_logloss: 0.0141598	valid_1's multi_logloss: 0.0170017
Early stopping, best iteration is:
[379]	training's multi_logloss: 0.0144559	valid_1's multi_logloss: 0.0169851

fold num_: 1


ValueError: y contains previously unseen labels: [2]

In [67]:
np.set_printoptions(precision=3, suppress=True)
print(eval_results)

[[0.279 0.721 0.    0.    0.    0.    0.   ]
 [0.08  0.025 0.    0.059 0.    0.837 0.   ]
 [0.931 0.001 0.    0.    0.    0.068 0.   ]]


In [68]:
results=np.argmax(lgb_pred, axis=1)
results[np.where(np.max(lgb_pred, axis=1) < 0.9)] = 7

In [69]:
from collections import Counter

Counter(results)

Counter({0: 997019, 1: 395014, 3: 12882, 5: 6253, 7: 7711, 2: 37})

In [70]:
submission_df = pd.read_csv('./dataset/sample_submission.csv')
submission_df['level'] = results

In [71]:
submission_df.head()

Unnamed: 0,id,level
0,1000000,0
1,1000001,0
2,1000002,1
3,1000003,0
4,1000004,1


In [72]:
submission_df.to_csv('./lgb_final2.csv', index=False)