In [70]:
import pandas as pd
import os
import nltk
import tqdm
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords  

from tensorflow import keras
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import plot_model, to_categorical
from sklearn.metrics import log_loss,accuracy_score

from sklearn.model_selection import StratifiedKFold

In [14]:
trn=pd.read_csv('data/train.csv',index_col=0)
tst=pd.read_csv('data/test_x.csv',index_col=0)

In [66]:
trn['author'].value_counts()

3    15063
0    13235
2    11554
4     7805
1     7222
Name: author, dtype: int64

In [22]:
trn.iloc[0].text

'He was almost choking. There was so much, so much he wanted to say, but strange exclamations were all that came from his lips. The Pole gazed fixedly at him, at the bundle of notes in his hand; looked at odin, and was in evident perplexity.'

In [71]:
n_class=5
n_fold=5
seed=32152339
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

# tokenization

In [33]:
trn["token"]=trn["text"].apply(word_tokenize)

# lemmatizaion

In [36]:
lemmatizer=WordNetLemmatizer()

In [54]:
for i in range(len(trn['token'])):
    trn['token'][i]=[lemmatizer.lemmatize(t) for t in trn['token'][i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# TfidfVectorizer로 token화, 불용어제거, 실수형변환

In [59]:
vec=TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'),min_df=50)
X_cnt=vec.fit_transform(trn['text'])
print(X_cnt.shape)

  'stop_words.' % sorted(inconsistent))


(54879, 2911)


In [68]:
X_cnt_tst=vec.fit_transform(tst['text'])
print(X_cnt_tst.shape)

(19617, 2379)


In [61]:
X_cnt.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.18298108,
         0.18845654],
        [0.        , 0.        , 0.        , ..., 0.08014345, 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.20204159,
         0.20808741],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

# 신경망 학습

In [72]:
def get_model():
    model = Sequential()

    model.add(Dense(units=64, activation='relu'))
    model.add(Dense(units=n_class, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam')
    return model

In [78]:
X_trn,X_val,y_trn,y_val = train_test_split(X_cnt,trn['author'],test_size=0.3, random_state=seed)

In [83]:
clf = get_model()

es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5,
                   verbose=1, mode='min', baseline=None, restore_best_weights=True)

rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                        patience=3, min_lr=1e-6, mode='min', verbose=1)

clf.fit(X_trn.todense(), 
        to_categorical(y_trn),
        validation_data=(X_val.todense(), to_categorical(y_val)),
        epochs=2, # 에포크는 일반적으로 10 정도로 둠
        callbacks=[es, rlr])

Train on 38415 samples, validate on 16464 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1aeacb8b0c8>

In [84]:
p_val= clf.predict(X_val.todense())

In [101]:
len(np.argmax(p_val,axis=1))

16464

In [121]:
log_loss(y_val,p_val)

0.7205332337415213

In [119]:
accuracy_score(pd.Series(np.array(y_val)),pd.Series(np.argmax(p_val,axis=1)))

0.7298955296404276