In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

import re
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import os

In [3]:
# Load the labelled training split and the unlabelled test split from CSV.
train = pd.read_csv("data/train.csv", encoding="utf-8")
test = pd.read_csv("data/test_x.csv", encoding="utf-8")

In [4]:
train

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [5]:
test

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...
...,...,...
19612,19612,"At the end of another day or two, odin growing..."
19613,19613,"All afternoon we sat together, mostly in silen..."
19614,19614,"odin, having carried his thanks to odin, proc..."
19615,19615,"Soon after this, upon odin's leaving the room,..."


In [6]:
# Remove punctuation
def alpha_num(text):
    """Return ``text`` with every non-alphanumeric character turned into a space.

    Only ASCII letters (A-Z, a-z) and digits (0-9) are kept; anything else,
    including existing whitespace, apostrophes and typographic quotes, is
    replaced one-for-one by a single space, so the length of the string is
    unchanged.
    """
    non_alphanumeric = re.compile(r"[^A-Za-z0-9]")
    return non_alphanumeric.sub(' ', text)
                  
# Strip punctuation from the training text so the result can be previewed in
# the next cell. The same cleaning is applied again, after lower-casing, to
# both train and test in the preprocessing cell further down.
train['text']=train['text'].apply(alpha_num)

In [7]:
train['text']

0        He was almost choking  There was so much  so m...
1                    Your sister asked for it  I suppose  
2         She was engaged one day as she walked  in per...
3        The captain was in the porch  keeping himself ...
4         Have mercy  gentlemen   odin flung up his han...
                               ...                        
54874     Is that you  Mr  Smith   odin whispered   I h...
54875    I told my plan to the captain  and between us ...
54876      Your sincere well wisher  friend  and sister...
54877               Then you wanted me to lend you money  
54878    It certainly had not occurred to me before  bu...
Name: text, Length: 54879, dtype: object

In [8]:
# Remove stopwords
def remove_stopwords(text):
    """Return ``text`` with all stopwords dropped.

    The text is split on whitespace, each token is compared
    case-insensitively against the stopword collection, and the surviving
    tokens (in their original casing) are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Input sentence or passage.

    Returns
    -------
    str
        The filtered text; an empty string if nothing survives.
    """
    # str.split() with no separator never yields tokens with surrounding
    # whitespace, so no extra strip() is needed.
    kept = [token for token in text.split() if token.lower() not in _STOPWORD_SET]
    return " ".join(kept)

# NOTE(review): entries containing an apostrophe ("he'd", "it's", ...) can only
# match text that still has its apostrophes. In this notebook alpha_num() is
# applied first and turns apostrophes into spaces, so those entries never match
# and fragments such as "s" stay in the text -- confirm whether that is intended.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could",
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has",
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours",
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that",
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Hash-based copy of the list used by remove_stopwords() so each token lookup
# is O(1) instead of a linear scan. Re-run this cell if `stopwords` is edited.
_STOPWORD_SET = frozenset(stopwords)

In [9]:
# Apply the same cleaning pipeline to both splits:
# lower-case -> strip punctuation -> drop stopwords.
train['text'] = (
    train['text']
    .str.lower()
    .apply(alpha_num)
    .apply(remove_stopwords)
)
test['text'] = (
    test['text']
    .str.lower()
    .apply(alpha_num)
    .apply(remove_stopwords)
)

In [10]:
# Materialise the cleaned text columns and the author labels as NumPy arrays.
X_train = np.array(train['text'].tolist())
X_test = np.array(test['text'].tolist())
y_train = np.array(train['author'].tolist())

In [11]:
X_train

array(['almost choking much much wanted say strange exclamations came lips pole gazed fixedly bundle notes hand looked odin evident perplexity',
       'sister asked suppose',
       'engaged one day walked perusing jane s last letter dwelling passages proved jane not written spirits instead surprised mr odin saw looking odin meeting putting away letter immediately forcing smile said',
       ..., 'sincere well wisher friend sister lucy odin',
       'wanted lend money', 'certainly not occurred said yes like'],
      dtype='<U1429')

In [12]:
# Modeling hyper-parameters shared by the tokenizer, padding and model cells.
vocab_size = 10000  # passed as num_words to the Tokenizer and as the Embedding input size
embedding_dim = 10  # width of each embedding vector
max_length = 100  # maxlen for pad_sequences and input_length of the Embedding layer
padding_type='post'  # padding mode handed to pad_sequences
# Out-of-vocabulary token is currently disabled:
# oov_tok = "<OOV>"

In [13]:
# Fit the tokenizer on the training texts only; the oov_token argument is
# commented out, so no out-of-vocabulary token is configured.
tokenizer = Tokenizer(num_words = vocab_size)#, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
# Word -> integer id mapping learned from the training texts.
word_index = tokenizer.word_index

In [14]:
# Convert the texts to integer sequences with the fitted tokenizer ...
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# ... then pad them to a common length of max_length.
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type)

In [15]:
# Build a lightweight NLP model: embedding -> average pooling -> small dense
# head -> 5-way softmax.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(5, activation='softmax'))

In [16]:
# Integer class labels, so use the sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 10)           100000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 10)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                264       
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 125       
Total params: 100,389
Trainable params: 100,389
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
# Train for a fixed number of epochs, holding out 20% of the training data
# for validation.
num_epochs = 20
history = model.fit(
    x=train_padded,
    y=y_train,
    validation_split=0.2,
    epochs=num_epochs,
    verbose=2,
)

Epoch 1/20
1372/1372 - 1s - loss: 1.4857 - accuracy: 0.3514 - val_loss: 1.2735 - val_accuracy: 0.5517
Epoch 2/20
1372/1372 - 1s - loss: 1.0673 - accuracy: 0.6006 - val_loss: 0.9626 - val_accuracy: 0.6325
Epoch 3/20
1372/1372 - 1s - loss: 0.8552 - accuracy: 0.6808 - val_loss: 0.8585 - val_accuracy: 0.6772
Epoch 4/20
1372/1372 - 1s - loss: 0.7511 - accuracy: 0.7214 - val_loss: 0.8102 - val_accuracy: 0.6985
Epoch 5/20
1372/1372 - 1s - loss: 0.6799 - accuracy: 0.7502 - val_loss: 0.7793 - val_accuracy: 0.7128
Epoch 6/20
1372/1372 - 1s - loss: 0.6243 - accuracy: 0.7713 - val_loss: 0.7787 - val_accuracy: 0.7149
Epoch 7/20
1372/1372 - 1s - loss: 0.5792 - accuracy: 0.7919 - val_loss: 0.7461 - val_accuracy: 0.7240
Epoch 8/20
1372/1372 - 1s - loss: 0.5392 - accuracy: 0.8055 - val_loss: 0.7402 - val_accuracy: 0.7293
Epoch 9/20
1372/1372 - 1s - loss: 0.5058 - accuracy: 0.8182 - val_loss: 0.7451 - val_accuracy: 0.7267
Epoch 10/20
1372/1372 - 1s - loss: 0.4765 - accuracy: 0.8268 - val_loss: 0.7683 - 

In [None]:
def predict_prob(number):
    """Return the per-class probabilities for one prediction row.

    Parameters
    ----------
    number : sequence of float
        One row of model output.

    Returns
    -------
    list
        * If the row holds a single value ``p`` (one-unit binary output),
          ``[p, 1 - p]`` is returned, as before.
        * Otherwise every class probability is returned unchanged. The
          previous implementation kept only the first entry and ``1 - first``,
          which discarded the remaining classes of a multi-class softmax
          output such as the 5-unit output layer used in this notebook.
    """
    if len(number) == 1:
        return [number[0], 1 - number[0]]
    return list(number)
    
# Run the model on the padded test sequences and post-process each output row.
y_prob = np.array([predict_prob(row) for row in model.predict(test_padded)])
y_prob