In [1]:
import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam

from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer

import warnings 
warnings.filterwarnings(action='ignore')

In [2]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK resources one by one (no-op when already present).
for resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords', 'wordnet'):
    nltk.download(resource)

# English stopword list used to filter tokens later in the notebook.
stopwords_list = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the labelled training set and the unlabelled test set.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
def check_missing_col(df):
    """Report every column of ``df`` that contains missing values.

    Prints the column name and NaN count for each affected column, or a
    single "no missing values" message when every column is complete.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        A list of ``[column_name, dtype]`` pairs, one per column holding at
        least one missing value; an empty list when nothing is missing.
    """
    missing_col = []
    for col in df.columns:
        missing_values = int(df[col].isna().sum())
        if missing_values >= 1:
            print(f'결측치가 있는 컬럼은: {col}입니다')
            print(f'해당 컬럼에 총 {missing_values}개의 결측치가 존재합니다')
            missing_col.append([col, df[col].dtype])
    # Summarise and return only after *all* columns were scanned; doing this
    # inside the loop ended the scan after the very first column.
    if not missing_col:
        print('결측치가 존재하지 않습니다')
    return missing_col

In [5]:
def clean_text(texts):
    """Normalise raw documents before tokenisation.

    For every document: strip HTML tags, delete the punctuation/symbol
    characters listed in the pattern below (including underscores and
    newlines), delete digits, lower-case, collapse whitespace runs to a
    single space and trim both ends.

    Args:
        texts: iterable of strings (e.g. a list or a pandas Series). Items
            are read by iteration, so a Series with any index works.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    # Tags must be removed *before* punctuation: the punctuation pattern
    # deletes '>' and '/', after which no tag could be recognised any more.
    tag_re = re.compile(r'<[^>]+>')
    punct_re = re.compile(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"\n\]\[\>]')
    digit_re = re.compile(r'\d+')
    space_re = re.compile(r'\s+')

    corpus = []
    for text in texts:
        review = tag_re.sub('', text)       # HTML tags
        review = punct_re.sub('', review)   # punctuation such as @%*=()/+
        review = digit_re.sub('', review)   # digits
        review = review.lower()
        # Collapse whitespace runs, then drop leading/trailing blanks.
        review = space_re.sub(' ', review).strip()
        corpus.append(review)
    return corpus

In [6]:
# English stopword list (same assignment as in the NLTK setup cell above).
stopwords_list = stopwords.words('english')
# WordNet lemmatizer instance. NOTE(review): not referenced by any later cell
# visible in this notebook - confirm whether lemmatisation was intended.
wlem = nltk.WordNetLemmatizer()

In [7]:
# Replace the raw 'text' column of both frames with its cleaned version.
for frame in (train, test):
    frame['text'] = clean_text(frame['text'])

In [8]:
# Tokenise each cleaned training document and drop English stopwords.
# A set gives O(1) membership tests; scanning the stopword list once per
# token made this cell needlessly slow on the full corpus.
stopword_set = set(stopwords_list)
train['tokenized'] = train['text'].apply(
    lambda text: [token for token in nltk.word_tokenize(text) if token not in stopword_set]
)

KeyboardInterrupt: ignored

In [None]:
# Tokenise each cleaned test document and drop English stopwords.
# A set gives O(1) membership tests instead of a list scan per token.
stopword_set = set(stopwords_list)
test['tokenized'] = test['text'].apply(
    lambda text: [token for token in nltk.word_tokenize(text) if token not in stopword_set]
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled rows; the seed is fixed for reproducibility.
train_data, test_data = train_test_split(train, test_size=0.25, random_state=42)

# Report the size of each part.
for label, frame in (("훈련용 리뷰의 개수 : ", train_data), ("테스트용 리뷰의 개수 : ", test_data)):
    print(label, len(frame))

In [None]:
# Token lists (features) and class labels as NumPy arrays for each split.
X_train, y_train = train_data['tokenized'].values, train_data['target'].values
X_test, y_test = test_data['tokenized'].values, test_data['target'].values

In [None]:
# First pass: fit an unrestricted tokenizer on the training tokens so that
# word frequencies can be inspected in the next cell.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Vocabulary statistics: how many words occur fewer than `threshold` times
# and what share of all counted occurrences they account for.
threshold = 2
total_cnt = len(tokenizer.word_index)  # number of distinct words

word_freqs = list(tokenizer.word_counts.values())
rare_freqs = [freq for freq in word_freqs if freq < threshold]

total_freq = sum(word_freqs)  # sum of all word counts
rare_cnt = len(rare_freqs)    # number of distinct rare words
rare_freq = sum(rare_freqs)   # summed counts of the rare words

# Label typo fixed ("크키" -> "크기") to match the vocabulary-size print in the next cell.
print("단어 집합(vocabulary)의 크기 : ", total_cnt)
print("등장 빈도가 %s번 이하인 희귀 단어의 수: %s"%(threshold - 1, rare_cnt))
print("단어 집합에서 희귀 단어의 비율: ", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율: ", (rare_freq / total_freq)*100)

In [None]:
# Vocabulary size after discarding rare words.
# NOTE(review): the +2 presumably reserves index 0 (padding) and the OOV
# token used by the tokenizer below - confirm.
vocab_size = total_cnt - rare_cnt + 2
print('단어 집합의 크기 : ', vocab_size)

In [None]:
# Second pass: cap the vocabulary at `vocab_size` and map every other word
# to the 'OOV' token, then re-fit on the training tokens.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert token lists to integer id sequences with the fitted tokenizer.
# Note: rebinds X_train / X_test, so this cell must not be run twice without
# re-running the cell that creates them from the DataFrame.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# Padding: inspect the sequence-length distribution to choose a maximum length.
seq_lengths = [len(words) for words in X_train]  # computed once, reused below
print('리뷰의 최대 길이 :', max(seq_lengths))
print('리뷰의 평균 길이 :', sum(seq_lengths)/len(seq_lengths))
plt.hist(seq_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')  # label typo fixed (was 'number if samples')
plt.show()

In [None]:
def below_threshold_len(max_len, nested_list):
  """Print the percentage of sequences whose length is at most ``max_len``.

  Args:
    max_len: length limit (inclusive).
    nested_list: non-empty collection of sequences.

  Returns:
    None; the percentage is only printed.
  """
  within_limit = sum(1 for sentence in nested_list if len(sentence) <= max_len)
  ratio = (within_limit / len(nested_list))*100
  print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, ratio))

In [None]:
# Sequence length used for padding/truncation; the call prints the share of
# training sequences that already fit within it.
max_len = 1000
below_threshold_len(max_len, X_train)

In [None]:
# Pad/truncate every id sequence to exactly `max_len` entries.
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [None]:
# One-hot encode the integer class labels for categorical cross-entropy.
# Uses `to_categorical` from tensorflow.keras.utils (imported in the first
# cell); `keras.utils.np_utils` is not available in recent Keras releases.
# num_classes is pinned to the model's 20 output units so the label width
# never depends on which classes happen to appear in this split.
onehot_Y_train = to_categorical(y_train, num_classes=20)

In [None]:
# Output dimension passed to the Embedding layer of the model below.
# (vocab_size and max_len are computed/set in earlier cells.)
embedding_dim = 100

In [None]:
# Embedding -> three stacked bidirectional LSTMs -> softmax classifier.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
# The first two recurrent layers return full sequences so the next LSTM can
# consume them; the last one returns only its final state.
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(Bidirectional(LSTM(units=64)))
model.add(Dense(20, activation='softmax'))  # targets are 0-19, hence Dense(20)

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# Train the model; the last 20% of the training arrays is used for validation.
# NOTE(review): runs all 50 epochs with no callbacks, although EarlyStopping
# and ReduceLROnPlateau are imported - confirm whether they were meant to be used.
history3 = model.fit(X_train, onehot_Y_train, epochs=50, batch_size=100, validation_split=0.2)

In [None]:
# Encode the tokenised test documents with the fitted tokenizer, then pad them
# like the training data. `test` itself is a DataFrame, not a list of id
# sequences, so it cannot be passed to pad_sequences directly.
test_sequences = tokenizer.texts_to_sequences(test['tokenized'].values)
pad_test = pad_sequences(test_sequences, maxlen=max_len)

In [None]:
# Predict with the trained model on the padded test sequences
# (one row per document; the final layer is a 20-unit softmax).
test_pred = model.predict(pad_test)

In [None]:
# model.predict returns a NumPy array, which has no `to_csv`; wrap it in a
# DataFrame (one column per output unit) before writing.
# NOTE(review): if hard labels are required instead of per-class scores,
# write test_pred.argmax(axis=1) - confirm the expected submission format.
pd.DataFrame(test_pred).to_csv('test_pred.csv', index=False)

In [None]:
# Inspect the training curves: loss on the left, accuracy on the right.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 4))
# A figure-level title; calling plt.title before the first subplot would
# create an extra, empty axes behind the two panels.
fig.suptitle('Model 3', fontsize=15)

ax_loss.set_title('loss of Model3', fontsize=15)
ax_loss.plot(history3.history['loss'], 'b-', label='loss')
ax_loss.plot(history3.history['val_loss'], 'r--', label='val_loss')
ax_loss.set_xlabel('Epoch')
ax_loss.legend()

ax_acc.set_title('accuracy of Model3', fontsize=15)
ax_acc.plot(history3.history['accuracy'], 'g-', label='accuracy')
ax_acc.plot(history3.history['val_accuracy'], 'k--', label='val_accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.legend()

# Leave room at the top for the figure title.
fig.tight_layout(rect=(0, 0, 1, 0.92))
plt.show()  # was `plt.show` without parentheses, i.e. never called

In [54]:
def plot_doc_lengths(df):
  """Plot the distribution of document lengths and print the extremes.

  Draws a histogram + KDE of ``df['doc_len']`` with a dashed vertical line at
  the rounded mean, then prints the longest and shortest length.

  Args:
    df: DataFrame with a numeric ``doc_len`` column.

  Returns:
    None.
  """
  doc_lens = df['doc_len']
  mean_seq_len = np.round(doc_lens.mean()).astype(int)
  # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
  # consider sns.histplot(..., kde=True) once the seaborn version is confirmed.
  sns.distplot(tuple(doc_lens), hist=True, kde=True, label='lengths')
  plt.axvline(x=mean_seq_len, color='k', linestyle='--', label=f'mean:{mean_seq_len}')
  plt.title('lengths')
  plt.legend()
  plt.show()
  # Report on the frame that was passed in, not on the global `train`
  # (which never receives a 'doc_len' column in this notebook).
  print(f" 가장 긴 문장은 {doc_lens.max()} 개의 단어, 가장 짧은 문장은 {doc_lens.min()} 개의 단어")

In [None]:
# Word count of every cleaned training document (whitespace-separated),
# followed by the length-distribution plot.
train_data['doc_len'] = [len(text.split()) for text in train_data['text']]
plot_doc_lengths(train_data)

In [59]:
# Word count per cleaned document. NOTE(review): identical to the assignment
# in the cell above; this cell looks like a leftover and can likely be removed.
train_data['doc_len'] = train_data['text'].apply(lambda x:len(x.split()))

In [None]:
# Preview only the first rows instead of rendering the whole frame.
train_data.head()

In [None]:
# Plot the document-length distribution of the training split
# (requires the 'doc_len' column computed above).
plot_doc_lengths(train_data)