In [1]:
import pandas
import konlpy
import gensim

# Show the installed version of each library, one per line.
print(pandas.__version__, konlpy.__version__, gensim.__version__, sep='\n')

1.3.3
0.5.2
4.1.2


In [39]:
import pandas as pd

# Read the train/test rating tables and discard the 'id' column from each.
train_data = (
    pd.read_table('~/aiffel/sentiment_classification/data/ratings_train.txt')
    .drop(columns=['id'])
)
test_data = (
    pd.read_table('~/aiffel/sentiment_classification/data/ratings_test.txt')
    .drop(columns=['id'])
)
# Preview the first rows of the training table.
train_data.head()

Unnamed: 0,document,label
0,아 더빙.. 진짜 짜증나네요 목소리,0
1,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,너무재밓었다그래서보는것을추천한다,0
3,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [40]:
# Remove rows with NaN (missing) values
def get_null_index(df):
    """Return the index labels of every row of ``df`` that has a null cell.

    Also prints how many such rows were found.
    """
    has_missing = df.isnull().any(axis=1)  # True for rows with at least one null
    null_index = df[has_missing].index
    print(f'Number of null in data: {len(null_index)}')
    return null_index

# Find the rows with missing values in each split (the helper prints the count).
print('Train')
train_null_index = get_null_index(train_data)
print('Test')
test_null_index = get_null_index(test_data)

# Drop those rows by index label from both splits.
train_data = train_data.drop(train_null_index, axis='index')
test_data = test_data.drop(test_null_index, axis='index')

# Report the remaining (rows, columns) of each split.
print(f'Train: {train_data.shape}, Test: {test_data.shape}')

Train
Number of null in data: 5
Test
Number of null in data: 3
Train: (149995, 2), Test: (49997, 2)


In [41]:
# Remove duplicated rows from the data
print('Before remove:')
print(f'Train duplicated data: {train_data.duplicated().sum()}')
print(f'Test duplicated data: {test_data.duplicated().sum()}')

# Keep the first occurrence of each row and renumber the index from 0.
train_data = train_data.drop_duplicates().reset_index(drop=True)
test_data = test_data.drop_duplicates().reset_index(drop=True)

print('After remove:')
print(f'Train duplicated data: {train_data.duplicated().sum()}')
print(f'Test duplicated data: {test_data.duplicated().sum()}')


Before remove:
Train duplicated data: 3656
Test duplicated data: 793
After remove:
Train duplicated data: 0
Test duplicated data: 0


In [42]:
from konlpy.tag import Mecab
import numpy as np
from collections import Counter

# Sample sentence used for a quick check of the analyzer.
txt = '영화 재밌네요'
tokenizer = Mecab()
# Tokenize the sample into morphemes
print(tokenizer.morphs(txt))

['영화', '재밌', '네요']


In [43]:
# tokenization
def load_data_with_tokenization(df, tokenize=None):
    """Split a review table into tokenized documents and a label array.

    Args:
        df: DataFrame with a 'document' column and a 'label' column.
        tokenize: Optional callable applied to each document; its return
            value becomes that document's entry in ``X``. When omitted, the
            notebook-level ``tokenizer.morphs`` is used.

    Returns:
        A tuple ``(X, y)``: ``X`` is a Series (same index as ``df``) holding
        the tokenizer result for every document, and ``y`` is a NumPy array
        of the 'label' column.
    """
    if tokenize is None:
        # Look the default up at call time so the function can be defined
        # before the global tokenizer exists.
        tokenize = tokenizer.morphs
    X = df['document'].apply(tokenize)
    y = df['label'].to_numpy()
    return X, y

x_train, y_train = load_data_with_tokenization(train_data)
x_test, y_test = load_data_with_tokenization(test_data)

# create vocabulary
# The reserved special tokens occupy indices 0 and 1.
VOCAB2INDEX = {'[PAD]': 0, '[UNK]': 1}

# Collect every distinct token seen in the training documents.
unique_tokens = set()
for s in x_train:
    unique_tokens.update(s)
print(f'Number of unique tokens: {len(unique_tokens)}')

# Assign indices in sorted order: iterating the raw set would give a different
# token -> index mapping in every interpreter session (string hashes are
# randomized), so encoded data could not be reproduced after a kernel restart.
# Reserved tokens are skipped so a corpus token with the same spelling cannot
# overwrite index 0/1 and break the INDEX2VOCAB alignment.
for index, token in enumerate(sorted(unique_tokens.difference(VOCAB2INDEX)), 2):
    VOCAB2INDEX[token] = index

# Dicts keep insertion order, so position i in this list is the token with index i.
INDEX2VOCAB = list(VOCAB2INDEX)

Number of unique tokens: 53979


In [None]:
# map to vocab2index
def encode(tokens):
    """Convert tokens into their ``VOCAB2INDEX`` indices.

    A token with no entry in ``VOCAB2INDEX`` is replaced by the index stored
    under '[UNK]'. Returns a new list with one index per input token.
    """
    return [
        VOCAB2INDEX['[UNK]'] if VOCAB2INDEX.get(word) is None else VOCAB2INDEX[word]
        for word in tokens
    ]

def decode(idx_tokens, keep_pad=False):
    """Turn vocabulary indices back into a single string.

    Every index is looked up in ``INDEX2VOCAB`` and the resulting tokens are
    concatenated with no separator. '[PAD]' tokens are left out unless
    ``keep_pad`` is true.
    """
    looked_up = (INDEX2VOCAB[position] for position in idx_tokens)
    return ''.join(word for word in looked_up if keep_pad or word != '[PAD]')