In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# 데이터 준비
from sklearn.datasets import load_files # 여러 개의 파일로부터 데이터를 읽는 함수

reviews_train = load_files('data-files/aclImdb/train/')
reviews_test = load_files('data-files/aclImdb/test/')

In [4]:
reviews_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [12]:
reviews_train['data']
reviews_train['target_names']
reviews_train['target']
np.unique(reviews_train['target'], return_counts=True)
np.unique(reviews_test['target'], return_counts=True)
len(reviews_train['data'])

25000

In [13]:
# DataFrame으로 구성
reviews_train_df = pd.DataFrame({'label': reviews_train['target'],
                                 'review': reviews_train['data']})
reviews_test_df = pd.DataFrame({'label': reviews_test['target'],
                                'review': reviews_test['data']})

In [15]:
reviews_train_df.head()

Unnamed: 0,label,review
0,1,"b""Zero Day leads you to think, even re-think w..."
1,0,b'Words can\'t describe how bad this movie is....
2,1,b'Everyone plays their part pretty well in thi...
3,0,b'There are a lot of highly talented filmmaker...
4,0,b'I\'ve just had the evidence that confirmed m...


In [16]:
# binary text -> char text
reviews_train_df['review2'] = reviews_train_df['review'].map(lambda v: v.decode('utf-8'))
reviews_test_df['review2'] = reviews_test_df['review'].map(lambda v: v.decode('utf-8'))

In [17]:
reviews_train_df.head()

Unnamed: 0,label,review,review2
0,1,"b""Zero Day leads you to think, even re-think w...","Zero Day leads you to think, even re-think why..."
1,0,b'Words can\'t describe how bad this movie is....,Words can't describe how bad this movie is. I ...
2,1,b'Everyone plays their part pretty well in thi...,Everyone plays their part pretty well in this ...
3,0,b'There are a lot of highly talented filmmaker...,There are a lot of highly talented filmmakers/...
4,0,b'I\'ve just had the evidence that confirmed m...,I've just had the evidence that confirmed my s...


In [18]:
# 처리된 데이터를 파일로 저장
reviews_train_df.to_csv('data-files/imdb-reviews-train.csv', encoding='utf-8', index=False)
reviews_test_df.to_csv('data-files/imdb-reviews-test.csv', encoding='utf-8', index=False)

In [19]:
pd.read_csv("data-files/imdb-reviews-train.csv").head()

Unnamed: 0,label,review,review2
0,1,"b""Zero Day leads you to think, even re-think w...","Zero Day leads you to think, even re-think why..."
1,0,b'Words can\'t describe how bad this movie is....,Words can't describe how bad this movie is. I ...
2,1,b'Everyone plays their part pretty well in thi...,Everyone plays their part pretty well in this ...
3,0,b'There are a lot of highly talented filmmaker...,There are a lot of highly talented filmmakers/...
4,0,b'I\'ve just had the evidence that confirmed m...,I've just had the evidence that confirmed my s...


In [21]:
reviews_test_df.info()
reviews_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    25000 non-null  int32 
 1   review   25000 non-null  object
 2   review2  25000 non-null  object
dtypes: int32(1), object(2)
memory usage: 488.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    25000 non-null  int32 
 1   review   25000 non-null  object
 2   review2  25000 non-null  object
dtypes: int32(1), object(2)
memory usage: 488.4+ KB


In [22]:
# 모두 소문자로 변경
reviews_train_df['review2'] = reviews_train_df['review2'].str.lower()
reviews_test_df['review2'] = reviews_test_df['review2'].str.lower()

In [23]:
# 문서 내용 확인 --> html 마크업 포함 확인 (<br />)
reviews_train_df['review2'][0]

"zero day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. it captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />it is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. in terms of explaining the motives and actions of the two young suicide/murderers it is better than 'elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />flawed but honest with a terrible honesty."

In [27]:
# reviews_train_df['review2'].str.replace('<br />', '')

from bs4 import BeautifulSoup

reviews_train_df['review2'] = \
    reviews_train_df['review2'].map(lambda v: BeautifulSoup(v, "html5lib").get_text())
reviews_test_df['review2'] = \
    reviews_test_df['review2'].map(lambda v: BeautifulSoup(v, "html5lib").get_text())

  reviews_train_df['review2'].map(lambda v: BeautifulSoup(v, "html5lib").get_text())
  reviews_test_df['review2'].map(lambda v: BeautifulSoup(v, "html5lib").get_text())


In [28]:
reviews_train_df['review2'][0]

"zero day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. it captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.it is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. in terms of explaining the motives and actions of the two young suicide/murderers it is better than 'elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. flawed but honest with a terrible honesty."

In [29]:
# 특수문자 제거 ( 영문자만 남기기 )
import re

reviews_train_df['review2'] = reviews_train_df['review2'].map(lambda v: re.sub('[^A-Za-z]', ' ', v))
reviews_test_df['review2'] = reviews_test_df['review2'].map(lambda v: re.sub('[^A-Za-z]', ' ', v))

In [30]:
reviews_train_df['review2'][0]

'zero day leads you to think  even re think why two boys young men would do what they did   commit mutual suicide via slaughtering their classmates  it captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own mutual world via coupled destruction it is not a perfect movie but given what money time the filmmaker and actors had   it is a remarkable product  in terms of explaining the motives and actions of the two young suicide murderers it is better than  elephant    in terms of being a film that gets under our  rationalistic  skin it is a far  far better film than almost anything you are likely to see  flawed but honest with a terrible honesty '

In [31]:
# 문장 -> 단어의 리스트 변환
import nltk

# reviews_train_df['review2'] = reviews_train_df['review2'].str.split(' ')
reviews_train_df['review2'] = reviews_train_df['review2'].map(nltk.word_tokenize)
reviews_test_df['review2'] = reviews_test_df['review2'].map(nltk.word_tokenize)

In [34]:
print( reviews_train_df['review2'][0][:10] )
reviews_train_df.head()

['zero', 'day', 'leads', 'you', 'to', 'think', 'even', 're', 'think', 'why']


Unnamed: 0,label,review,review2
0,1,"b""Zero Day leads you to think, even re-think w...","[zero, day, leads, you, to, think, even, re, t..."
1,0,b'Words can\'t describe how bad this movie is....,"[words, can, t, describe, how, bad, this, movi..."
2,1,b'Everyone plays their part pretty well in thi...,"[everyone, plays, their, part, pretty, well, i..."
3,0,b'There are a lot of highly talented filmmaker...,"[there, are, a, lot, of, highly, talented, fil..."
4,0,b'I\'ve just had the evidence that confirmed m...,"[i, ve, just, had, the, evidence, that, confir..."
