In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# 파일에서 데이터 읽기
from sklearn.datasets import load_files # 여러 개의 파일에서 데이터를 읽는 도구

reviews_train = load_files('data-files/aclImdb/train')
reviews_test = load_files('data-files/aclImdb/test')

In [None]:
# 데이터 기본 정보 탐색
print( reviews_train.keys() )
print( len( reviews_train['data'] ), len( reviews_test['data'] ) )
print( np.unique(reviews_train['target'], return_counts=True) )
print( reviews_train['target_names'])
print( reviews_train['data'][0])

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
25000 25000
(array([0, 1]), array([12500, 12500]))
['neg', 'pos']
b"Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />Flawed but honest with a terrible honesty."


In [8]:
# 효과적인 전처리를 위해 DataFrame으로 변환
reviews_train_df = pd.DataFrame({'label': reviews_train['target'], 
                                 'review' : reviews_train['data']})
reviews_test_df = pd.DataFrame({'label': reviews_test['target'], 
                                'review' : reviews_test['data']})

In [9]:
# 데이터 프레임 확인
reviews_train_df.head()

Unnamed: 0,label,review
0,1,"b""Zero Day leads you to think, even re-think w..."
1,0,b'Words can\'t describe how bad this movie is....
2,1,b'Everyone plays their part pretty well in thi...
3,0,b'There are a lot of highly talented filmmaker...
4,0,b'I\'ve just had the evidence that confirmed m...


In [11]:
# binary text -> char text
reviews_train_df['review'] = reviews_train_df['review'].str.decode(encoding='utf-8')
reviews_test_df['review'] = reviews_test_df['review'].str.decode(encoding='utf-8')

In [12]:
# decode 확인
reviews_train_df.head()

Unnamed: 0,label,review
0,1,"Zero Day leads you to think, even re-think why..."
1,0,Words can't describe how bad this movie is. I ...
2,1,Everyone plays their part pretty well in this ...
3,0,There are a lot of highly talented filmmakers/...
4,0,I've just had the evidence that confirmed my s...


In [13]:
# 처리된 데이터 파일로 저장 ( 개별 파일에서 읽기 속도가 느려서 별도 파일로 저장 )
reviews_train_df.to_csv('data-files/imdb_reviews_train.csv', index=False)
reviews_test_df.to_csv('data-files/imdb_reviews_test.csv', index=False)

In [14]:
# 저장된 파일 테스트
pd.read_csv('data-files/imdb_reviews_train.csv').head()

Unnamed: 0,label,review
0,1,"Zero Day leads you to think, even re-think why..."
1,0,Words can't describe how bad this movie is. I ...
2,1,Everyone plays their part pretty well in this ...
3,0,There are a lot of highly talented filmmakers/...
4,0,I've just had the evidence that confirmed my s...


In [16]:
# 데이터 정보 탐색 2
reviews_train_df.info()
reviews_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   25000 non-null  int64 
 1   review  25000 non-null  object
dtypes: int64(1), object(1)
memory usage: 390.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   25000 non-null  int64 
 1   review  25000 non-null  object
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [None]:
# 데이터 전처리 : 대문자 -> 소문자

reviews_train_df['review'] = reviews_train_df['review'].str.lower()
reviews_test_df['review'] = reviews_test_df['review'].str.lower()

0        zero day leads you to think, even re-think why...
1        words can't describe how bad this movie is. i ...
2        everyone plays their part pretty well in this ...
3        there are a lot of highly talented filmmakers/...
4        i've just had the evidence that confirmed my s...
                               ...                        
24995    089: footlight parade (1933) - released 9/30/1...
24996    deeply humorous yet honest comedy about a bunc...
24997    1st watched 2/28/2006 - 4 out of 10(dir-sydney...
24998    i watch lots of scary movies (or at least they...
24999    absolutely the worst film yet by burton, who s...
Name: review, Length: 25000, dtype: object

In [None]:
# review에 markup(tag)가 포함된 것 확인 : <br />
reviews_train_df['review'][0]

"Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />Flawed but honest with a terrible honesty."

In [21]:
# 데이터 전처리 2 : markup 제거
# reviews_train_df['review'] = reviews_train_df['review'].str.replace('<br />', '')

from bs4 import BeautifulSoup

# BeautifulSoup("<div>test</div>").get_text() # --> test만 추출
reviews_train_df['review'].map(lambda v: BeautifulSoup(v, 'html.parser').get_text())[0]

  reviews_train_df['review'].map(lambda v: BeautifulSoup(v, 'html.parser').get_text())[0]


"Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. Flawed but honest with a terrible honesty."